/*****************************************************************************
 * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
 *****************************************************************************
 * Copyright (C) 2012-2016 x264 project
 *
 * Authors: Steve Borho <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"

#if HAVE_OPENCL
#ifdef _WIN32
#include <windows.h>
#endif

void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );

/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
 * in the OpenCL headers shipped with NVIDIA drivers. We need to be
 * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E

#define OCLCHECK( method, ... )\
do\
{\
    if( h->opencl.b_fatal_error )\
        return -1;\
    status = ocl->method( __VA_ARGS__ );\
    if( status != CL_SUCCESS ) {\
        h->param.b_opencl = 0;\
        h->opencl.b_fatal_error = 1;\
        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
        return -1;\
    }\
} while( 0 )
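
/* Usage sketch (illustrative, not part of x264): OCLCHECK assumes locals named
 * `ocl` (the dispatch table) and `status` (a cl_int) in the calling function,
 * and that the function returns int. On any driver error it permanently
 * disables OpenCL for this encode and fails the current call:
 *
 *     x264_opencl_function_t *ocl = h->opencl.ocl;
 *     cl_int status;
 *     OCLCHECK( clFinish, h->opencl.queue );  // expands to ocl->clFinish( h->opencl.queue )
 */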

void x264_opencl_flush( x264_t *h )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;

    ocl->clFinish( h->opencl.queue );

    /* Finish copies from the GPU by copying from the page-locked buffer to
     * their final destination */
    for( int i = 0; i < h->opencl.num_copies; i++ )
        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
    h->opencl.num_copies = 0;
    h->opencl.pl_occupancy = 0;
}

static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
{
    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
        x264_opencl_flush( h );
    assert( bytes < PAGE_LOCKED_BUF_SIZE );
    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
    h->opencl.pl_occupancy += bytes;
    return ptr;
}
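
/* Usage sketch (illustrative, not part of x264): readbacks are staged through
 * this page-locked bump allocator so clEnqueueReadBuffer can run non-blocking
 * (CL_FALSE); the copy to the real destination is deferred until
 * x264_opencl_flush() has called clFinish(). `buf`, `bytes`, and `dest` below
 * are placeholders:
 *
 *     char *locked = x264_opencl_alloc_locked( h, bytes );
 *     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, buf, CL_FALSE, 0, bytes, locked, 0, NULL, NULL );
 *     h->opencl.copies[h->opencl.num_copies].dest  = dest;
 *     h->opencl.copies[h->opencl.num_copies].src   = locked;
 *     h->opencl.copies[h->opencl.num_copies].bytes = bytes;
 *     h->opencl.num_copies++;
 */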

int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
    if( fenc->b_intra_calculated )
        return 0;
    fenc->b_intra_calculated = 1;

    x264_opencl_function_t *ocl = h->opencl.ocl;
    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];

#define CREATEBUF( out, flags, size )\
    out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
    out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }

    int mb_count = h->mb.i_mb_count;
    cl_int status;

    if( !h->opencl.lowres_mv_costs )
    {
        /* Allocate shared memory buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }

        CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
    }

    if( !fenc->opencl.intra_cost )
    {
        /* Allocate per-frame buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }
        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
    }
#undef CREATEBUF
#undef CREATEIMAGE

    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */

    char *locked = x264_opencl_alloc_locked( h, luma_length );
    memcpy( locked, fenc->plane[0], luma_length );
    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );

    size_t gdim[2];
    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
    {
        int size = h->mb.i_mb_count * sizeof(int16_t);
        locked = x264_opencl_alloc_locked( h, size );
        memcpy( locked, fenc->i_inv_qscale_factor, size );
        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    }
    else
    {
        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
        cl_uint arg = 0;
        int16_t value = 256;
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
        gdim[0] = h->mb.i_mb_count;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
    }
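
    /* (Note: 256 is unity in the 8.8 fixed-point format used by
     * i_inv_qscale_factor, so the memset above makes AQ weighting a no-op.) */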

    int stride = fenc->i_stride[0];
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
    gdim[0] = 8 * h->mb.i_mb_width;
    gdim[1] = 8 * h->mb.i_mb_height;
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );

    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
    {
        /* Workaround for AMD Southern Island:
         *
         * Alternate between two instances of the same kernel. This has no
         * measurable performance impact, so we do it for all GPUs. It
         * prevents the same kernel from being enqueued back-to-back, avoiding
         * a dependency-calculation bug in the driver.
         */
        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;

        arg = 0;
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
        gdim[0] >>= 1;
        gdim[1] >>= 1;
        if( gdim[0] < 16 || gdim[1] < 16 )
            break;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
    }

    size_t ldim[2];
    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
    gdim[1] = 8*h->mb.i_mb_height;
    ldim[0] = 32;
    ldim[1] = 8;
    arg = 0;

    /* For presets slow, slower, and placebo, check all 10 intra modes that the
     * C lookahead supports. For faster presets, only check the 8 most frequent
     * modes. */
    int slow = h->param.analyse.i_subpel_refine > 7;
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    gdim[0] = 256;
    gdim[1] = h->mb.i_mb_height;
    ldim[0] = 256;
    ldim[1] = 1;
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = sizeof(int) * 4;
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    h->opencl.last_buf = !h->opencl.last_buf;
    return 0;
}
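
/* Note (illustrative): h->opencl.last_buf ping-pongs the shared buffers
 * (luma_16x16_image, frame_stats, row_satds, lowres_costs) between two
 * copies, so one frame's non-blocking readbacks can still be in flight while
 * the next frame's kernels are enqueued. */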

/* This function was tested empirically on a number of AMD and NVIDIA GPUs.
 * Writing a function that returns perfect launch dimensions is impossible;
 * some applications have self-tuning code that tries many possible variables
 * and measures the runtime. Here we simply make an educated guess based on
 * what we know GPUs typically prefer. */
static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices; the queries below may fail */
    size_t preferred_multiple = 64;
    cl_uint num_cus = 6;

    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );

    ldims[0] = preferred_multiple;
    ldims[1] = 8;

    /* make ldims[1] an even divisor of gdims[1] */
    while( gdims[1] & (ldims[1] - 1) )
    {
        ldims[0] <<= 1;
        ldims[1] >>= 1;
    }
    /* make total ldims fit under the max work-group dimensions for the device */
    while( ldims[0] * ldims[1] > max_work_group )
    {
        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
            ldims[1] >>= 1;
        else
            ldims[0] >>= 1;
    }

    if( ldims[0] > gdims[0] )
    {
        /* remove preferred multiples until we're close to gdims[0] */
        while( gdims[0] + preferred_multiple < ldims[0] )
            ldims[0] -= preferred_multiple;
        gdims[0] = ldims[0];
    }
    else
    {
        /* make gdims an even multiple of ldims */
        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
        gdims[0] *= ldims[0];
    }

    /* make ldims smaller to spread work across compute units */
    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
    {
        if( ldims[0] > preferred_multiple )
            ldims[0] >>= 1;
        else if( ldims[1] > 1 )
            ldims[1] >>= 1;
        else
            break;
    }
    /* for smaller GPUs, try not to abuse their texture cache */
    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
        ldims[0] = 32;
}
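
/* Worked example (hypothetical numbers): for gdims = { 480, 68 } with
 * preferred_multiple = 64, max_work_group = 256, and num_cus = 6:
 *   - start at ldims = { 64, 8 }; 68 is not a multiple of 8, so shift to { 128, 4 }
 *   - 128 * 4 = 512 > 256, so halve ldims[0], giving { 64, 4 }
 *   - round gdims[0] up to a multiple of 64: gdims becomes { 512, 68 }
 *   - the small-GPU clause (num_cus == 6, ldims == { 64, 4 }) then picks { 32, 4 } */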

int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref = frames[ref];

    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
    cl_mem ref_luma_hpel;
    cl_int status;

    if( w && w->weightfn )
    {
        size_t gdims[2];

        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            cl_uint arg = 0;
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

            gdims[0] >>= 1;
            gdims[1] >>= 1;
            if( gdims[0] < 16 || gdims[1] < 16 )
                break;
        }

        cl_uint arg = 0;
        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

        /* Use weighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
        ref_luma_hpel = h->opencl.weighted_luma_hpel;
    }
    else
    {
        /* Use unweighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
        ref_luma_hpel = fref->opencl.luma_hpel;
    }

    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
    int b_first_iteration = 1;
    int b_reverse_references = 1;
    int A = 1;

    int mb_per_group = 0;
    int cost_local_size = 0;
    int mvc_local_size = 0;
    int mb_width;

    size_t gdims[2];
    size_t ldims[2];

    /* scale 0 is 8x8 */
    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
    {
        mb_width = h->mb.i_mb_width >> scale;
        gdims[0] = mb_width;
        gdims[1] = h->mb.i_mb_height >> scale;
        if( gdims[0] < 2 || gdims[1] < 2 )
            continue;
        gdims[0] <<= 2;
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );

        mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
        int scaled_me_range = h->param.analyse.i_me_range >> scale;
        int b_shift_index = 1;

        cl_uint arg = 0;
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );

        for( int iter = 0; iter < num_iterations[scale]; iter++ )
        {
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

            b_shift_index = 0;
            b_first_iteration = 0;

            /* alternate top-left vs. bottom-right MB references at lower
             * scales, so the motion field smooths out more quickly */
            if( scale > 2 )
                b_reverse_references ^= 1;
            else
                b_reverse_references = 0;
            A = !A;
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
        }
    }

    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );

    if( b_islist1 )
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    }
    else
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    }

    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );

    if( h->opencl.b_device_AMD_SI )
    {
        /* workaround for an AMD Southern Island driver scheduling bug (fixed
         * in July 2012): perform a meaningless small copy to add a data
         * dependency */
        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
    }

    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
        x264_opencl_flush( h );

    char *locked = x264_opencl_alloc_locked( h, mvlen );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;

    if( b_islist1 )
    {
        int mvs_offset = mvlen * (ref - b - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
    }
    else
    {
        int mvs_offset = mvlen * (b - ref - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
    }

    h->opencl.num_copies++;

    return 0;
}
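
/* Worked example (hypothetical indices): for a list0 search with b = 2 and
 * ref = 0, the per-reference slot is b - ref - 1 = 1, so the vectors are read
 * from byte offset mvlen * 1 inside fenc->opencl.lowres_mvs0 and copied back
 * to fenc->lowres_mvs[0][1]. */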

int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_int status;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref0 = frames[p0];
    x264_frame_t *fref1 = frames[p1];

    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
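    /* Worked example (hypothetical): for a B-frame exactly midway between its
     * references, dist_scale_factor is 128, so weighted bipred also yields
     * 64 - (128 >> 2) = 32, i.e. an equal 32/32 blend of the two lists. */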

    /* Tasks for this kernel:
     * 1. Select least cost mode (intra, ref0, ref1)
     *    list_used 0, 1, 2, or 3; if B frame, do not allow intra
     * 2. If B frame, try bidir predictions.
     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
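    /* Decoding sketch (assuming x264's usual LOWRES_COST_SHIFT of 14):
     *     int v    = fenc->lowres_costs[b - p0][p1 - b][i_mb_xy];
     *     int cost = v & LOWRES_COST_MASK;    // saturated SATD cost
     *     int list = v >> LOWRES_COST_SHIFT;  // 0 intra, 1 list0, 2 list1, 3 bidir */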
    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
    size_t ldim_bidir[2];
    size_t *ldims = NULL;
    int cost_local_size = 4;
    int satd_local_size = 4;
    if( b < p1 )
    {
        /* For B frames, use 4 threads per MB for BIDIR checks */
        ldims = ldim_bidir;
        gdims[0] <<= 2;
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
        int mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
    }

    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    /* Sum costs across rows, atomicAdd down frame */
    size_t gdim[2] = { 256, h->mb.i_mb_height };
    size_t ldim[2] = { 256, 1 };

    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    char *locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = 4 * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.last_buf = !h->opencl.last_buf;

    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    if( b == p1 ) // P frames only
    {
        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
        h->opencl.num_copies++;
    }
    return 0;
}

void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
{
    if( h->param.b_opencl )
    {
#ifdef _WIN32
        /* Temporarily boost the priority of this lookahead thread and of the
         * OpenCL driver's thread until the end of this function. On AMD GPUs
         * this greatly reduces the latency of enqueuing kernels and getting
         * results back on Windows. */
        HANDLE id = GetCurrentThread();
        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
        {
            h->opencl.opencl_thread_pri = GetThreadPriority( id );
            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        }
#endif

        /* precalculate intra and I frames */
        for( int i = 0; i <= num_frames; i++ )
            x264_opencl_lowres_init( h, frames[i], lambda );
        x264_opencl_flush( h );

        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
        {
            /* For trellis B-Adapt, precompute exhaustive motion searches */
            for( int b = 0; b <= num_frames; b++ )
            {
                for( int j = 1; j < h->param.i_bframe; j++ )
                {
                    int p0 = b - j;
                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
                    {
                        const x264_weight_t *w = x264_weight_none;

                        if( h->param.analyse.i_weighted_pred )
                        {
                            x264_emms();
                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
                            w = frames[b]->weight[0];
                        }
                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
                    }
                    int p1 = b + j;
                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
                    {
                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
                    }
                }
            }

            x264_opencl_flush( h );
        }
    }
}
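
/* Note (illustrative): lowres_mvs[list][ref_index][0][0] == 0x7FFF is the
 * sentinel the lookahead uses to mean "no motion search performed yet";
 * writing 0 before enqueuing the search claims the slot so the same search
 * is not repeated by a later call. */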

void x264_opencl_slicetype_end( x264_t *h )
{
#ifdef _WIN32
    if( h->param.b_opencl )
    {
        HANDLE id = GetCurrentThread();
        SetThreadPriority( id, h->opencl.lookahead_thread_pri );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
            SetThreadPriority( id, h->opencl.opencl_thread_pri );
    }
#endif
}

int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
{
    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
        return 0;
    else
    {
        int do_search[2];
        int dist_scale_factor = 128;
        const x264_weight_t *w = x264_weight_none;

        // avoid duplicating work
        frames[b]->i_cost_est[b-p0][p1-b] = 0;

        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
        if( do_search[0] )
        {
            if( h->param.analyse.i_weighted_pred && b == p1 )
            {
                x264_emms();
                x264_weights_analyse( h, frames[b], frames[p0], 1 );
                w = frames[b]->weight[0];
            }
            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
        }
        if( do_search[1] )
            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
        if( b == p1 )
            frames[b]->i_intra_mbs[b-p0] = 0;
        if( p1 != p0 )
            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
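        /* e.g. with b one frame after p0 and references two frames apart:
         * ((1 << 8) + 1) / 2 = 128, i.e. exactly halfway in .8 fixed point */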

        frames[b]->i_cost_est[b-p0][p1-b] = 0;
        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;

        x264_opencl_lowres_init( h, frames[b], lambda );

        if( do_search[0] )
        {
            x264_opencl_lowres_init( h, frames[p0], lambda );
            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
        }
        if( do_search[1] )
        {
            x264_opencl_lowres_init( h, frames[p1], lambda );
            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
        }
        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
        return 1;
    }
}

#endif