/*****************************************************************************
 * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
 *****************************************************************************
 * Copyright (C) 2012-2016 x264 project
 *
 * Authors: Steve Borho <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"

#if HAVE_OPENCL
#ifdef _WIN32
#include <windows.h>
#endif

void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );

/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
 * in the OpenCL headers shipped with NVIDIA drivers. We need to be
 * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E

#define OCLCHECK( method, ... )\
do\
{\
    if( h->opencl.b_fatal_error )\
        return -1;\
    status = ocl->method( __VA_ARGS__ );\
    if( status != CL_SUCCESS ) {\
        h->param.b_opencl = 0;\
        h->opencl.b_fatal_error = 1;\
        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
        return -1;\
    }\
} while( 0 )
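
/* Usage sketch (illustrative, not part of x264): OCLCHECK assumes locals named
 * `ocl` (the dispatch table) and `status` (a cl_int) in the calling function,
 * and that the function returns int. On any driver error it permanently
 * disables OpenCL for this encode and fails the current call:
 *
 *     x264_opencl_function_t *ocl = h->opencl.ocl;
 *     cl_int status;
 *     OCLCHECK( clFinish, h->opencl.queue );  // expands to ocl->clFinish( h->opencl.queue )
 */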

void x264_opencl_flush( x264_t *h )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;

    ocl->clFinish( h->opencl.queue );

    /* Finish copies from the GPU by copying from the page-locked buffer to
     * their final destination */
    for( int i = 0; i < h->opencl.num_copies; i++ )
        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
    h->opencl.num_copies = 0;
    h->opencl.pl_occupancy = 0;
}

static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
{
    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
        x264_opencl_flush( h );
    assert( bytes < PAGE_LOCKED_BUF_SIZE );
    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
    h->opencl.pl_occupancy += bytes;
    return ptr;
}
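
/* Usage sketch (illustrative, not part of x264): readbacks are staged through
 * this page-locked bump allocator so clEnqueueReadBuffer can run non-blocking
 * (CL_FALSE); the copy to the real destination is deferred until
 * x264_opencl_flush() has called clFinish(). `buf`, `bytes`, and `dest` below
 * are placeholders:
 *
 *     char *locked = x264_opencl_alloc_locked( h, bytes );
 *     OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, buf, CL_FALSE, 0, bytes, locked, 0, NULL, NULL );
 *     h->opencl.copies[h->opencl.num_copies].dest  = dest;
 *     h->opencl.copies[h->opencl.num_copies].src   = locked;
 *     h->opencl.copies[h->opencl.num_copies].bytes = bytes;
 *     h->opencl.num_copies++;
 */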

int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
    if( fenc->b_intra_calculated )
        return 0;
    fenc->b_intra_calculated = 1;

    x264_opencl_function_t *ocl = h->opencl.ocl;
    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];

#define CREATEBUF( out, flags, size )\
    out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
    out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }

    int mb_count = h->mb.i_mb_count;
    cl_int status;

    if( !h->opencl.lowres_mv_costs )
    {
        /* Allocate shared memory buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }

        CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
    }

    if( !fenc->opencl.intra_cost )
    {
        /* Allocate per-frame buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }
        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
    }
#undef CREATEBUF
#undef CREATEIMAGE

    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */

    char *locked = x264_opencl_alloc_locked( h, luma_length );
    memcpy( locked, fenc->plane[0], luma_length );
    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );

    size_t gdim[2];
    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
    {
        int size = h->mb.i_mb_count * sizeof(int16_t);
        locked = x264_opencl_alloc_locked( h, size );
        memcpy( locked, fenc->i_inv_qscale_factor, size );
        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    }
    else
    {
        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
        cl_uint arg = 0;
        int16_t value = 256;
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
        gdim[0] = h->mb.i_mb_count;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
    }
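
    /* (Note: 256 is unity in the 8.8 fixed-point format used by
     * i_inv_qscale_factor, so the memset above makes AQ weighting a no-op.) */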

    int stride = fenc->i_stride[0];
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
    gdim[0] = 8 * h->mb.i_mb_width;
    gdim[1] = 8 * h->mb.i_mb_height;
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );

    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
    {
        /* Workaround for AMD Southern Island:
         *
         * Alternate between two instances of the same kernel. This has no
         * measurable performance impact, so we do it for all GPUs. It
         * prevents the same kernel from being enqueued back-to-back, avoiding
         * a dependency-calculation bug in the driver.
         */
        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;

        arg = 0;
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
        gdim[0] >>= 1;
        gdim[1] >>= 1;
        if( gdim[0] < 16 || gdim[1] < 16 )
            break;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
    }

    size_t ldim[2];
    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
    gdim[1] = 8*h->mb.i_mb_height;
    ldim[0] = 32;
    ldim[1] = 8;
    arg = 0;

    /* For presets slow, slower, and placebo, check all 10 intra modes that the
     * C lookahead supports. For faster presets, only check the 8 most frequent
     * modes. */
    int slow = h->param.analyse.i_subpel_refine > 7;
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    gdim[0] = 256;
    gdim[1] = h->mb.i_mb_height;
    ldim[0] = 256;
    ldim[1] = 1;
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = sizeof(int) * 4;
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    h->opencl.last_buf = !h->opencl.last_buf;
    return 0;
}
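
/* Note (illustrative): h->opencl.last_buf ping-pongs the shared buffers
 * (luma_16x16_image, frame_stats, row_satds, lowres_costs) between two
 * copies, so one frame's non-blocking readbacks can still be in flight while
 * the next frame's kernels are enqueued. */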

/* This function was tested empirically on a number of AMD and NVIDIA GPUs.
 * Writing a function that returns perfect launch dimensions is impossible;
 * some applications have self-tuning code that tries many possible variables
 * and measures the runtime. Here we simply make an educated guess based on
 * what we know GPUs typically prefer. */
static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices; the queries below may fail */
    size_t preferred_multiple = 64;
    cl_uint num_cus = 6;

    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );

    ldims[0] = preferred_multiple;
    ldims[1] = 8;

    /* make ldims[1] an even divisor of gdims[1] */
    while( gdims[1] & (ldims[1] - 1) )
    {
        ldims[0] <<= 1;
        ldims[1] >>= 1;
    }
    /* make total ldims fit under the max work-group dimensions for the device */
    while( ldims[0] * ldims[1] > max_work_group )
    {
        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
            ldims[1] >>= 1;
        else
            ldims[0] >>= 1;
    }

    if( ldims[0] > gdims[0] )
    {
        /* remove preferred multiples until we're close to gdims[0] */
        while( gdims[0] + preferred_multiple < ldims[0] )
            ldims[0] -= preferred_multiple;
        gdims[0] = ldims[0];
    }
    else
    {
        /* make gdims an even multiple of ldims */
        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
        gdims[0] *= ldims[0];
    }

    /* make ldims smaller to spread work across compute units */
    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
    {
        if( ldims[0] > preferred_multiple )
            ldims[0] >>= 1;
        else if( ldims[1] > 1 )
            ldims[1] >>= 1;
        else
            break;
    }
    /* for smaller GPUs, try not to abuse their texture cache */
    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
        ldims[0] = 32;
}
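
/* Worked example (hypothetical numbers): for gdims = { 480, 68 } with
 * preferred_multiple = 64, max_work_group = 256, and num_cus = 6:
 *   - start at ldims = { 64, 8 }; 68 is not a multiple of 8, so shift to { 128, 4 }
 *   - 128 * 4 = 512 > 256, so halve ldims[0], giving { 64, 4 }
 *   - round gdims[0] up to a multiple of 64: gdims becomes { 512, 68 }
 *   - the small-GPU clause (num_cus == 6, ldims == { 64, 4 }) then picks { 32, 4 } */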

int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref = frames[ref];

    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
    cl_mem ref_luma_hpel;
    cl_int status;

    if( w && w->weightfn )
    {
        size_t gdims[2];

        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            cl_uint arg = 0;
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

            gdims[0] >>= 1;
            gdims[1] >>= 1;
            if( gdims[0] < 16 || gdims[1] < 16 )
                break;
        }

        cl_uint arg = 0;
        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

        /* Use weighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
        ref_luma_hpel = h->opencl.weighted_luma_hpel;
    }
    else
    {
        /* Use unweighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
        ref_luma_hpel = fref->opencl.luma_hpel;
    }

    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
    int b_first_iteration = 1;
    int b_reverse_references = 1;
    int A = 1;

    int mb_per_group = 0;
    int cost_local_size = 0;
    int mvc_local_size = 0;
    int mb_width;

    size_t gdims[2];
    size_t ldims[2];

    /* scale 0 is 8x8 */
    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
    {
        mb_width = h->mb.i_mb_width >> scale;
        gdims[0] = mb_width;
        gdims[1] = h->mb.i_mb_height >> scale;
        if( gdims[0] < 2 || gdims[1] < 2 )
            continue;
        gdims[0] <<= 2;
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );

        mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
        int scaled_me_range = h->param.analyse.i_me_range >> scale;
        int b_shift_index = 1;

        cl_uint arg = 0;
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );

        for( int iter = 0; iter < num_iterations[scale]; iter++ )
        {
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

            b_shift_index = 0;
            b_first_iteration = 0;

            /* alternate top-left vs. bottom-right MB references at lower
             * scales, so the motion field smooths out more quickly */
            if( scale > 2 )
                b_reverse_references ^= 1;
            else
                b_reverse_references = 0;
            A = !A;
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
        }
    }

    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );

    if( b_islist1 )
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    }
    else
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    }

    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );

    if( h->opencl.b_device_AMD_SI )
    {
        /* workaround for an AMD Southern Island driver scheduling bug (fixed
         * in July 2012): perform a meaningless small copy to add a data
         * dependency */
        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
    }

    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
        x264_opencl_flush( h );

    char *locked = x264_opencl_alloc_locked( h, mvlen );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;

    if( b_islist1 )
    {
        int mvs_offset = mvlen * (ref - b - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
    }
    else
    {
        int mvs_offset = mvlen * (b - ref - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
    }

    h->opencl.num_copies++;

    return 0;
}
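
/* Worked example (hypothetical indices): for a list0 search with b = 2 and
 * ref = 0, the per-reference slot is b - ref - 1 = 1, so the vectors are read
 * from byte offset mvlen * 1 inside fenc->opencl.lowres_mvs0 and copied back
 * to fenc->lowres_mvs[0][1]. */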

int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_int status;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref0 = frames[p0];
    x264_frame_t *fref1 = frames[p1];

    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
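    /* Worked example (hypothetical): for a B-frame exactly midway between its
     * references, dist_scale_factor is 128, so weighted bipred also yields
     * 64 - (128 >> 2) = 32, i.e. an equal 32/32 blend of the two lists. */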

    /* Tasks for this kernel:
     * 1. Select least cost mode (intra, ref0, ref1)
     *    list_used 0, 1, 2, or 3; if B frame, do not allow intra
     * 2. If B frame, try bidir predictions.
     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
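    /* Decoding sketch (assuming x264's usual LOWRES_COST_SHIFT of 14):
     *     int v    = fenc->lowres_costs[b - p0][p1 - b][i_mb_xy];
     *     int cost = v & LOWRES_COST_MASK;    // saturated SATD cost
     *     int list = v >> LOWRES_COST_SHIFT;  // 0 intra, 1 list0, 2 list1, 3 bidir */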
    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
    size_t ldim_bidir[2];
    size_t *ldims = NULL;
    int cost_local_size = 4;
    int satd_local_size = 4;
    if( b < p1 )
    {
        /* For B frames, use 4 threads per MB for BIDIR checks */
        ldims = ldim_bidir;
        gdims[0] <<= 2;
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
        int mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
    }

    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    /* Sum costs across rows, atomicAdd down frame */
    size_t gdim[2] = { 256, h->mb.i_mb_height };
    size_t ldim[2] = { 256, 1 };

    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    char *locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = 4 * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.last_buf = !h->opencl.last_buf;

    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    if( b == p1 ) // P frames only
    {
        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
        h->opencl.num_copies++;
    }
    return 0;
}

void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
{
    if( h->param.b_opencl )
    {
#ifdef _WIN32
        /* Temporarily boost the priority of this lookahead thread and of the
         * OpenCL driver's thread until the end of this function. On AMD GPUs
         * this greatly reduces the latency of enqueuing kernels and getting
         * results back on Windows. */
        HANDLE id = GetCurrentThread();
        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
        {
            h->opencl.opencl_thread_pri = GetThreadPriority( id );
            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        }
#endif

        /* precalculate intra and I frames */
        for( int i = 0; i <= num_frames; i++ )
            x264_opencl_lowres_init( h, frames[i], lambda );
        x264_opencl_flush( h );

        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
        {
            /* For trellis B-Adapt, precompute exhaustive motion searches */
            for( int b = 0; b <= num_frames; b++ )
            {
                for( int j = 1; j < h->param.i_bframe; j++ )
                {
                    int p0 = b - j;
                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
                    {
                        const x264_weight_t *w = x264_weight_none;

                        if( h->param.analyse.i_weighted_pred )
                        {
                            x264_emms();
                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
                            w = frames[b]->weight[0];
                        }
                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
                    }
                    int p1 = b + j;
                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
                    {
                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
                    }
                }
            }

            x264_opencl_flush( h );
        }
    }
}
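
/* Note (illustrative): lowres_mvs[list][ref_index][0][0] == 0x7FFF is the
 * sentinel the lookahead uses to mean "no motion search performed yet";
 * writing 0 before enqueuing the search claims the slot so the same search
 * is not repeated by a later call. */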

void x264_opencl_slicetype_end( x264_t *h )
{
#ifdef _WIN32
    if( h->param.b_opencl )
    {
        HANDLE id = GetCurrentThread();
        SetThreadPriority( id, h->opencl.lookahead_thread_pri );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
            SetThreadPriority( id, h->opencl.opencl_thread_pri );
    }
#endif
}

int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
{
    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
        return 0;
    else
    {
        int do_search[2];
        int dist_scale_factor = 128;
        const x264_weight_t *w = x264_weight_none;

        // avoid duplicating work
        frames[b]->i_cost_est[b-p0][p1-b] = 0;

        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
        if( do_search[0] )
        {
            if( h->param.analyse.i_weighted_pred && b == p1 )
            {
                x264_emms();
                x264_weights_analyse( h, frames[b], frames[p0], 1 );
                w = frames[b]->weight[0];
            }
            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
        }
        if( do_search[1] )
            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
        if( b == p1 )
            frames[b]->i_intra_mbs[b-p0] = 0;
        if( p1 != p0 )
            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
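        /* e.g. with b one frame after p0 and references two frames apart:
         * ((1 << 8) + 1) / 2 = 128, i.e. exactly halfway in .8 fixed point */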

        frames[b]->i_cost_est[b-p0][p1-b] = 0;
        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;

        x264_opencl_lowres_init( h, frames[b], lambda );

        if( do_search[0] )
        {
            x264_opencl_lowres_init( h, frames[p0], lambda );
            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
        }
        if( do_search[1] )
        {
            x264_opencl_lowres_init( h, frames[p1], lambda );
            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
        }
        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
        return 1;
    }
}

#endif