/*****************************************************************************
 * analyse.c: macroblock analysis
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Laurent Aimar <[email protected]>
 *          Loren Merritt <[email protected]>
 *          Fiona Glaser <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#define _ISOC99_SOURCE

#include "common/common.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
#include "analyse.h"
#include "rdo.c"

typedef struct
{
    /* 16x16 */
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;

typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;
    uint16_t *p_cost_ref[2];
    int i_mbrd;


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
    int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_chroma;
    int i_satd_chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2]; /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
    int b_early_terminate;

} x264_mb_analysis_t;

/* lambda = pow(2,qp/6-2) */
const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
{
      1,    1,    1,    1,    1,    1,    1,    1, /*  0- 7 */
      1,    1,    1,    1,    1,    1,    1,    1, /*  8-15 */
      2,    2,    2,    2,    3,    3,    3,    4, /* 16-23 */
      4,    4,    5,    6,    6,    7,    8,    9, /* 24-31 */
     10,   11,   13,   14,   16,   18,   20,   23, /* 32-39 */
     25,   29,   32,   36,   40,   45,   51,   57, /* 40-47 */
     64,   72,   81,   91,  102,  114,  128,  144, /* 48-55 */
    161,  181,  203,  228,  256,  287,  323,  362, /* 56-63 */
    406,  456,  512,  575,  645,  724,  813,  912, /* 64-71 */
   1024, 1149, 1290, 1448, 1625, 1825, 2048, 2299, /* 72-79 */
   2048, 2299, /* 80-81 */
};
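/* Sanity check against the formula above: qp=30 gives 2^(30/6-2) = 2^3 = 8,
 * and qp=72 gives 2^(72/6-2) = 2^10 = 1024; entries are the formula rounded
 * to the nearest integer. */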

/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Capped to avoid overflow */
const int x264_lambda2_tab[QP_MAX_MAX+1] =
{
       14,       18,       22,       28,       36,       45,      57,      72, /*  0- 7 */
       91,      115,      145,      182,      230,      290,     365,     460, /*  8-15 */
      580,      731,      921,     1161,     1462,     1843,    2322,    2925, /* 16-23 */
     3686,     4644,     5851,     7372,     9289,    11703,   14745,   18578, /* 24-31 */
    23407,    29491,    37156,    46814,    58982,    74313,   93628,  117964, /* 32-39 */
   148626,   187257,   235929,   297252,   374514,   471859,  594505,  749029, /* 40-47 */
   943718,  1189010,  1498059,  1887436,  2378021,  2996119, 3774873, 4756042, /* 48-55 */
  5992238,  7549747,  9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */
134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
};
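/* Worked example for the formula above: at qp=12, lambda=1, so
 * lambda2 = 1*1 * .9 * 256 = 230.4 -> 230; the 134217727 (2^27-1) tail is
 * the overflow cap mentioned above. */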

const uint8_t x264_exp2_lut[64] =
{
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
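/* The entries appear to follow exp2_lut[i] = round( 2^(i/64) * 256 ) - 256,
 * i.e. the fractional part of an exponential in Q8 (e.g. i=32:
 * 2^0.5 * 256 = 362.04, minus 256 -> 106, matching the table). */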

const float x264_log2_lut[128] =
{
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};

/* Avoid an int/float conversion. */
const float x264_log2_lz_lut[32] =
{
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
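/* log2_lut[i] = log2( 1 + i/128 ) and log2_lz_lut[i] = 31 - i; combined with
 * a count-leading-zeros, these presumably give a cheap piecewise log2(x) as
 * log2_lz_lut[clz(x)] + log2_lut[top 7 mantissa bits of x], without any
 * int/float conversion. */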

// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
{
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
              46,       58,       73,       92,      117,      147,
             185,      233,      294,      370,      466,      587,
             740,      932,     1174,     1480,     1864,     2349,
            2959,     3728,     4697,     5918,     7457,     9395,
           11837,    14914,    18790,    23674,    29828,    37581,
           47349,    59656,    75163,    94699,   119313,   150326,
          189399,   238627,   300652,   378798,   477255,   601304,
          757596,   954511,  1202608,  1515192,  1909022,  2405217,
         3030384,  3818045,  4810435,  6060769,  7636091,  9620872,
        12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
        48486154, 61088726, 76966972, 96972308,
       122177453,134217727,134217727,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
    },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
              27,       34,       43,       54,       68,       86,
             108,      136,      172,      216,      273,      343,
             433,      545,      687,      865,     1090,     1374,
            1731,     2180,     2747,     3461,     4361,     5494,
            6922,     8721,    10988,    13844,    17442,    21976,
           27688,    34885,    43953,    55377,    69771,    87906,
          110755,   139543,   175813,   221511,   279087,   351627,
          443023,   558174,   703255,   886046,  1116348,  1406511,
         1772093,  2232697,  2813022,  3544186,  4465396,  5626046,
         7088374,  8930791, 11252092, 14176748, 17861583, 22504184,
        28353495, 35723165, 45008368, 56706990,
        71446330, 90016736,113413980,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
    }
};
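/* The first entries are consistent with LAMBDA_BITS == 4: at qp=0 the
 * formulas above give .85*.85 * 2^(10-4) = 46.24 -> 46 (inter) and
 * .65*.65 * 2^6 = 27.04 -> 27 (intra), matching the tables. */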

#define MAX_CHROMA_LAMBDA_OFFSET 36
static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
{
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
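/* Entries follow 256 * 2^((i-12)/3), rounded down and saturated to 16 bits:
 * index 12 (i.e. qp == effective chroma qp in x264_mb_analyse_init_qp below)
 * is 256, unity in Q8, and every 3 steps of QP offset doubles the weight. */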

/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
{
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] =
{
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] =
{
    5, 3, 3, 1
};

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static uint16_t x264_cost_ref[QP_MAX+1][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];

static int init_costs( x264_t *h, float *logs, int qp )
{
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[qp] )
        return 0;
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv[qp] += 2*4*2048;
    for( int i = 0; i <= 2*4*2048; i++ )
    {
        h->cost_mv[qp][-i] =
        h->cost_mv[qp][i]  = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
    }
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( int i = 0; i < 3; i++ )
        for( int j = 0; j < 33; j++ )
            x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
    x264_pthread_mutex_unlock( &cost_ref_mutex );
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
    {
        for( int j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[qp][j] += 2*2048;
            for( int i = -2*2048; i < 2*2048; i++ )
                h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
        }
    }
    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
    for( int i = 0; i < 17; i++ )
        cost_i4x4_mode[i] = 3*lambda*(i!=8);
    return 0;
fail:
    return -1;
}
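/* The per-QP mv cost table filled below models the bit cost of a quarter-pel
 * mv delta d as roughly 2*log2(d+1) + 1.718 bits, i.e. approximately the
 * length of its signed Exp-Golomb code plus an empirically chosen constant,
 * scaled by lambda. */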

int x264_analyse_init_costs( x264_t *h )
{
    float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
    if( !logs )
        return -1;

    logs[0] = 0.718f;
    for( int i = 1; i <= 2*4*2048; i++ )
        logs[i] = log2f( i+1 ) * 2.0f + 1.718f;

    for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
        if( init_costs( h, logs, qp ) )
            goto fail;

    if( init_costs( h, logs, X264_LOOKAHEAD_QP ) )
        goto fail;

    x264_free( logs );
    return 0;
fail:
    x264_free( logs );
    return -1;
}

void x264_analyse_free_costs( x264_t *h )
{
    for( int i = 0; i < QP_MAX+1; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        if( h->cost_mv_fpel[i][0] )
            for( int j = 0; j < 4; j++ )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}

void x264_analyse_weight_frame( x264_t *h, int end )
{
    for( int j = 0; j < h->i_ref[0]; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref[0][j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << PARAM_INTERLACED;
            int offset, height;
            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
            height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;
            if( height )
                for( int k = j; k < h->i_ref[0]; k++ )
                    if( h->sh.weight[k][0].weightfn )
                    {
                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                 src + offset, frame->i_stride[0],
                                                 width, height, &h->sh.weight[k][0] );
                    }
            break;
        }
    }
}

/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    a->p_cost_mv = h->cost_mv[a->i_qp];
    a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}

static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
    a->i_lambda = x264_lambda_tab[qp];
    a->i_lambda2 = x264_lambda2_tab[qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;

    if( qp > QP_MAX_SPEC )
    {
        h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
        h->nr_residual_sum = h->nr_residual_sum_buf[1];
        h->nr_count = h->nr_count_buf[1];
        h->mb.b_noise_reduction = 1;
        qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
    }
    else
    {
        h->nr_offset = h->nr_offset_denoise;
        h->nr_residual_sum = h->nr_residual_sum_buf[0];
        h->nr_count = h->nr_count_buf[0];
        h->mb.b_noise_reduction = 0;
    }

    a->i_qp = h->mb.i_qp = qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[qp];
}

static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;

    x264_mb_analyse_init_qp( h, a, qp );

    h->mb.b_transform_8x8 = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
     * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
    uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
    a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;

    a->b_fast_intra = 0;
    a->b_avoid_topright = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
        {
            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    for( int j = 0; j < h->i_ref[i]; j++ )
                    {
                        x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
                    }

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( PARAM_INTERLACED )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            if( PARAM_INTERLACED )
            {
                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
                for( int i = 0; i < 3; i++ )
                {
                    int j = i == 2;
                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
                    h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
                    h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
                    h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
                }
            }
            else
            {
                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
                h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
                h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
                h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
                h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
                h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
            }
        }
        if( PARAM_INTERLACED )
        {
            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
            h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
            h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
        }
#undef CLIP_FMV

        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Always run in fast-intra mode for subme < 3 */
            if( h->mb.i_subpel_refine > 2 &&
              ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
        }
        else
            a->b_force_intra = 0;
    }
}

/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i8x8_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    }
};

static const int8_t i4x4_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
    }
};
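/* The helpers below turn the neighbour flags into a row index 0..4 for the
 * tables above: assuming MB_LEFT == 0x1 and MB_TOP == 0x2, masking with
 * (MB_TOP|MB_LEFT) already yields rows 0-3 (none/left/top/top+left), and the
 * full top+left+topleft combination is special-cased to row 4. */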

static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i16x16_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return chroma_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && (i&1);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i8x8_mode_available[avoid_topright][idx];
}

static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && ((i&5) == 5);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i4x4_mode_available[avoid_topright][idx];
}

/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
    ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};

    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}

/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;
    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}

static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    if( a->i_satd_chroma < COST_MAX )
        return;

    if( CHROMA444 )
    {
        if( !h->mb.b_chroma_me )
        {
            a->i_satd_chroma = 0;
            return;
        }

        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
        if( h->mb.b_lossless )
        {
            x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
            x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
        }
        else
        {
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
        }
        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
        return;
    }

    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];

    /* Prediction selection for chroma */
    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}

/* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    pixel *p_src = h->mb.pic.p_fenc[0];
    pixel *p_dst = h->mb.pic.p_fdec[0];
    static const int8_t intra_analysis_shortcut[2][2][2][5] =
    {
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {-1, -1, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
    };

    int idx;
    int lambda = a->i_lambda;

    /*---------------- Try all modes and calculate their scores ---------------*/
    /* Disabled i16x16 for AVC-Intra compat */
    if( !h->param.i_avcintra_class )
    {
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

        /* Not heavily tuned */
        static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
        int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;

        if( !h->mb.b_lossless && predict_mode[3] >= 0 )
        {
            h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
            a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
            a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
            a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );

            /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
            if( a->i_satd_i16x16 <= i16x16_thresh )
            {
                h->predict_16x16[I_PRED_16x16_P]( p_dst );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
                COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
            }
        }
        else
        {
            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_satd;
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_16x16( h, 0, i_mode );
                else
                    h->predict_16x16[i_mode]( p_dst );

                i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                         lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
                COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
                a->i_satd_i16x16_dir[i_mode] = i_satd;
            }
        }

        if( h->sh.i_type == SLICE_TYPE_B )
            /* cavlc mb type prefix */
            a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];

        if( a->i_satd_i16x16 > i16x16_thresh )
            return;
    }

    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[36] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        // FIXME some bias like in i4x4?
        int i_cost = lambda * 4; /* base predmode costs */
        h->mb.i_cbp_luma = 0;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict8x8[idx] = i_best;
                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                    h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    for( int i = 2; i >= 0; i-- )
                    {
                        int cost = satd[i];
                        a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
                        COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                    }

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
                {
                    int i_satd;
                    int i_mode = *predict_mode;

                    if( h->mb.b_lossless )
                        x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
                    else
                        h->predict_8x8[i_mode]( p_dst_by, edge );

                    i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        i_satd -= 3 * lambda;

                    COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                    a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
                }
                i_cost += i_best + 3*lambda;

                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
                else
                    h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
            }
            /* we need to encode this block now (for next ones) */
            x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
        }
        /* Not heavily tuned */
        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
        if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
        int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
        h->mb.i_cbp_luma = 0;

        if( a->b_early_terminate && a->i_mbrd )
            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
        {
            pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );

            if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict4x4[idx] = i_best;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                    h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                if( i_best > 0 )
                {
                    for( ; *predict_mode >= 0; predict_mode++ )
                    {
                        int i_satd;
                        int i_mode = *predict_mode;

                        if( h->mb.b_lossless )
                            x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
                        else
                            h->predict_4x4[i_mode]( p_dst_by );

                        i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                        if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        {
                            i_satd -= lambda * 3;
                            if( i_satd <= 0 )
                            {
                                i_best = i_satd;
                                a->i_predict4x4[idx] = i_mode;
                                break;
                            }
                        }

                        COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
                    }
                }

                i_cost += i_best + 3 * lambda;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
                else
                    h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
            }
            /* we need to encode this block now (for next ones) */
            x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
        }
        if( idx == 15 )
        {
            a->i_satd_i4x4 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
            }
        }
        else
            a->i_satd_i4x4 = COST_MAX;
    }
}

static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( !a->b_early_terminate )
        i_satd_thresh = COST_MAX;

    if( a->i_satd_i16x16 < i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 < i_satd_thresh )
    {
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 < i_satd_thresh )
    {
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint64_t i_satd, i_best;
    int plane_count = CHROMA444 ? 3 : 1;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    if( !CHROMA444 )
    {
        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
        if( predict_mode[1] >= 0 )
        {
            int8_t predict_mode_sorted[4];
            int i_max;
            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;

            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                    predict_mode_sorted[i_max++] = i_mode;
            }

            if( i_max > 0 )
            {
                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
                /* the previous thing encoded was x264_intra_rd(), so the pixels and
                 * coefs for the current chroma mode are still around, so we only
                 * have to recount the bits. */
                i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
                for( int i = 0; i < i_max; i++ )
                {
                    int i_mode = predict_mode_sorted[i];
                    if( h->mb.b_lossless )
                        x264_predict_lossless_chroma( h, i_mode );
                    else
                    {
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
                    }
                    /* if we've already found a mode that needs no residual, then
                     * probably any mode with a residual will be worse.
                     * so avoid dct on the remaining modes to improve speed. */
                    i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                }
                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
                h->mb.i_cbp_chroma = i_cbp_chroma_best;
            }
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
        int nnz[3] = {0};
        for( int idx = 0; idx < 16; idx++ )
        {
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                for( int p = 0; p < plane_count; p++ )
                    /* emulate missing topright samples */
                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
                    }
                }
            }

            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
            }

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
        pixel4 pels_h[3][2] = {{0}};
        pixel pels_v[3][7] = {{0}};
        uint16_t nnz[3][2] = {{0}}; //shut up gcc
        for( int idx = 0; idx < 4; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            int s8 = X264_SCAN8_0 + 2*x + 16*y;
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
            int cbp_luma_new = 0;
            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;

            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            for( int p = 0; p < plane_count; p++ )
                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
                    continue;

                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
                        if( !(idx&1) )
                            for( int j = 0; j < 7; j++ )
                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
                    }
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
                if( !(idx&1) )
                    for( int j = 0; j < 7; j++ )
                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
            }

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}

#define LOAD_FENC(m, src, xoff, yoff) \
{ \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
}

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
{ \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    if( CHROMA444 ) \
    { \
        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
    } \
    else \
        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = x264_weight_none; \
    (m)->i_ref = ref; \
}
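/* Note on LOAD_HPELS: p_fref[0..3] are the luma fullpel plane and its three
 * half-pel interpolations (H, V, HV); with 4:4:4 each chroma plane gets the
 * same four, while subsampled chroma is a single plane at p_fref[4]. */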
1373
1374
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1375
(m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1376
(m)->weight = h->sh.weight[i_ref];
1377
1378
#define REF_COST(list, ref) \
1379
(a->p_cost_ref[list][ref])
1380
1381
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1382
{
1383
x264_me_t m;
1384
int i_mvc;
1385
ALIGNED_4( int16_t mvc[8][2] );
1386
int i_halfpel_thresh = INT_MAX;
1387
int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
1388
1389
/* 16x16 Search on all ref frame */
1390
m.i_pixel = PIXEL_16x16;
1391
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1392
1393
a->l0.me16x16.cost = INT_MAX;
1394
for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1395
{
1396
m.i_ref_cost = REF_COST( 0, i_ref );
1397
i_halfpel_thresh -= m.i_ref_cost;
1398
1399
/* search with ref */
1400
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1401
LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1402
1403
x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1404
1405
if( h->mb.ref_blind_dupe == i_ref )
1406
{
1407
CP32( m.mv, a->l0.mvc[0][0] );
1408
x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1409
}
1410
else
1411
{
1412
x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1413
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1414
}
1415
1416
/* save mv for predicting neighbors */
1417
CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1418
CP32( a->l0.mvc[i_ref][0], m.mv );
1419
1420
/* early termination
1421
* SSD threshold would probably be better than SATD */
1422
if( i_ref == 0
1423
&& a->b_try_skip
1424
&& m.cost-m.cost_mv < 300*a->i_lambda
1425
&& abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1426
+ abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1427
&& x264_macroblock_probe_pskip( h ) )
1428
{
1429
h->mb.i_type = P_SKIP;
1430
x264_analyse_update_cache( h, a );
1431
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1432
return;
1433
}
1434
1435
m.cost += m.i_ref_cost;
1436
i_halfpel_thresh += m.i_ref_cost;
1437
1438
if( m.cost < a->l0.me16x16.cost )
1439
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1440
}
1441
1442
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1443
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1444
1445
h->mb.i_type = P_L0;
1446
if( a->i_mbrd )
1447
{
1448
x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1449
if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1450
{
1451
h->mb.i_partition = D_16x16;
1452
x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1453
a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1454
if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1455
h->mb.i_type = P_SKIP;
1456
}
1457
}
1458
}
1459
1460
static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
}

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
    {
        i_maxref = 0;
        CHECK_NEIGHBOUR(  -8 - 1 );
        CHECK_NEIGHBOUR(  -8 + 0 );
        CHECK_NEIGHBOUR(  -8 + 2 );
        CHECK_NEIGHBOUR(  -8 + 4 );
        CHECK_NEIGHBOUR(   0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
#undef CHECK_NEIGHBOUR

    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        a->i_satd8x8[0][i] = m->cost - m->cost_mv;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

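/* 16x8 (and 8x16 below) analysis: rather than searching every reference, each
 * half reuses the references already chosen by its two underlying 8x8
 * partitions (at most two distinct candidates), seeded with the 16x16 and 8x8
 * MVs saved in the mvc cache by the earlier searches. */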
static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost16x8 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost8x16 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}

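/* Chroma cost for sub-8x8 luma partitions.  For 4:4:4 the chroma planes are
 * full-size, so they are interpolated with the luma MC function on each plane;
 * for 4:2:0 and 4:2:2 a single mc_chroma call produces both U and V at the
 * scaled block size.  The result is the mbcmp (SATD/SAD) sum over both chroma
 * planes against the encoded frame. */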
static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                     pixel **p_fref, int i8x8, int size, int chroma )
{
    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
    pixel *pix2 = pix1+8;
    int i_stride = h->mb.pic.i_stride[1];
    int chroma_h_shift = chroma <= CHROMA_422;
    int chroma_v_shift = chroma == CHROMA_420;
    int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
    int i_ref = a->l0.me8x8[i8x8].i_ref;
    int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

    // FIXME weight can be done on 4x4 blocks even if mc is smaller
#define CHROMA4x4MC( width, height, me, x, y ) \
    if( chroma == CHROMA_444 ) \
    { \
        int mvx = (me).mv[0] + 4*2*x; \
        int mvy = (me).mv[1] + 4*2*y; \
        h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
        h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
    } \
    else \
    { \
        int offset = x + (2>>chroma_v_shift)*16*y; \
        int chroma_height = (2>>chroma_v_shift)*height; \
        h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
                         (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
        if( weight[1].weightfn ) \
            weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
        if( weight[2].weightfn ) \
            weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
    }

    if( size == PIXEL_4x4 )
    {
        x264_me_t *m = a->l0.me4x4[i8x8];
        CHROMA4x4MC( 2,2, m[0], 0,0 );
        CHROMA4x4MC( 2,2, m[1], 2,0 );
        CHROMA4x4MC( 2,2, m[2], 0,2 );
        CHROMA4x4MC( 2,2, m[3], 2,2 );
    }
    else if( size == PIXEL_8x4 )
    {
        x264_me_t *m = a->l0.me8x4[i8x8];
        CHROMA4x4MC( 4,2, m[0], 0,0 );
        CHROMA4x4MC( 4,2, m[1], 0,2 );
    }
    else
    {
        x264_me_t *m = a->l0.me4x8[i8x8];
        CHROMA4x4MC( 2,4, m[0], 0,0 );
        CHROMA4x4MC( 2,4, m[1], 2,0 );
    }
#undef CHROMA4x4MC

    int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
    int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
    return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}

static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
{
    if( CHROMA_FORMAT == CHROMA_444 )
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
    else
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
}

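/* Sub-8x8 searches: each 4x4 (and 8x4/4x8 below) block inherits the parent 8x8
 * partition's reference.  Only the first sub-block gets an extra MV candidate
 * (i_mvc is nonzero only for sub-block 0) -- the 8x8 MV here, and the first
 * 4x4 result for the 8x4/4x8 variants; the remaining blocks rely on their
 * spatial MV predictor from x264_mb_predict_mv alone. */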
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}

static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}

static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}

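/* Chroma cost of a bipredicted partition: motion-compensate the chroma of both
 * references, average them with the bipred weights, and compare the result
 * against the encoded frame with mbcmp. */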
static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
    ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
    ALIGNED_ARRAY_N( pixel,  bi, [2],[16*16] );
    int i_chroma_cost = 0;
    int chromapix = h->luma2chroma_pixel[i_pixel];

#define COST_BI_CHROMA( m0, m1, width, height ) \
{ \
    if( CHROMA444 ) \
    { \
        h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
    } \
    else \
    { \
        int v_shift = CHROMA_V_SHIFT; \
        int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
                         m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
                         m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
    } \
    h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
                  + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
}

    if( i_pixel == PIXEL_16x16 )
        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
    else if( i_pixel == PIXEL_16x8 )
        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
    else if( i_pixel == PIXEL_8x16 )
        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
    else
        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )

    return i_chroma_cost;
}

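/* Direct mode is costed by comparing fenc against fdec directly: the caller is
 * expected to have already run direct-mode MC into fdec, so no interpolation
 * happens here, only mbcmp.  With BSUB16x16 enabled the cost is also recorded
 * per 8x8 block, which the b8x8 analysis functions later reuse for
 * D_DIRECT_8x8. */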
static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    pixel *p_fenc = h->mb.pic.p_fenc[0];
    pixel *p_fdec = h->mb.pic.p_fdec[0];

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
    {
        int chromapix = h->luma2chroma_pixel[PIXEL_8x8];

        for( int i = 0; i < 4; i++ )
        {
            const int x = (i&1)*8;
            const int y = (i>>1)*8;
            a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
                                                              &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
            if( h->mb.b_chroma_me )
            {
                int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
                int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
                                       + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
            }
            a->i_cost16x16direct += a->i_cost8x8direct[i];

            /* mb type cost */
            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
        }
    }
    else
    {
        a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
        if( h->mb.b_chroma_me )
        {
            int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
            a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                                  + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
        }
    }
}

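/* B 16x16 analysis: search both lists, then build the BI candidate from the
 * two best single-list results.  A zero-MV BI candidate is also costed further
 * below, since in fades the weighted average of two references is often a
 * better match than anything the motion search finds. */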
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
    pixel *src0, *src1;
    intptr_t stride0 = 16, stride1 = 16;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[9][2] );
    int try_skip = a->b_try_skip;
    int list1_skipped = 0;
    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
    int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
                                (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};

    x264_me_t m;
    m.i_pixel = PIXEL_16x16;

    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* 16x16 Search on list 0 and list 1 */
    a->l0.me16x16.cost = INT_MAX;
    a->l1.me16x16.cost = INT_MAX;
    for( int l = 1; l >= 0; )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

        /* This loop is extremely munged in order to facilitate the following order of operations,
         * necessary for an efficient fast skip.
         * 1.  Search list1 ref0.
         * 2.  Search list0 ref0.
         * 3.  Try skip.
         * 4.  Search the rest of list0.
         * 5.  Go back and finish list1.
         */
        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
        {
            if( try_skip && l == 1 && i_ref > 0 )
            {
                list1_skipped = 1;
                break;
            }

            m.i_ref_cost = REF_COST( l, i_ref );

            /* search with ref */
            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );

            /* add ref cost */
            m.cost += m.i_ref_cost;

            if( m.cost < lX->me16x16.cost )
                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );

            /* save mv for predicting neighbors */
            CP32( lX->mvc[i_ref][0], m.mv );
            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );

            /* Fast skip detection. */
            if( i_ref == 0 && try_skip )
            {
                if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
                    abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
                {
                    try_skip = 0;
                }
                else if( !l )
                {
                    /* We already tested skip */
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, a );
                    return;
                }
            }
        }
        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
            break;
        if( list1_skipped && l == 0 )
            l = 1;
        else
            l--;
    }

    /* get cost of BI mode */
    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + ref_costs
                     + a->l0.bi16x16.cost_mv
                     + a->l1.bi16x16.cost_mv;

    if( h->mb.b_chroma_me )
        a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );

    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
    {
        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                   + ref_costs + l0_mv_cost + l1_mv_cost;

        if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
        {
            ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );

            if( CHROMA444 )
            {
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
            }
            else
            {
                ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
                int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
                int v_shift = CHROMA_V_SHIFT;

                if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
                {
                    int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
                {
                    int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

                cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
                        + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
            }
        }

        if( cost00 < a->i_cost16x16bi )
        {
            M32( a->l0.bi16x16.mv ) = 0;
            M32( a->l1.bi16x16.mv ) = 0;
            a->l0.bi16x16.cost_mv = l0_mv_cost;
            a->l1.bi16x16.cost_mv = l1_mv_cost;
            a->i_cost16x16bi = cost00;
        }
    }

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}

static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
{
    int x = 2*(i&1);
    int y = i&2;

    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
            break;
        case D_L0_8x4:
            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
            break;
        case D_L0_4x8:
            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
            break;
        case D_L0_4x4:
            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
            break;
        default:
            x264_log( h, X264_LOG_ERROR, "internal error\n" );
            break;
    }
}

static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
{
    int x = 2*(idx&1);
    int y = idx&2;
    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
}

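/* Cache the ref/mv of a B partition for whichever lists it actually uses; the
 * unused list is cleared (ref -1, zero mv, and zero mvd when b_mvd is set) so
 * that later prediction reads well-defined values. */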
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
    }

static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    int x = 2*(i&1);
    int y = i&2;
    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
    {
        x264_mb_load_mv_direct8x8( h, i );
        if( b_mvd )
        {
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        }
    }
    else
    {
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
    }
}
static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
}
static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
}
#undef CACHE_MV_BI

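/* B 8x8 analysis with mixed references: like the P version, each 8x8 partition
 * searches its own reference in each list; L0, L1, BI and direct are then
 * compared per partition via COPY2_IF_LT. */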
static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
    int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    #define CHECK_NEIGHBOUR(i)\
    {\
        int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
        if( ref > i_maxref[l] )\
            i_maxref[l] = ref;\
    }

    for( int l = 0; l < 2; l++ )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
        if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
        {
            i_maxref[l] = 0;
            CHECK_NEIGHBOUR(  -8 - 1 );
            CHECK_NEIGHBOUR(  -8 + 0 );
            CHECK_NEIGHBOUR(  -8 + 2 );
            CHECK_NEIGHBOUR(  -8 + 4 );
            CHECK_NEIGHBOUR(   0 - 1 );
            CHECK_NEIGHBOUR( 2*8 - 1 );
        }
    }

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi;
        intptr_t stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

            lX->me8x8[i].cost = INT_MAX;
            for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
            {
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );

                x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x8[i].cost )
                {
                    h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
                    a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
                }

                /* save mv for predicting other partitions within this MB */
                CP32( lX->mvc[i_ref][i+1], m.mv );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
                                a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
                                a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
                              h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );

        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
                       + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
                       + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    pixel **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
          h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {8,8};
        pixel *src[2];

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

            m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
            m->i_ref = lX->me16x16.i_ref;

            LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            a->i_satd8x8[l][i] = m->cost - m->cost_mv;
            m->cost += m->i_ref_cost;

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* save mv for predicting other partitions within this MB */
            CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, x264_weight_none );
            i_part_cost_bi += m->cost_mv + m->i_ref_cost;
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {16,16};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_16x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me16x8[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
                CP32( mvc[2], lX->mvc[i_ref][2*i+2] );

                x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me16x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
                                a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
                                a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
                               h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
                        + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
                        + a->l1.me16x8[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */

        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost16x8bi = COST_MAX;
            return;
        }

        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
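    /* The composite 16x8 (and 8x16) mb type is derived from the per-half
     * partition decisions: D_L0_8x8, D_L1_8x8 and D_BI_8x8 are spaced four
     * apart in the enum, so >>2 maps them to 0/1/2, and the two halves index
     * a 3x3 grid of B_<top>_<bottom> types starting at B_L0_L0. */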
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
}

static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x16;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me8x16[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][i+1] );
                CP32( mvc[2], lX->mvc[i_ref][i+3] );

                x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x16[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
                                a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
                                a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
                        + a->l1.me8x16[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;

        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost8x16bi = COST_MAX;
            return;
        }

        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}

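/* P-frame RD refinement: re-score the SATD-selected partition candidates with
 * full rate-distortion cost.  Candidates are only re-evaluated if their SATD
 * cost is within 5/4 of the best (16x16 uses a looser 3/2 bound), since
 * anything worse is very unlikely to win after RD. */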
static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
    int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
    {
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l0.i_cost16x8 < thresh )
    {
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 < thresh )
    {
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 < thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( int i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > sub8x8_thresh )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    if( subtype == btype )
                        continue;
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                if( h->mb.i_sub_partition[i] != btype )
                {
                    h->mb.i_sub_partition[i] = btype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                }
            }
        }
        else
            x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}

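/* B-frame RD refinement, same idea as the P version: each mode whose SATD cost
 * is within 17/16 (18/16 with psy-RD) of the best inter cost gets a full RD
 * evaluation; direct mode is done first because it reuses the MC result still
 * sitting in fdec. */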
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;
    /* L0 */
    if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* L1 */
    if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* BI */
    if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x8 */
    if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    /* 16x8 */
    if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x16 */
    if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}

static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    int i_biweight;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
            {
                i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
            }
            break;
        case D_16x8:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
                }
            break;
        case D_8x16:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
                }
            break;
        case D_8x8:
            for( int i = 0; i < 4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
                }
            break;
    }
}

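/* Transform size decision: approximate the 8x8 transform's coding cost with
 * SA8D and the 4x4 transform's with SATD over the motion-compensated
 * prediction error, and pick whichever is smaller.  The merged sa8d_satd
 * function computes both in one pass, packing SA8D into the low 32 bits of
 * the returned uint64 and SATD into the high 32 bits. */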
static inline void x264_mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
        int i_cost8 = 0, i_cost4 = 0;
        /* Not all platforms have a merged SATD function */
        if( h->pixf.sa8d_satd[PIXEL_16x16] )
        {
            uint64_t cost = 0;
            for( int p = 0; p < plane_count; p++ )
            {
                cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                        h->mb.pic.p_fdec[p], FDEC_STRIDE );
            }
            i_cost8 = (uint32_t)cost;
            i_cost4 = (uint32_t)(cost >> 32);
        }
        else
        {
            for( int p = 0; p < plane_count; p++ )
            {
                i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
                i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
            }
        }

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}

static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
    {
        uint32_t subpart_bak = M32( h->mb.i_sub_partition );
        /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
        if( h->mb.i_type == P_8x8 )
            M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
        else if( !x264_transform_allowed[h->mb.i_type] )
            return;

        x264_analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
        int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
        {
            h->mb.b_transform_8x8 ^= 1;
            M32( h->mb.i_sub_partition ) = subpart_bak;
        }
    }
}

/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
    int origcbp = h->mb.cbp[h->mb.i_mb_xy];

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from previous
         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
         * With psy-RD, allow 1 failure when moving quant away from previous quant,
         * allow 2 failures when moving quant towards previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;

        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
         * (up to a point) will too.  So, jump down to where the threshold will kick in
         * and check the QP there.  If the CBP is still empty, skip the main loop.
         * If it isn't empty, we would have ended up having to check this QP anyways,
         * so as long as we store it for later lookup, we lose nothing. */
        int already_checked_qp = -1;
        int already_checked_cost = COST_MAX;
        if( direction == -1 )
        {
            if( !origcbp )
            {
                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
                if( !h->mb.cbp[h->mb.i_mb_xy] )
                {
                    /* If our empty-CBP block is lower QP than the last QP,
                     * the last QP almost surely doesn't have a CBP either. */
                    if( h->mb.i_last_qp > h->mb.i_qp )
                        last_qp_tried = 1;
                    break;
                }
                already_checked_qp = h->mb.i_qp;
                h->mb.i_qp = orig_qp;
            }
        }

        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            if( h->mb.i_qp == already_checked_qp )
                cost = already_checked_cost;
            else
            {
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                cost = x264_rd_cost_mb( h, a->i_lambda2 );
                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
            }

            /* We can't assume that the costs are monotonic over QPs.
             * Tie case-as-failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check transform again; decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}

/*****************************************************************************
3044
* x264_macroblock_analyse:
3045
*****************************************************************************/
3046
void x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;

    h->mb.i_qp = x264_ratecontrol_mb_qp( h );
    /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
     * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
    if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
        h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;

    if( h->param.analyse.b_mb_info )
        h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
intra_analysis:
        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
        x264_mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            x264_intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;

        else if( analysis.i_mbrd >= 2 )
            x264_intra_rd_refine( h, &analysis );
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        analysis.b_try_skip = 0;
        if( analysis.b_force_intra )
        {
            if( !h->param.analyse.b_psy )
            {
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
        }
        else
        {
            /* Special fast-skip logic using information from mb_info. */
            if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
            {
                if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
                    h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
                {
                    h->mb.i_partition = D_16x16;
                    /* Use the P-SKIP MV if we can... */
                    if( !M32(h->mb.cache.pskip_mv) )
                    {
                        b_skip = 1;
                        h->mb.i_type = P_SKIP;
                    }
                    /* Otherwise, just force a 16x16 block. */
                    else
                    {
                        h->mb.i_type = P_L0;
                        analysis.l0.me16x16.i_ref = 0;
                        M32( analysis.l0.me16x16.mv ) = 0;
                    }
                    goto skip_analysis;
                }
                /* Reset the information accordingly */
                else if( h->param.analyse.b_mb_info_update )
                    h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
            }

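            /* With frame threading, the P-SKIP MV may point into a part of the
             * reference frame that other threads haven't reconstructed yet; such
             * a skip is unusable, so the skip shortcuts below are bypassed. */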
            int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
                b_skip = 1;
            /* Fast P_SKIP detection */
            else if( h->param.analyse.b_fast_pskip )
            {
                if( skip_invalid )
                    // FIXME don't need to check this if the reference frame is done
                    {}
                else if( h->param.analyse.i_subpel_refine >= 3 )
                    analysis.b_try_skip = 1;
                else if( h->mb.i_mb_type_left[0] == P_SKIP ||
                         h->mb.i_mb_type_top == P_SKIP ||
                         h->mb.i_mb_type_topleft == P_SKIP ||
                         h->mb.i_mb_type_topright == P_SKIP )
                    b_skip = x264_macroblock_probe_pskip( h );
            }
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
skip_analysis:
            /* Set up MVs for future predictors */
            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter, i_satd_intra;

            x264_mb_analyse_load_costs( h, &analysis );

            x264_mb_analyse_inter_p16x16( h, &analysis );

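            /* 16x16 analysis can itself promote the MB to P_SKIP (via b_try_skip);
             * if it did, just zero the MV predictors for the remaining references
             * and stop here. */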
            if( h->mb.i_type == P_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
                        int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
                        if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        x264_mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }

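            /* Rectangular partitions are gated by a cheap estimate: the SATD of the
             * two constituent 8x8 halves plus their averaged MV/ref cost.  The full
             * 16x8/8x16 search only runs when the estimate suggests it could win. */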
            /* Now do 16x8/8x16 */
            int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
            {
                int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
                                      + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
                                  + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }

            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                i_cost = 0;
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;

                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }

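            /* With chroma ME, the inter scores above already include chroma distortion,
             * so intra must be charged for chroma too to keep the comparison fair; for
             * non-4:4:4 the chroma intra score is computed first and subtracted from
             * the intra search's threshold. */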
            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_cost );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_cost );

            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.i_rd16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

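            /* Periodic Intra Refresh requires this MB to be intra.  If inter won the
             * mode decision anyway, apply "intra masking": encode the chosen inter MB,
             * copy the reconstruction into fenc, and redo the analysis as intra so the
             * forced intra block mimics the inter prediction as closely as possible. */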
            if( analysis.b_force_intra && !IS_INTRA(i_type) )
            {
                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order
                 * to make it appear as if it were an inter block. */
                x264_analyse_update_cache( h, &analysis );
                x264_macroblock_encode( h );
                for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
                    h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
                if( !CHROMA444 )
                {
                    int height = 16 >> CHROMA_V_SHIFT;
                    h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
                    h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
                }
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }

            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    x264_analyse_update_cache( h, &analysis );
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
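        /* direct=auto: evaluate both spatial and temporal direct prediction for this
         * MB and record, via i_direct_score, which flavor would have allowed a skip;
         * the per-frame totals are later used to pick the direct mode. */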
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
                    // 16x8
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;

                    // 8x16
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
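                /* Fold each half's best list choice (L0/L1/BI) into a tentative B
                 * macroblock type so its header cost can be charged to the estimate. */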
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];

                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }

            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }

            i_satd_inter = i_cost;

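            /* With mbrd, re-rank all candidate B modes by true RD cost, including the
             * skip candidate's SSD-based cost, instead of the SATD estimates above. */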
            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }

    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it.  Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
            h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

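    /* QPRD (i_mbrd == 3): re-decide this MB's QP by measuring the actual RD cost
     * at neighboring QPs (see x264_mb_analyse_qp_rd above). */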
    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

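    /* Sanity check for frame threading: every MV used here must stay within the
     * rows of its reference frame that other threads have already reconstructed. */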
#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"