/*****************************************************************************
 * analyse.c: macroblock analysis
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Laurent Aimar <[email protected]>
 *          Loren Merritt <[email protected]>
 *          Fiona Glaser <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#define _ISOC99_SOURCE

#include "common/common.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
#include "analyse.h"
#include "rdo.c"

typedef struct
{
    /* 16x16 */
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;

typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;
    uint16_t *p_cost_ref[2];
    int i_mbrd;


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
    int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_chroma;
    int i_satd_chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2]; /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
    int b_early_terminate;

} x264_mb_analysis_t;

/* lambda = pow(2,qp/6-2) */
const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
{
      1,    1,    1,    1,    1,    1,    1,    1, /*  0- 7 */
      1,    1,    1,    1,    1,    1,    1,    1, /*  8-15 */
      2,    2,    2,    2,    3,    3,    3,    4, /* 16-23 */
      4,    4,    5,    6,    6,    7,    8,    9, /* 24-31 */
     10,   11,   13,   14,   16,   18,   20,   23, /* 32-39 */
     25,   29,   32,   36,   40,   45,   51,   57, /* 40-47 */
     64,   72,   81,   91,  102,  114,  128,  144, /* 48-55 */
    161,  181,  203,  228,  256,  287,  323,  362, /* 56-63 */
    406,  456,  512,  575,  645,  724,  813,  912, /* 64-71 */
   1024, 1149, 1290, 1448, 1625, 1825, 2048, 2299, /* 72-79 */
   2048, 2299, /* 80-81 */
};
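/* Sanity check against the formula above: qp=30 gives 2^(30/6-2) = 2^3 = 8,
 * and qp=72 gives 2^(72/6-2) = 2^10 = 1024; entries are the formula rounded
 * to the nearest integer. */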

/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Capped to avoid overflow */
const int x264_lambda2_tab[QP_MAX_MAX+1] =
{
       14,       18,       22,       28,       36,       45,      57,      72, /*  0- 7 */
       91,      115,      145,      182,      230,      290,     365,     460, /*  8-15 */
      580,      731,      921,     1161,     1462,     1843,    2322,    2925, /* 16-23 */
     3686,     4644,     5851,     7372,     9289,    11703,   14745,   18578, /* 24-31 */
    23407,    29491,    37156,    46814,    58982,    74313,   93628,  117964, /* 32-39 */
   148626,   187257,   235929,   297252,   374514,   471859,  594505,  749029, /* 40-47 */
   943718,  1189010,  1498059,  1887436,  2378021,  2996119, 3774873, 4756042, /* 48-55 */
  5992238,  7549747,  9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */
134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
};
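/* Worked example for the formula above: at qp=12, lambda=1, so
 * lambda2 = 1*1 * .9 * 256 = 230.4 -> 230; the 134217727 (2^27-1) tail is
 * the overflow cap mentioned above. */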

const uint8_t x264_exp2_lut[64] =
{
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
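/* The entries appear to follow exp2_lut[i] = round( 2^(i/64) * 256 ) - 256,
 * i.e. the fractional part of an exponential in Q8 (e.g. i=32:
 * 2^0.5 * 256 = 362.04, minus 256 -> 106, matching the table). */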

const float x264_log2_lut[128] =
{
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};

/* Avoid an int/float conversion. */
const float x264_log2_lz_lut[32] =
{
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
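/* log2_lut[i] = log2( 1 + i/128 ) and log2_lz_lut[i] = 31 - i; combined with
 * a count-leading-zeros, these presumably give a cheap piecewise log2(x) as
 * log2_lz_lut[clz(x)] + log2_lut[top 7 mantissa bits of x], without any
 * int/float conversion. */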

// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
{
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
              46,       58,       73,       92,      117,      147,
             185,      233,      294,      370,      466,      587,
             740,      932,     1174,     1480,     1864,     2349,
            2959,     3728,     4697,     5918,     7457,     9395,
           11837,    14914,    18790,    23674,    29828,    37581,
           47349,    59656,    75163,    94699,   119313,   150326,
          189399,   238627,   300652,   378798,   477255,   601304,
          757596,   954511,  1202608,  1515192,  1909022,  2405217,
         3030384,  3818045,  4810435,  6060769,  7636091,  9620872,
        12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
        48486154, 61088726, 76966972, 96972308,
       122177453,134217727,134217727,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
    },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
              27,       34,       43,       54,       68,       86,
             108,      136,      172,      216,      273,      343,
             433,      545,      687,      865,     1090,     1374,
            1731,     2180,     2747,     3461,     4361,     5494,
            6922,     8721,    10988,    13844,    17442,    21976,
           27688,    34885,    43953,    55377,    69771,    87906,
          110755,   139543,   175813,   221511,   279087,   351627,
          443023,   558174,   703255,   886046,  1116348,  1406511,
         1772093,  2232697,  2813022,  3544186,  4465396,  5626046,
         7088374,  8930791, 11252092, 14176748, 17861583, 22504184,
        28353495, 35723165, 45008368, 56706990,
        71446330, 90016736,113413980,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
    }
};
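/* The first entries are consistent with LAMBDA_BITS == 4: at qp=0 the
 * formulas above give .85*.85 * 2^(10-4) = 46.24 -> 46 (inter) and
 * .65*.65 * 2^6 = 27.04 -> 27 (intra), matching the tables. */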

#define MAX_CHROMA_LAMBDA_OFFSET 36
static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
{
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
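/* Entries follow 256 * 2^((i-12)/3), rounded down and saturated to 16 bits:
 * index 12 (i.e. qp == effective chroma qp in x264_mb_analyse_init_qp below)
 * is 256, unity in Q8, and every 3 steps of QP offset doubles the weight. */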

/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
{
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] =
{
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] =
{
    5, 3, 3, 1
};

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static uint16_t x264_cost_ref[QP_MAX+1][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];

static int init_costs( x264_t *h, float *logs, int qp )
{
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[qp] )
        return 0;
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv[qp] += 2*4*2048;
    for( int i = 0; i <= 2*4*2048; i++ )
    {
        h->cost_mv[qp][-i] =
        h->cost_mv[qp][i]  = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
    }
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( int i = 0; i < 3; i++ )
        for( int j = 0; j < 33; j++ )
            x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
    x264_pthread_mutex_unlock( &cost_ref_mutex );
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
    {
        for( int j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[qp][j] += 2*2048;
            for( int i = -2*2048; i < 2*2048; i++ )
                h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
        }
    }
    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
    for( int i = 0; i < 17; i++ )
        cost_i4x4_mode[i] = 3*lambda*(i!=8);
    return 0;
fail:
    return -1;
}
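/* The per-QP mv cost table filled below models the bit cost of a quarter-pel
 * mv delta d as roughly 2*log2(d+1) + 1.718 bits, i.e. approximately the
 * length of its signed Exp-Golomb code plus an empirically chosen constant,
 * scaled by lambda. */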

int x264_analyse_init_costs( x264_t *h )
{
    float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
    if( !logs )
        return -1;

    logs[0] = 0.718f;
    for( int i = 1; i <= 2*4*2048; i++ )
        logs[i] = log2f( i+1 ) * 2.0f + 1.718f;

    for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
        if( init_costs( h, logs, qp ) )
            goto fail;

    if( init_costs( h, logs, X264_LOOKAHEAD_QP ) )
        goto fail;

    x264_free( logs );
    return 0;
fail:
    x264_free( logs );
    return -1;
}

void x264_analyse_free_costs( x264_t *h )
{
    for( int i = 0; i < QP_MAX+1; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        if( h->cost_mv_fpel[i][0] )
            for( int j = 0; j < 4; j++ )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}

void x264_analyse_weight_frame( x264_t *h, int end )
{
    for( int j = 0; j < h->i_ref[0]; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref[0][j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << PARAM_INTERLACED;
            int offset, height;
            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
            height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;
            if( height )
                for( int k = j; k < h->i_ref[0]; k++ )
                    if( h->sh.weight[k][0].weightfn )
                    {
                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                 src + offset, frame->i_stride[0],
                                                 width, height, &h->sh.weight[k][0] );
                    }
            break;
        }
    }
}

/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    a->p_cost_mv = h->cost_mv[a->i_qp];
    a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}

static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
    a->i_lambda = x264_lambda_tab[qp];
    a->i_lambda2 = x264_lambda2_tab[qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;

    if( qp > QP_MAX_SPEC )
    {
        h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
        h->nr_residual_sum = h->nr_residual_sum_buf[1];
        h->nr_count = h->nr_count_buf[1];
        h->mb.b_noise_reduction = 1;
        qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
    }
    else
    {
        h->nr_offset = h->nr_offset_denoise;
        h->nr_residual_sum = h->nr_residual_sum_buf[0];
        h->nr_count = h->nr_count_buf[0];
        h->mb.b_noise_reduction = 0;
    }

    a->i_qp = h->mb.i_qp = qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[qp];
}

static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;

    x264_mb_analyse_init_qp( h, a, qp );

    h->mb.b_transform_8x8 = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
     * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
    uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
    a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;

    a->b_fast_intra = 0;
    a->b_avoid_topright = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
        {
            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    for( int j = 0; j < h->i_ref[i]; j++ )
                    {
                        x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
                    }

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( PARAM_INTERLACED )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            if( PARAM_INTERLACED )
            {
                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
                for( int i = 0; i < 3; i++ )
                {
                    int j = i == 2;
                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
                    h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
                    h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
                    h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
                }
            }
            else
            {
                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
                h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
                h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
                h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
                h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
                h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
            }
        }
        if( PARAM_INTERLACED )
        {
            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
            h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
            h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
        }
#undef CLIP_FMV

        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Always run in fast-intra mode for subme < 3 */
            if( h->mb.i_subpel_refine > 2 &&
              ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
        }
        else
            a->b_force_intra = 0;
    }
}

/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i8x8_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    }
};

static const int8_t i4x4_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
    }
};
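/* The helpers below turn the neighbour flags into a row index 0..4 for the
 * tables above: assuming MB_LEFT == 0x1 and MB_TOP == 0x2, masking with
 * (MB_TOP|MB_LEFT) already yields rows 0-3 (none/left/top/top+left), and the
 * full top+left+topleft combination is special-cased to row 4. */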

static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i16x16_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return chroma_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && (i&1);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i8x8_mode_available[avoid_topright][idx];
}

static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && ((i&5) == 5);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i4x4_mode_available[avoid_topright][idx];
}

/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
    ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};

    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}

/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;
    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}

static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    if( a->i_satd_chroma < COST_MAX )
        return;

    if( CHROMA444 )
    {
        if( !h->mb.b_chroma_me )
        {
            a->i_satd_chroma = 0;
            return;
        }

        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
        if( h->mb.b_lossless )
        {
            x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
            x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
        }
        else
        {
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
        }
        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
        return;
    }

    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];

    /* Prediction selection for chroma */
    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}

/* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    pixel *p_src = h->mb.pic.p_fenc[0];
    pixel *p_dst = h->mb.pic.p_fdec[0];
    static const int8_t intra_analysis_shortcut[2][2][2][5] =
    {
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {-1, -1, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
    };

    int idx;
    int lambda = a->i_lambda;

    /*---------------- Try all modes and calculate their scores ---------------*/
    /* Disabled i16x16 for AVC-Intra compat */
    if( !h->param.i_avcintra_class )
    {
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

        /* Not heavily tuned */
        static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
        int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;

        if( !h->mb.b_lossless && predict_mode[3] >= 0 )
        {
            h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
            a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
            a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
            a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );

            /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
            if( a->i_satd_i16x16 <= i16x16_thresh )
            {
                h->predict_16x16[I_PRED_16x16_P]( p_dst );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
                COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
            }
        }
        else
        {
            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_satd;
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_16x16( h, 0, i_mode );
                else
                    h->predict_16x16[i_mode]( p_dst );

                i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                         lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
                COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
                a->i_satd_i16x16_dir[i_mode] = i_satd;
            }
        }

        if( h->sh.i_type == SLICE_TYPE_B )
            /* cavlc mb type prefix */
            a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];

        if( a->i_satd_i16x16 > i16x16_thresh )
            return;
    }

    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[36] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        // FIXME some bias like in i4x4?
        int i_cost = lambda * 4; /* base predmode costs */
        h->mb.i_cbp_luma = 0;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict8x8[idx] = i_best;
                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                    h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    for( int i = 2; i >= 0; i-- )
                    {
                        int cost = satd[i];
                        a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
                        COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                    }

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
                {
                    int i_satd;
                    int i_mode = *predict_mode;

                    if( h->mb.b_lossless )
                        x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
                    else
                        h->predict_8x8[i_mode]( p_dst_by, edge );

                    i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        i_satd -= 3 * lambda;

                    COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                    a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
                }
                i_cost += i_best + 3*lambda;

                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
                else
                    h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
            }
            /* we need to encode this block now (for next ones) */
            x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
        }
        /* Not heavily tuned */
        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
        if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
        int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
        h->mb.i_cbp_luma = 0;

        if( a->b_early_terminate && a->i_mbrd )
            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
        {
            pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );

            if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict4x4[idx] = i_best;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                    h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                if( i_best > 0 )
                {
                    for( ; *predict_mode >= 0; predict_mode++ )
                    {
                        int i_satd;
                        int i_mode = *predict_mode;

                        if( h->mb.b_lossless )
                            x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
                        else
                            h->predict_4x4[i_mode]( p_dst_by );

                        i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                        if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        {
                            i_satd -= lambda * 3;
                            if( i_satd <= 0 )
                            {
                                i_best = i_satd;
                                a->i_predict4x4[idx] = i_mode;
                                break;
                            }
                        }

                        COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
                    }
                }

                i_cost += i_best + 3 * lambda;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
                else
                    h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
            }
            /* we need to encode this block now (for next ones) */
            x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
        }
        if( idx == 15 )
        {
            a->i_satd_i4x4 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
            }
        }
        else
            a->i_satd_i4x4 = COST_MAX;
    }
}

static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( !a->b_early_terminate )
        i_satd_thresh = COST_MAX;

    if( a->i_satd_i16x16 < i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 < i_satd_thresh )
    {
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 < i_satd_thresh )
    {
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint64_t i_satd, i_best;
    int plane_count = CHROMA444 ? 3 : 1;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    if( !CHROMA444 )
    {
        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
        if( predict_mode[1] >= 0 )
        {
            int8_t predict_mode_sorted[4];
            int i_max;
            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;

            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                    predict_mode_sorted[i_max++] = i_mode;
            }

            if( i_max > 0 )
            {
                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
                /* the previous thing encoded was x264_intra_rd(), so the pixels and
                 * coefs for the current chroma mode are still around, so we only
                 * have to recount the bits. */
                i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
                for( int i = 0; i < i_max; i++ )
                {
                    int i_mode = predict_mode_sorted[i];
                    if( h->mb.b_lossless )
                        x264_predict_lossless_chroma( h, i_mode );
                    else
                    {
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
                    }
                    /* if we've already found a mode that needs no residual, then
                     * probably any mode with a residual will be worse.
                     * so avoid dct on the remaining modes to improve speed. */
                    i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                }
                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
                h->mb.i_cbp_chroma = i_cbp_chroma_best;
            }
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
        int nnz[3] = {0};
        for( int idx = 0; idx < 16; idx++ )
        {
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                for( int p = 0; p < plane_count; p++ )
                    /* emulate missing topright samples */
                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
                    }
                }
            }

            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
            }

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
        pixel4 pels_h[3][2] = {{0}};
        pixel pels_v[3][7] = {{0}};
        uint16_t nnz[3][2] = {{0}}; //shut up gcc
        for( int idx = 0; idx < 4; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            int s8 = X264_SCAN8_0 + 2*x + 16*y;
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
            int cbp_luma_new = 0;
            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;

            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            for( int p = 0; p < plane_count; p++ )
                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
                    continue;

                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
                        if( !(idx&1) )
                            for( int j = 0; j < 7; j++ )
                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
                    }
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
                if( !(idx&1) )
                    for( int j = 0; j < 7; j++ )
                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
            }

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}

#define LOAD_FENC(m, src, xoff, yoff) \
{ \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
}

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
{ \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    if( CHROMA444 ) \
    { \
        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
    } \
    else \
        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = x264_weight_none; \
    (m)->i_ref = ref; \
}
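/* Note on LOAD_HPELS: p_fref[0..3] are the luma fullpel plane and its three
 * half-pel interpolations (H, V, HV); with 4:4:4 each chroma plane gets the
 * same four, while subsampled chroma is a single plane at p_fref[4]. */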
1373
1374
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1375
(m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1376
(m)->weight = h->sh.weight[i_ref];
1377
1378
#define REF_COST(list, ref) \
1379
(a->p_cost_ref[list][ref])
1380
1381
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1382
{
1383
x264_me_t m;
1384
int i_mvc;
1385
ALIGNED_4( int16_t mvc[8][2] );
1386
int i_halfpel_thresh = INT_MAX;
1387
int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
1388
1389
/* 16x16 Search on all ref frame */
1390
m.i_pixel = PIXEL_16x16;
1391
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1392
1393
a->l0.me16x16.cost = INT_MAX;
1394
for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1395
{
1396
m.i_ref_cost = REF_COST( 0, i_ref );
1397
i_halfpel_thresh -= m.i_ref_cost;
1398
1399
/* search with ref */
1400
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1401
LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1402
1403
x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1404
1405
if( h->mb.ref_blind_dupe == i_ref )
1406
{
1407
CP32( m.mv, a->l0.mvc[0][0] );
1408
x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1409
}
1410
else
1411
{
1412
x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1413
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1414
}
1415
1416
/* save mv for predicting neighbors */
1417
CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1418
CP32( a->l0.mvc[i_ref][0], m.mv );
1419
1420
/* early termination
1421
* SSD threshold would probably be better than SATD */
1422
if( i_ref == 0
1423
&& a->b_try_skip
1424
&& m.cost-m.cost_mv < 300*a->i_lambda
1425
&& abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1426
+ abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1427
&& x264_macroblock_probe_pskip( h ) )
1428
{
1429
h->mb.i_type = P_SKIP;
1430
x264_analyse_update_cache( h, a );
1431
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1432
return;
1433
}
1434
1435
m.cost += m.i_ref_cost;
1436
i_halfpel_thresh += m.i_ref_cost;
1437
1438
if( m.cost < a->l0.me16x16.cost )
1439
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1440
}
1441
1442
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1443
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1444
1445
h->mb.i_type = P_L0;
1446
if( a->i_mbrd )
1447
{
1448
x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1449
if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1450
{
1451
h->mb.i_partition = D_16x16;
1452
x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1453
a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1454
if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1455
h->mb.i_type = P_SKIP;
1456
}
1457
}
1458
}
1459
1460
static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
}

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
    {
        i_maxref = 0;
        CHECK_NEIGHBOUR(  -8 - 1 );
        CHECK_NEIGHBOUR(  -8 + 0 );
        CHECK_NEIGHBOUR(  -8 + 2 );
        CHECK_NEIGHBOUR(  -8 + 4 );
        CHECK_NEIGHBOUR(   0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
#undef CHECK_NEIGHBOUR

    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        a->i_satd8x8[0][i] = m->cost - m->cost_mv;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

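/* 16x8 (and 8x16 below) analysis: rather than searching every reference, each
 * half reuses the references already chosen by its two underlying 8x8
 * partitions (at most two distinct candidates), seeded with the 16x16 and 8x8
 * MVs saved in the mvc cache by the earlier searches. */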
static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost16x8 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost8x16 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}

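/* Chroma cost for sub-8x8 luma partitions.  For 4:4:4 the chroma planes are
 * full-size, so they are interpolated with the luma MC function on each plane;
 * for 4:2:0 and 4:2:2 a single mc_chroma call produces both U and V at the
 * scaled block size.  The result is the mbcmp (SATD/SAD) sum over both chroma
 * planes against the encoded frame. */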
static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                     pixel **p_fref, int i8x8, int size, int chroma )
{
    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
    pixel *pix2 = pix1+8;
    int i_stride = h->mb.pic.i_stride[1];
    int chroma_h_shift = chroma <= CHROMA_422;
    int chroma_v_shift = chroma == CHROMA_420;
    int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
    int i_ref = a->l0.me8x8[i8x8].i_ref;
    int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

    // FIXME weight can be done on 4x4 blocks even if mc is smaller
#define CHROMA4x4MC( width, height, me, x, y ) \
    if( chroma == CHROMA_444 ) \
    { \
        int mvx = (me).mv[0] + 4*2*x; \
        int mvy = (me).mv[1] + 4*2*y; \
        h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
        h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
    } \
    else \
    { \
        int offset = x + (2>>chroma_v_shift)*16*y; \
        int chroma_height = (2>>chroma_v_shift)*height; \
        h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
                         (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
        if( weight[1].weightfn ) \
            weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
        if( weight[2].weightfn ) \
            weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
    }

    if( size == PIXEL_4x4 )
    {
        x264_me_t *m = a->l0.me4x4[i8x8];
        CHROMA4x4MC( 2,2, m[0], 0,0 );
        CHROMA4x4MC( 2,2, m[1], 2,0 );
        CHROMA4x4MC( 2,2, m[2], 0,2 );
        CHROMA4x4MC( 2,2, m[3], 2,2 );
    }
    else if( size == PIXEL_8x4 )
    {
        x264_me_t *m = a->l0.me8x4[i8x8];
        CHROMA4x4MC( 4,2, m[0], 0,0 );
        CHROMA4x4MC( 4,2, m[1], 0,2 );
    }
    else
    {
        x264_me_t *m = a->l0.me4x8[i8x8];
        CHROMA4x4MC( 2,4, m[0], 0,0 );
        CHROMA4x4MC( 2,4, m[1], 2,0 );
    }
#undef CHROMA4x4MC

    int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
    int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
    return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}

static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
{
    if( CHROMA_FORMAT == CHROMA_444 )
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
    else
        return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
}

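/* Sub-8x8 searches: each 4x4 (and 8x4/4x8 below) block inherits the parent 8x8
 * partition's reference.  Only the first sub-block gets an extra MV candidate
 * (i_mvc is nonzero only for sub-block 0) -- the 8x8 MV here, and the first
 * 4x4 result for the 8x4/4x8 variants; the remaining blocks rely on their
 * spatial MV predictor from x264_mb_predict_mv alone. */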
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}

static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}

static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}

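/* Chroma cost of a bipredicted partition: motion-compensate the chroma of both
 * references, average them with the bipred weights, and compare the result
 * against the encoded frame with mbcmp. */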
static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
    ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
    ALIGNED_ARRAY_N( pixel,  bi, [2],[16*16] );
    int i_chroma_cost = 0;
    int chromapix = h->luma2chroma_pixel[i_pixel];

#define COST_BI_CHROMA( m0, m1, width, height ) \
{ \
    if( CHROMA444 ) \
    { \
        h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
    } \
    else \
    { \
        int v_shift = CHROMA_V_SHIFT; \
        int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
                         m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
                         m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
    } \
    h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
                  + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
}

    if( i_pixel == PIXEL_16x16 )
        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
    else if( i_pixel == PIXEL_16x8 )
        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
    else if( i_pixel == PIXEL_8x16 )
        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
    else
        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )

    return i_chroma_cost;
}

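/* Direct mode is costed by comparing fenc against fdec directly: the caller is
 * expected to have already run direct-mode MC into fdec, so no interpolation
 * happens here, only mbcmp.  With BSUB16x16 enabled the cost is also recorded
 * per 8x8 block, which the b8x8 analysis functions later reuse for
 * D_DIRECT_8x8. */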
static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    pixel *p_fenc = h->mb.pic.p_fenc[0];
    pixel *p_fdec = h->mb.pic.p_fdec[0];

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
    {
        int chromapix = h->luma2chroma_pixel[PIXEL_8x8];

        for( int i = 0; i < 4; i++ )
        {
            const int x = (i&1)*8;
            const int y = (i>>1)*8;
            a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
                                                              &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
            if( h->mb.b_chroma_me )
            {
                int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
                int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
                                       + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
            }
            a->i_cost16x16direct += a->i_cost8x8direct[i];

            /* mb type cost */
            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
        }
    }
    else
    {
        a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
        if( h->mb.b_chroma_me )
        {
            int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
            a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                                  + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
        }
    }
}

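/* B 16x16 analysis: search both lists, then build the BI candidate from the
 * two best single-list results.  A zero-MV BI candidate is also costed further
 * below, since in fades the weighted average of two references is often a
 * better match than anything the motion search finds. */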
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
    pixel *src0, *src1;
    intptr_t stride0 = 16, stride1 = 16;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[9][2] );
    int try_skip = a->b_try_skip;
    int list1_skipped = 0;
    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
    int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
                                (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};

    x264_me_t m;
    m.i_pixel = PIXEL_16x16;

    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* 16x16 Search on list 0 and list 1 */
    a->l0.me16x16.cost = INT_MAX;
    a->l1.me16x16.cost = INT_MAX;
    for( int l = 1; l >= 0; )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

        /* This loop is extremely munged in order to facilitate the following order of operations,
         * necessary for an efficient fast skip.
         * 1.  Search list1 ref0.
         * 2.  Search list0 ref0.
         * 3.  Try skip.
         * 4.  Search the rest of list0.
         * 5.  Go back and finish list1.
         */
        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
        {
            if( try_skip && l == 1 && i_ref > 0 )
            {
                list1_skipped = 1;
                break;
            }

            m.i_ref_cost = REF_COST( l, i_ref );

            /* search with ref */
            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );

            /* add ref cost */
            m.cost += m.i_ref_cost;

            if( m.cost < lX->me16x16.cost )
                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );

            /* save mv for predicting neighbors */
            CP32( lX->mvc[i_ref][0], m.mv );
            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );

            /* Fast skip detection. */
            if( i_ref == 0 && try_skip )
            {
                if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
                    abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
                {
                    try_skip = 0;
                }
                else if( !l )
                {
                    /* We already tested skip */
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, a );
                    return;
                }
            }
        }
        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
            break;
        if( list1_skipped && l == 0 )
            l = 1;
        else
            l--;
    }

    /* get cost of BI mode */
    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + ref_costs
                     + a->l0.bi16x16.cost_mv
                     + a->l1.bi16x16.cost_mv;

    if( h->mb.b_chroma_me )
        a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );

    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
    {
        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                   + ref_costs + l0_mv_cost + l1_mv_cost;

        if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
        {
            ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );

            if( CHROMA444 )
            {
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
            }
            else
            {
                ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
                int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
                int v_shift = CHROMA_V_SHIFT;

                if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
                {
                    int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
                {
                    int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

                cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
                        + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
            }
        }

        if( cost00 < a->i_cost16x16bi )
        {
            M32( a->l0.bi16x16.mv ) = 0;
            M32( a->l1.bi16x16.mv ) = 0;
            a->l0.bi16x16.cost_mv = l0_mv_cost;
            a->l1.bi16x16.cost_mv = l1_mv_cost;
            a->i_cost16x16bi = cost00;
        }
    }

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}

static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
{
    int x = 2*(i&1);
    int y = i&2;

    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
            break;
        case D_L0_8x4:
            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
            break;
        case D_L0_4x8:
            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
            break;
        case D_L0_4x4:
            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
            break;
        default:
            x264_log( h, X264_LOG_ERROR, "internal error\n" );
            break;
    }
}

static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
{
    int x = 2*(idx&1);
    int y = idx&2;
    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
}

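/* Cache the ref/mv of a B partition for whichever lists it actually uses; the
 * unused list is cleared (ref -1, zero mv, and zero mvd when b_mvd is set) so
 * that later prediction reads well-defined values. */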
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
    }

static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    int x = 2*(i&1);
    int y = i&2;
    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
    {
        x264_mb_load_mv_direct8x8( h, i );
        if( b_mvd )
        {
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        }
    }
    else
    {
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
    }
}
static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
}
static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
}
#undef CACHE_MV_BI

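/* B 8x8 analysis with mixed references: like the P version, each 8x8 partition
 * searches its own reference in each list; L0, L1, BI and direct are then
 * compared per partition via COPY2_IF_LT. */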
static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
    int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    #define CHECK_NEIGHBOUR(i)\
    {\
        int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
        if( ref > i_maxref[l] )\
            i_maxref[l] = ref;\
    }

    for( int l = 0; l < 2; l++ )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
        if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
        {
            i_maxref[l] = 0;
            CHECK_NEIGHBOUR(  -8 - 1 );
            CHECK_NEIGHBOUR(  -8 + 0 );
            CHECK_NEIGHBOUR(  -8 + 2 );
            CHECK_NEIGHBOUR(  -8 + 4 );
            CHECK_NEIGHBOUR(   0 - 1 );
            CHECK_NEIGHBOUR( 2*8 - 1 );
        }
    }

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi;
        intptr_t stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

            lX->me8x8[i].cost = INT_MAX;
            for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
            {
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );

                x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x8[i].cost )
                {
                    h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
                    a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
                }

                /* save mv for predicting other partitions within this MB */
                CP32( lX->mvc[i_ref][i+1], m.mv );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
                                a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
                                a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
                              h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );

        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
                       + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
                       + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    pixel **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
          h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {8,8};
        pixel *src[2];

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

            m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
            m->i_ref = lX->me16x16.i_ref;

            LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            a->i_satd8x8[l][i] = m->cost - m->cost_mv;
            m->cost += m->i_ref_cost;

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* save mv for predicting other partitions within this MB */
            CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, x264_weight_none );
            i_part_cost_bi += m->cost_mv + m->i_ref_cost;
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {16,16};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_16x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me16x8[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
                CP32( mvc[2], lX->mvc[i_ref][2*i+2] );

                x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me16x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
                                a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
                                a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
                               h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
                        + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
                        + a->l1.me16x8[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */

        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost16x8bi = COST_MAX;
            return;
        }

        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
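    /* The composite 16x8 (and 8x16) mb type is derived from the per-half
     * partition decisions: D_L0_8x8, D_L1_8x8 and D_BI_8x8 are spaced four
     * apart in the enum, so >>2 maps them to 0/1/2, and the two halves index
     * a 3x3 grid of B_<top>_<bottom> types starting at B_L0_L0. */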
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
}

static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x16;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me8x16[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][i+1] );
                CP32( mvc[2], lX->mvc[i_ref][i+3] );

                x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x16[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
                                a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
                                a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
                        + a->l1.me8x16[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;

        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost8x16bi = COST_MAX;
            return;
        }

        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}

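/* P-frame RD refinement: re-score the SATD-selected partition candidates with
 * full rate-distortion cost.  Candidates are only re-evaluated if their SATD
 * cost is within 5/4 of the best (16x16 uses a looser 3/2 bound), since
 * anything worse is very unlikely to win after RD. */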
static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
    int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
    {
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l0.i_cost16x8 < thresh )
    {
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 < thresh )
    {
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 < thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( int i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > sub8x8_thresh )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    if( subtype == btype )
                        continue;
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                if( h->mb.i_sub_partition[i] != btype )
                {
                    h->mb.i_sub_partition[i] = btype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                }
            }
        }
        else
            x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}

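/* B-frame RD refinement, same idea as the P version: each mode whose SATD cost
 * is within 17/16 (18/16 with psy-RD) of the best inter cost gets a full RD
 * evaluation; direct mode is done first because it reuses the MC result still
 * sitting in fdec. */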
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;
    /* L0 */
    if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* L1 */
    if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* BI */
    if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x8 */
    if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    /* 16x8 */
    if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x16 */
    if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}

static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    int i_biweight;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
            {
                i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
            }
            break;
        case D_16x8:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
                }
            break;
        case D_8x16:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
                }
            break;
        case D_8x8:
            for( int i = 0; i < 4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
                }
            break;
    }
}

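/* Transform size decision: approximate the 8x8 transform's coding cost with
 * SA8D and the 4x4 transform's with SATD over the motion-compensated
 * prediction error, and pick whichever is smaller.  The merged sa8d_satd
 * function computes both in one pass, packing SA8D into the low 32 bits of
 * the returned uint64 and SATD into the high 32 bits. */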
static inline void x264_mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
        int i_cost8 = 0, i_cost4 = 0;
        /* Not all platforms have a merged SATD function */
        if( h->pixf.sa8d_satd[PIXEL_16x16] )
        {
            uint64_t cost = 0;
            for( int p = 0; p < plane_count; p++ )
            {
                cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                        h->mb.pic.p_fdec[p], FDEC_STRIDE );
            }
            i_cost8 = (uint32_t)cost;
            i_cost4 = (uint32_t)(cost >> 32);
        }
        else
        {
            for( int p = 0; p < plane_count; p++ )
            {
                i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
                i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
            }
        }

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}

static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
    {
        uint32_t subpart_bak = M32( h->mb.i_sub_partition );
        /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
        if( h->mb.i_type == P_8x8 )
            M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
        else if( !x264_transform_allowed[h->mb.i_type] )
            return;

        x264_analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
        int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
        {
            h->mb.b_transform_8x8 ^= 1;
            M32( h->mb.i_sub_partition ) = subpart_bak;
        }
    }
}

/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
    int origcbp = h->mb.cbp[h->mb.i_mb_xy];

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from previous
         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
         * With psy-RD, allow 1 failure when moving quant away from previous quant,
         * allow 2 failures when moving quant towards previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;

        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
         * (up to a point) will too.  So, jump down to where the threshold will kick in
         * and check the QP there.  If the CBP is still empty, skip the main loop.
         * If it isn't empty, we would have ended up having to check this QP anyways,
         * so as long as we store it for later lookup, we lose nothing. */
        int already_checked_qp = -1;
        int already_checked_cost = COST_MAX;
        if( direction == -1 )
        {
            if( !origcbp )
            {
                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
                if( !h->mb.cbp[h->mb.i_mb_xy] )
                {
                    /* If our empty-CBP block is lower QP than the last QP,
                     * the last QP almost surely doesn't have a CBP either. */
                    if( h->mb.i_last_qp > h->mb.i_qp )
                        last_qp_tried = 1;
                    break;
                }
                already_checked_qp = h->mb.i_qp;
                h->mb.i_qp = orig_qp;
            }
        }

        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            if( h->mb.i_qp == already_checked_qp )
                cost = already_checked_cost;
            else
            {
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                cost = x264_rd_cost_mb( h, a->i_lambda2 );
                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
            }

            /* We can't assume that the costs are monotonic over QPs.
             * Tie case-as-failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check transform again; decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}

/*****************************************************************************
3044
* x264_macroblock_analyse:
3045
*****************************************************************************/
3046
void x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;

    h->mb.i_qp = x264_ratecontrol_mb_qp( h );
    /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
     * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
    if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
        h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;

    if( h->param.analyse.b_mb_info )
        h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
intra_analysis:
        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
        x264_mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            x264_intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;

        else if( analysis.i_mbrd >= 2 )
            x264_intra_rd_refine( h, &analysis );
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        analysis.b_try_skip = 0;
        if( analysis.b_force_intra )
        {
            if( !h->param.analyse.b_psy )
            {
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
        }
        else
        {
            /* Special fast-skip logic using information from mb_info. */
            if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
            {
                if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
                    h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
                {
                    h->mb.i_partition = D_16x16;
                    /* Use the P-SKIP MV if we can... */
                    if( !M32(h->mb.cache.pskip_mv) )
                    {
                        b_skip = 1;
                        h->mb.i_type = P_SKIP;
                    }
                    /* Otherwise, just force a 16x16 block. */
                    else
                    {
                        h->mb.i_type = P_L0;
                        analysis.l0.me16x16.i_ref = 0;
                        M32( analysis.l0.me16x16.mv ) = 0;
                    }
                    goto skip_analysis;
                }
                /* Reset the information accordingly */
                else if( h->param.analyse.b_mb_info_update )
                    h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
            }

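            /* With frame threading, the P-SKIP MV may point into a part of the
             * reference frame that other threads haven't reconstructed yet; such
             * a skip is unusable, so the skip shortcuts below are bypassed. */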
            int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
                b_skip = 1;
            /* Fast P_SKIP detection */
            else if( h->param.analyse.b_fast_pskip )
            {
                if( skip_invalid )
                    // FIXME don't need to check this if the reference frame is done
                    {}
                else if( h->param.analyse.i_subpel_refine >= 3 )
                    analysis.b_try_skip = 1;
                else if( h->mb.i_mb_type_left[0] == P_SKIP ||
                         h->mb.i_mb_type_top == P_SKIP ||
                         h->mb.i_mb_type_topleft == P_SKIP ||
                         h->mb.i_mb_type_topright == P_SKIP )
                    b_skip = x264_macroblock_probe_pskip( h );
            }
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
skip_analysis:
            /* Set up MVs for future predictors */
            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter, i_satd_intra;

            x264_mb_analyse_load_costs( h, &analysis );

            x264_mb_analyse_inter_p16x16( h, &analysis );

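            /* 16x16 analysis can itself promote the MB to P_SKIP (via b_try_skip);
             * if it did, just zero the MV predictors for the remaining references
             * and stop here. */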
            if( h->mb.i_type == P_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
                        int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
                        if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        x264_mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }

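            /* Rectangular partitions are gated by a cheap estimate: the SATD of the
             * two constituent 8x8 halves plus their averaged MV/ref cost.  The full
             * 16x8/8x16 search only runs when the estimate suggests it could win. */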
            /* Now do 16x8/8x16 */
            int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
            {
                int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
                                      + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
                                  + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }

            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                i_cost = 0;
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;

                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }

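            /* With chroma ME, the inter scores above already include chroma distortion,
             * so intra must be charged for chroma too to keep the comparison fair; for
             * non-4:4:4 the chroma intra score is computed first and subtracted from
             * the intra search's threshold. */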
            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_cost );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_cost );

            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.i_rd16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

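            /* Periodic Intra Refresh requires this MB to be intra.  If inter won the
             * mode decision anyway, apply "intra masking": encode the chosen inter MB,
             * copy the reconstruction into fenc, and redo the analysis as intra so the
             * forced intra block mimics the inter prediction as closely as possible. */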
            if( analysis.b_force_intra && !IS_INTRA(i_type) )
            {
                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order
                 * to make it appear as if it were an inter block. */
                x264_analyse_update_cache( h, &analysis );
                x264_macroblock_encode( h );
                for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
                    h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
                if( !CHROMA444 )
                {
                    int height = 16 >> CHROMA_V_SHIFT;
                    h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
                    h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
                }
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }

            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    x264_analyse_update_cache( h, &analysis );
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
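        /* direct=auto: evaluate both spatial and temporal direct prediction for this
         * MB and record, via i_direct_score, which flavor would have allowed a skip;
         * the per-frame totals are later used to pick the direct mode. */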
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
                    // 16x8
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;

                    // 8x16
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
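                /* Fold each half's best list choice (L0/L1/BI) into a tentative B
                 * macroblock type so its header cost can be charged to the estimate. */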
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];

                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }

            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }

            i_satd_inter = i_cost;

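            /* With mbrd, re-rank all candidate B modes by true RD cost, including the
             * skip candidate's SSD-based cost, instead of the SATD estimates above. */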
            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }

    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it.  Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
            h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

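    /* QPRD (i_mbrd == 3): re-decide this MB's QP by measuring the actual RD cost
     * at neighboring QPs (see x264_mb_analyse_qp_rd above). */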
    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

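    /* Sanity check for frame threading: every MV used here must stay within the
     * rows of its reference frame that other threads have already reconstructed. */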
#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"