/*****************************************************************************
 * macroblock.c: macroblock encoding
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Laurent Aimar <[email protected]>
 *          Loren Merritt <[email protected]>
 *          Fiona Glaser <[email protected]>
 *          Henrik Gramner <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
30
#include "macroblock.h"
31
32
/* These chroma DC functions don't have assembly versions and are only used here. */
33
34
#define ZIG(i,y,x) level[i] = dct[x*2+y];
35
static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
36
{
37
ZIG(0,0,0)
38
ZIG(1,0,1)
39
ZIG(2,1,0)
40
ZIG(3,1,1)
41
}
42
#undef ZIG
43
44
static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
45
{
46
level[0] = dct[0];
47
level[1] = dct[2];
48
level[2] = dct[1];
49
level[3] = dct[4];
50
level[4] = dct[6];
51
level[5] = dct[3];
52
level[6] = dct[5];
53
level[7] = dct[7];
54
}
55
56
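/* Note: both helpers implement the chroma DC coefficient scan (2x2 for
 * 4:2:0, 2x4 for 4:2:2); the index shuffle reorders the DC terms from the
 * layout produced by the forward DC transform into coding (scan) order. */
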
#define IDCT_DEQUANT_2X2_START \
    int d0 = dct[0] + dct[1]; \
    int d1 = dct[2] + dct[3]; \
    int d2 = dct[0] - dct[1]; \
    int d3 = dct[2] - dct[3]; \
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;

static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X2_START
    dct4x4[0][0] = (d0 + d1) * dmf >> 5;
    dct4x4[1][0] = (d0 - d1) * dmf >> 5;
    dct4x4[2][0] = (d2 + d3) * dmf >> 5;
    dct4x4[3][0] = (d2 - d3) * dmf >> 5;
}

static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X2_START
    dct[0] = (d0 + d1) * dmf >> 5;
    dct[1] = (d0 - d1) * dmf >> 5;
    dct[2] = (d2 + d3) * dmf >> 5;
    dct[3] = (d2 - d3) * dmf >> 5;
}
#undef IDCT_DEQUANT_2X2_START

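/* Note: the 2x2 chroma DC transform is a self-inverse Hadamard butterfly,
 * so the same add/subtract pattern serves both directions; ">> 5" folds the
 * H.264 chroma-DC dequant normalization into the inverse transform. */
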
static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
{
    int d0 = dct4x4[0][0] + dct4x4[1][0];
    int d1 = dct4x4[2][0] + dct4x4[3][0];
    int d2 = dct4x4[0][0] - dct4x4[1][0];
    int d3 = dct4x4[2][0] - dct4x4[3][0];
    d[0] = d0 + d1;
    d[2] = d2 + d3;
    d[1] = d0 - d1;
    d[3] = d2 - d3;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
}

static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
{
    if( WORD_SIZE == 8 )
    {
        for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
            if( M64( &v[i] ) )
                return 1;
    }
    else
    {
        for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
            if( M32( &v[i] ) )
                return 1;
    }
    return 0;
}

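/* Note: array_non_zero tests one machine word at a time via M64/M32, which
 * assumes the coefficient arrays are word-aligned and sized to a whole
 * number of words -- true for the ALIGNED_ARRAY buffers used here. */
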
/* All encoding functions must output the correct CBP and NNZ values.
 * The entropy coding functions will check CBP first, then NNZ, before
 * actually reading the DCT coefficients. NNZ still must be correct even
 * if CBP is zero because of the use of NNZ values for context selection.
 * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 * coding and stored as necessary. */

/* This means that decimation can be done merely by adjusting the CBP and NNZ
 * rather than memsetting the coefficients. */

static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
    pixel *p_src = h->mb.pic.p_fenc[p];
    pixel *p_dst = h->mb.pic.p_fdec[p];

    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
    ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );

    int nz, block_cbp = 0;
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
    int i_quant_cat = p ? CQM_4IC : CQM_4IY;
    int i_mode = h->mb.i_intra16x16_pred_mode;

    if( h->mb.b_lossless )
        x264_predict_lossless_16x16( h, p, i_mode );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );

    if( h->mb.b_lossless )
    {
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
            block_cbp |= nz;
        }
        h->mb.i_cbp_luma |= block_cbp * 0xf;
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
        return;
    }

    CLEAR_16x16_NNZ( p );

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    if( h->mb.b_noise_reduction )
        for( int idx = 0; idx < 16; idx++ )
            h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );

    for( int idx = 0; idx < 16; idx++ )
    {
        dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
        dct4x4[idx][0] = 0;
    }

    if( h->mb.b_trellis )
    {
        for( int idx = 0; idx < 16; idx++ )
            if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
            {
                block_cbp = 0xf;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
                if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
                h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
            }
    }
    else
    {
        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
        {
            nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
            if( nz )
            {
                block_cbp = 0xf;
                FOREACH_BIT( idx, i8x8*4, nz )
                {
                    h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
                    h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
                    if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
                }
            }
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
    if( decimate_score < 6 )
    {
        CLEAR_16x16_NNZ( p );
        block_cbp = 0;
    }
    else
        h->mb.i_cbp_luma |= block_cbp;

    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp );  /* XXX not inversed */
        if( block_cbp )
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( block_cbp )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}

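/* Note on the decimate_score threshold above: decimate_score15/16 return a
 * per-block cost that grows with the number and magnitude of nonzero
 * coefficients (any level > 1 makes the block effectively undecimatable),
 * so "score < 6" zeroes only blocks whose few small coefficients would cost
 * more bits than they are worth; initializing the score to 9 when decimation
 * is disabled keeps it permanently over the threshold. */
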
/* Round down coefficients losslessly in DC-only chroma blocks.
 * Unlike luma blocks, this can't be done with a lookup table or
 * other shortcut technique because of the interdependencies
 * between the coefficients due to the chroma DC transform. */
static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
{
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;

    /* If the QP is too high, there's no benefit to rounding optimization. */
    if( dmf > 32*64 )
        return 1;

    if( chroma422 )
        return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
    else
        return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
}

static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
    ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
    h->mb.i_cbp_chroma = 0;
    h->nr_count[2] += h->mb.b_noise_reduction * 4;

    M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
    if( chroma422 )
    {
        M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
    }

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
    {
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd[2];
        int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;

        int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
        if( score < thresh*4 )
            score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
        if( score < thresh*4 )
        {
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;

            for( int ch = 0; ch < 2; ch++ )
            {
                if( ssd[ch] > thresh )
                {
                    pixel *p_src = h->mb.pic.p_fenc[1+ch];
                    pixel *p_dst = h->mb.pic.p_fdec[1+ch];

                    if( chroma422 )
                        /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
                        h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                    else
                        h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );

                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
                    else
                    {
                        nz_dc = 0;
                        for( int i = 0; i <= chroma422; i++ )
                            nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                             h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
                    }

                    if( nz_dc )
                    {
                        if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
                        if( chroma422 )
                        {
                            zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                            h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
                        }
                        else
                        {
                            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                            idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
                        }

                        for( int i = 0; i <= chroma422; i++ )
                            h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }

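    /* Full chroma encode: reached when early termination is unavailable or
     * the combined variance score hit thresh*4. The threshold scales with
     * lambda2, so higher QPs allow skipping proportionally larger residuals. */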
    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = b_decimate ? 0 : 7;
        int nz_ac = 0;

        ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );

        if( h->mb.b_lossless )
        {
            static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };

            for( int i = 0; i < (chroma422?8:4); i++ )
            {
                int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
                int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
                                           &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
            continue;
        }

        for( int i = 0; i <= chroma422; i++ )
            h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

        if( h->mb.b_noise_reduction )
            for( int i = 0; i < (chroma422?8:4); i++ )
                h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );

        if( chroma422 )
            h->dctf.dct2x4dc( dct_dc, dct4x4 );
        else
            dct2x2dc( dct_dc, dct4x4 );

        /* calculate dct coeffs */
        for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ )
        {
            if( h->mb.b_trellis )
            {
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) )
                    {
                        int idx = 16+ch*16+i8x8*8+i4x4;
                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
                        if( i_decimate_score < 7 )
                            i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
                        h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
                        nz_ac = 1;
                    }
                }
            }
            else
            {
                nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp],
                                            h->quant4_bias[CQM_4IC+b_inter][i_qp] );
                nz_ac |= nz;

                FOREACH_BIT( i4x4, 0, nz )
                {
                    int idx = 16+ch*16+i8x8*8+i4x4;

                    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
                    h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
                    if( i_decimate_score < 7 )
                        i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
                    h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
                }
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
        else
        {
            nz_dc = 0;
            for( int i = 0; i <= chroma422; i++ )
                nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                 h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
        }

        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;

        if( i_decimate_score < 7 || !nz_ac )
        {
            /* Decimate the block */
            M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
            if( chroma422 )
            {
                M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
            }

            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
                continue;
            }
            /* DC-only */
            if( chroma422 )
            {
                zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
            }
            else
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;

            if( nz_dc )
            {
                if( chroma422 )
                {
                    zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                    h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
                }
                else
                {
                    zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                    idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
                }
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
    h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
                           h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
}

void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
{
    if( CHROMA_FORMAT == CHROMA_420 )
        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
    else
        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
}

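/* x264_mb_encode_chroma_internal is ALWAYS_INLINE with a constant chroma422
 * argument, so the two calls above compile into separate 4:2:0 and 4:2:2
 * specializations with all per-format branches folded away. */
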
static void x264_macroblock_encode_skip( x264_t *h )
{
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
    if( CHROMA_FORMAT >= CHROMA_422 )
    {
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
    }
    h->mb.i_cbp_luma = 0;
    h->mb.i_cbp_chroma = 0;
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}

/*****************************************************************************
 * Intra prediction for predictive lossless mode.
 *****************************************************************************/

void x264_predict_lossless_chroma( x264_t *h, int i_mode )
{
    int height = 16 >> CHROMA_V_SHIFT;
    if( i_mode == I_PRED_CHROMA_V )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
        memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
        memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
    }
    else if( i_mode == I_PRED_CHROMA_H )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
        x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
        x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
        if( CHROMA_FORMAT == CHROMA_422 )
        {
            x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
            x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
        }
    }
    else
    {
        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
    }
}

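/* Note: the V/H cases in these lossless predictors copy source (fenc) pixels
 * rather than running the normal predictors: in lossless mode reconstruction
 * is bit-exact, so neighboring source pixels equal the decoder's
 * reconstructed ones. */
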
void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
{
    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
    pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;

    if( i_mode == I_PRED_4x4_V )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
    else if( i_mode == I_PRED_4x4_H )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
    else
        h->predict_4x4[i_mode]( p_dst );
}

void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
{
    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
    pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;

    if( i_mode == I_PRED_8x8_V )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
    else if( i_mode == I_PRED_8x8_H )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
    else
        h->predict_8x8[i_mode]( p_dst, edge );
}

void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
{
    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
    if( i_mode == I_PRED_16x16_V )
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
    else if( i_mode == I_PRED_16x16_H )
        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
}

/*****************************************************************************
 * x264_macroblock_encode:
 *****************************************************************************/
static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
{
    int i_qp = h->mb.i_qp;
    int b_decimate = h->mb.b_dct_decimate;
    int b_force_no_skip = 0;
    int nz;
    h->mb.i_cbp_luma = 0;
    for( int p = 0; p < plane_count; p++ )
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;

    if( h->mb.i_type == I_PCM )
    {
        /* if PCM is chosen, we need to store reconstructed frame data */
        for( int p = 0; p < plane_count; p++ )
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
        if( chroma )
        {
            int height = 16 >> CHROMA_V_SHIFT;
            h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
            h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
        }
        return;
    }

    if( !h->mb.b_allow_skip )
    {
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    if( h->mb.i_type == P_SKIP )
    {
        /* don't do pskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
        {
            int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                                  h->mb.mv_min[0], h->mb.mv_max[0] );
            int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                                  h->mb.mv_min[1], h->mb.mv_max[1] );

            for( int p = 0; p < plane_count; p++ )
                h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
                               &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                               mvx, mvy, 16, 16, &h->sh.weight[0][p] );

            if( chroma )
            {
                int v_shift = CHROMA_V_SHIFT;
                int height = 16 >> v_shift;

                /* Special case for mv0, which is (of course) very common in P-skip mode. */
                if( mvx | mvy )
                    h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                     h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                     mvx, 2*mvy>>v_shift, 8, height );
                else
                    h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                         h->mb.pic.i_stride[1], height );

                if( h->sh.weight[0][1].weightfn )
                    h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       &h->sh.weight[0][1], height );
                if( h->sh.weight[0][2].weightfn )
                    h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       &h->sh.weight[0][2], height );
            }
        }

        x264_macroblock_encode_skip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        h->mb.b_transform_8x8 = 0;

        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            x264_mb_encode_i16x16( h, p, i_qp );
    }
    else if( h->mb.i_type == I_8x8 )
    {
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
        {
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0; i < 4; i++ )
            {
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
                x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
            }
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
        {
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0; i < 16; i++ )
            {
                pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

                if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                    /* emulate missing topright samples */
                    MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );

                x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
            }
        }
    }
    else /* Inter MB */
    {
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless )
        {
            if( h->mb.b_transform_8x8 )
                for( int p = 0; p < plane_count; p++ )
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        int x = i8x8&1;
                        int y = i8x8>>1;
                        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
                                                 h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
                        STORE_8x8_NNZ( p, i8x8, nz );
                        h->mb.i_cbp_luma |= nz << i8x8;
                    }
            else
                for( int p = 0; p < plane_count; p++ )
                    for( int i4x4 = 0; i4x4 < 16; i4x4++ )
                    {
                        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
                                                 h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
                                                 h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
                        h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
                        h->mb.i_cbp_luma |= nz << (i4x4>>2);
                    }
        }
        else if( h->mb.b_transform_8x8 )
        {
            ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
            b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC

            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                CLEAR_16x16_NNZ( p );
                h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
                h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;

                int plane_cbp = 0;
                for( int idx = 0; idx < 4; idx++ )
                {
                    nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );

                    if( nz )
                    {
                        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
                        if( b_decimate )
                        {
                            int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
                            i_decimate_mb += i_decimate_8x8;
                            if( i_decimate_8x8 >= 4 )
                                plane_cbp |= 1<<idx;
                        }
                        else
                            plane_cbp |= 1<<idx;
                    }
                }

                if( i_decimate_mb >= 6 || !b_decimate )
                {
                    h->mb.i_cbp_luma |= plane_cbp;
                    FOREACH_BIT( idx, 0, plane_cbp )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
                        STORE_8x8_NNZ( p, idx, 1 );
                    }
                }
            }
        }
        else
        {
            ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                CLEAR_16x16_NNZ( p );
                h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );

                if( h->mb.b_noise_reduction )
                {
                    h->nr_count[0+!!p*2] += 16;
                    for( int idx = 0; idx < 16; idx++ )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
                }

                int plane_cbp = 0;
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    int i_decimate_8x8 = b_decimate ? 0 : 6;
                    int nnz8x8 = 0;
                    if( h->mb.b_trellis )
                    {
                        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                        {
                            int idx = i8x8*4+i4x4;
                            if( x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) )
                            {
                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
                                if( i_decimate_8x8 < 6 )
                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
                                nnz8x8 = 1;
                            }
                        }
                    }
                    else
                    {
                        nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
                        if( nz )
                        {
                            FOREACH_BIT( idx, i8x8*4, nz )
                            {
                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
                                if( i_decimate_8x8 < 6 )
                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
                            }
                        }
                    }
                    if( nnz8x8 )
                    {
                        i_decimate_mb += i_decimate_8x8;
                        if( i_decimate_8x8 < 4 )
                            STORE_8x8_NNZ( p, i8x8, 0 );
                        else
                            plane_cbp |= 1<<i8x8;
                    }
                }

                if( i_decimate_mb < 6 )
                {
                    plane_cbp = 0;
                    CLEAR_16x16_NNZ( p );
                }
                else
                {
                    h->mb.i_cbp_luma |= plane_cbp;
                    FOREACH_BIT( i8x8, 0, plane_cbp )
                    {
                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    }
                }
            }
        }
    }

    /* encode chroma */
    if( chroma )
    {
        if( IS_INTRA( h->mb.i_type ) )
        {
            int i_mode = h->mb.i_chroma_pred_mode;
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }
        }

        /* encode the 8x8 blocks */
        x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
    }
    else
        h->mb.i_cbp_chroma = 0;

    /* store cbp */
    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
    if( h->param.b_cabac )
        cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
    h->mb.cbp[h->mb.i_mb_xy] = cbp;

    /* Check for P_SKIP
     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
     * (if multiple mv give same result) */
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

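/* Note: the cbp value stored above packs the four luma 8x8 coded flags in
 * bits 0-3, the chroma CBP (0/1/2) in bits 4-5, and, for CABAC, the luma
 * and chroma DC NNZ flags in bits 8-10 for later context derivation. */
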
void x264_macroblock_encode( x264_t *h )
{
    if( CHROMA444 )
        x264_macroblock_encode_internal( h, 3, 0 );
    else
        x264_macroblock_encode_internal( h, 1, 1 );
}

/*****************************************************************************
 * x264_macroblock_probe_skip:
 *  Check if the current MB could be encoded as a [PB]_SKIP
 *****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
    ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
    ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );
    int i_qp = h->mb.i_qp;

    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
    {
        int quant_cat = p ? CQM_4PC : CQM_4PY;
        if( !b_bidir )
        {
            /* Get the MV */
            mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
            mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

            /* Motion compensation */
            h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
                           &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                           mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
        }

        for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
        {
            int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
            int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;

            h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
                                        h->mb.pic.p_fdec[p] + fdec_offset );

            if( h->mb.b_noise_reduction )
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );

            int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
            FOREACH_BIT( idx, 0, nz )
            {
                h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
                i_decimate_mb += h->quantf.decimate_score16( dctscan );
                if( i_decimate_mb >= 6 )
                    return 0;
            }
        }
    }

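    /* Luma survived the decimate test; chroma (below, for 4:2:0/4:2:2) is
     * screened by SSD first since an actual termination there is rare. */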
    if( chroma == CHROMA_420 || chroma == CHROMA_422 )
    {
        i_qp = h->mb.i_chroma_qp;
        int chroma422 = chroma == CHROMA_422;
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd;
        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );

        if( !b_bidir )
        {
            /* Special case for mv0, which is (of course) very common in P-skip mode. */
            if( M32( mvp ) )
                h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                 h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
            else
                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
        }

        for( int ch = 0; ch < 2; ch++ )
        {
            pixel *p_src = h->mb.pic.p_fenc[1+ch];
            pixel *p_dst = h->mb.pic.p_fdec[1+ch];

            if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );

            /* there is almost never a termination during chroma, but we can't avoid the check entirely */
            /* so instead we check SSD and skip the actual check if the score is low enough. */
            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
            if( ssd < thresh )
                continue;

            /* The vast majority of chroma checks will terminate during the DC check or the higher
             * threshold check, so we can save time by doing a DC-only DCT. */
            if( h->mb.b_noise_reduction )
            {
                for( int i = 0; i <= chroma422; i++ )
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                {
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    dct_dc[i4x4] = dct4x4[i4x4][0];
                    dct4x4[i4x4][0] = 0;
                }
            }
            else
            {
                if( chroma422 )
                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                else
                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
            }

            for( int i = 0; i <= chroma422; i++ )
                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
                    return 0;

            /* If there wasn't a termination in DC, we can check against a much higher threshold. */
            if( ssd < thresh*4 )
                continue;

            if( !h->mb.b_noise_reduction )
                for( int i = 0; i <= chroma422; i++ )
                {
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
                    dct4x4[i*4+0][0] = 0;
                    dct4x4[i*4+1][0] = 0;
                    dct4x4[i*4+2][0] = 0;
                    dct4x4[i*4+3][0] = 0;
                }

            /* calculate dct coeffs */
            for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ )
            {
                int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
                FOREACH_BIT( idx, i8x8*4, nz )
                {
                    h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
                    i_decimate_mb += h->quantf.decimate_score15( dctscan );
                    if( i_decimate_mb >= 7 )
                        return 0;
                }
            }
        }
    }

    h->mb.b_skip_mc = 1;
    return 1;
}

int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
    if( CHROMA_FORMAT == CHROMA_444 )
        return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
    else
        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
}

/****************************************************************************
 * DCT-domain noise reduction / adaptive deadzone
 * from libavcodec
 ****************************************************************************/

void x264_noise_reduction_update( x264_t *h )
{
    h->nr_offset = h->nr_offset_denoise;
    h->nr_residual_sum = h->nr_residual_sum_buf[0];
    h->nr_count = h->nr_count_buf[0];
    for( int cat = 0; cat < 3 + CHROMA444; cat++ )
    {
        int dct8x8 = cat&1;
        int size = dct8x8 ? 64 : 16;
        const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;

        if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
        {
            for( int i = 0; i < size; i++ )
                h->nr_residual_sum[cat][i] >>= 1;
            h->nr_count[cat] >>= 1;
        }

        for( int i = 0; i < size; i++ )
            h->nr_offset[cat][i] =
                ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                 + h->nr_residual_sum[cat][i]/2)
              / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);

        /* Don't denoise DC coefficients */
        h->nr_offset[cat][0] = 0;
    }
}

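/* Note: the offset formula above works out to, roughly,
 *   offset[i] ~ strength * count / (average residual energy of coeff i),
 * i.e. coefficients that are usually near zero receive a larger deadzone
 * while habitually significant ones are left almost untouched; the halving
 * step keeps both accumulators bounded and the averages adaptive. */
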
/*****************************************************************************
 * RD only; 4 calls to this do not make up for one macroblock_encode.
 * doesn't transform chroma dc.
 *****************************************************************************/
static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
{
    int b_decimate = h->mb.b_dct_decimate;
    int i_qp = h->mb.i_qp;
    int x = i8&1;
    int y = i8>>1;
    int nz;
    int chroma422 = chroma == CHROMA_422;

    h->mb.i_cbp_chroma = 0;
    h->mb.i_cbp_luma &= ~(1 << i8);

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        for( int p = 0; p < plane_count; p++ )
        {
            pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
            pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
            int nnz8x8 = 0;
            if( h->mb.b_transform_8x8 )
            {
                nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, nnz8x8 );
            }
            else
            {
                for( int i4 = i8*4; i4 < i8*4+4; i4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
                                             h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
                                             h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
                    nnz8x8 |= nz;
                }
            }
            h->mb.i_cbp_luma |= nnz8x8 << i8;
        }
        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            for( int ch = 0; ch < 2; ch++ )
            {
                dctcoef dc;
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
                    nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                int quant_cat = p ? CQM_8PC : CQM_8PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );

                h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
                if( nnz8x8 )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );

                    if( b_decimate && !h->mb.b_trellis )
                        nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );

                    if( nnz8x8 )
                    {
                        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
                        h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                        STORE_8x8_NNZ( p, i8, 1 );
                        h->mb.i_cbp_luma |= 1 << i8;
                    }
                    else
                        STORE_8x8_NNZ( p, i8, 0 );
                }
                else
                    STORE_8x8_NNZ( p, i8, 0 );
            }
        }
        else
        {
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                int quant_cat = p ? CQM_4PC : CQM_4PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                int i_decimate_8x8 = b_decimate ? 0 : 4;
                ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
                int nnz8x8 = 0;

                h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, 0 );

                if( h->mb.b_noise_reduction )
                    for( int idx = 0; idx < 4; idx++ )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );

                if( h->mb.b_trellis )
                {
                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                    {
                        if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) )
                        {
                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
                            if( i_decimate_8x8 < 4 )
                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
                            nnz8x8 = 1;
                        }
                    }
                }
                else
                {
                    nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
                    if( nz )
                    {
                        FOREACH_BIT( i4x4, 0, nz )
                        {
                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
                            if( i_decimate_8x8 < 4 )
                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
                        }
                    }
                }
                if( nnz8x8 )
                {
                    /* decimate this 8x8 block */
                    if( i_decimate_8x8 < 4 )
                        STORE_8x8_NNZ( p, i8, 0 );
                    else
                    {
                        h->dctf.add8x8_idct( p_fdec, dct4x4 );
                        h->mb.i_cbp_luma |= 1 << i8;
                    }
                }
            }
        }

        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            i_qp = h->mb.i_chroma_qp;
            for( int ch = 0; ch < 2; ch++ )
            {
                ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    dct4x4[i4x4][0] = 0;

                    if( h->mb.b_trellis )
                        nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
                    else
                        nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );

                    int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
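                    /* ((5*i8) & 0x09) == 8*(i8>>1) + (i8&1): it maps i8 to
                     * the base chroma block offset (0,1,8,9) in x264's
                     * 4:2:2 block numbering. */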
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
                        h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
                    }
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
}

void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
    if( CHROMA444 )
        x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
    else
        x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
}

/*****************************************************************************
 * RD only, luma only (for 4:2:0)
 *****************************************************************************/
static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
{
    int i_qp = h->mb.i_qp;

    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
    {
        int quant_cat = p ? CQM_4PC : CQM_4PY;
        pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
        pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
        int nz;

        /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */

        if( h->mb.b_lossless )
        {
            nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
        }
        else
        {
            ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
            nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
            if( nz )
            {
                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
                h->dctf.add4x4_idct( p_fdec, dct4x4 );
            }
        }
    }
}

void x264_macroblock_encode_p4x4( x264_t *h, int i8 )
{
    if( CHROMA444 )
        x264_macroblock_encode_p4x4_internal( h, i8, 3 );
    else
        x264_macroblock_encode_p4x4_internal( h, i8, 1 );
}