/*****************************************************************************
 * dct.c: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Loren Merritt <[email protected]>
 *          Laurent Aimar <[email protected]>
 *          Henrik Gramner <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/
27
#include "common.h"
#if HAVE_MMX
#   include "x86/dct.h"
#endif
#if ARCH_PPC
#   include "ppc/dct.h"
#endif
#if ARCH_ARM
#   include "arm/dct.h"
#endif
#if ARCH_AARCH64
#   include "aarch64/dct.h"
#endif
#if ARCH_MIPS
#   include "mips/dct.h"
#endif
45
/* the inverse of the scaling factors introduced by 8x8 fdct */
46
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
47
#define W(i) (i==0 ? FIX8(1.0000) :\
48
i==1 ? FIX8(0.8859) :\
49
i==2 ? FIX8(1.6000) :\
50
i==3 ? FIX8(0.9415) :\
51
i==4 ? FIX8(1.2651) :\
52
i==5 ? FIX8(1.1910) :0)
53
const uint32_t x264_dct8_weight_tab[64] = {
54
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
55
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
56
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
57
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
58
59
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
60
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
61
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
62
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
63
};
64
#undef W
65
66
#define W(i) (i==0 ? FIX8(1.76777) :\
67
i==1 ? FIX8(1.11803) :\
68
i==2 ? FIX8(0.70711) :0)
69
const uint32_t x264_dct4_weight_tab[16] = {
70
W(0), W(1), W(0), W(1),
71
W(1), W(2), W(1), W(2),
72
W(0), W(1), W(0), W(1),
73
W(1), W(2), W(1), W(2)
74
};
75
#undef W
76
77
/* inverse squared */
78
#define W(i) (i==0 ? FIX8(3.125) :\
79
i==1 ? FIX8(1.25) :\
80
i==2 ? FIX8(0.5) :0)
81
const uint32_t x264_dct4_weight2_tab[16] = {
82
W(0), W(1), W(0), W(1),
83
W(1), W(2), W(1), W(2),
84
W(0), W(1), W(0), W(1),
85
W(1), W(2), W(1), W(2)
86
};
87
#undef W
88
89
#define W(i) (i==0 ? FIX8(1.00000) :\
90
i==1 ? FIX8(0.78487) :\
91
i==2 ? FIX8(2.56132) :\
92
i==3 ? FIX8(0.88637) :\
93
i==4 ? FIX8(1.60040) :\
94
i==5 ? FIX8(1.41850) :0)
95
const uint32_t x264_dct8_weight2_tab[64] = {
96
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
97
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
98
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
99
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
100
101
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
102
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
103
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
104
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
105
};
106
#undef W
107
108
109
/* In-place 4x4 Hadamard transform of the luma DC coefficients,
 * with a +1 rounding and >>1 scale on the second pass. */
static void dct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    /* horizontal pass, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = d[i*4+0] + d[i*4+1];
        int diff01 = d[i*4+0] - d[i*4+1];
        int sum23  = d[i*4+2] + d[i*4+3];
        int diff23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = sum01 + sum23;
        tmp[1*4+i] = sum01 - sum23;
        tmp[2*4+i] = diff01 - diff23;
        tmp[3*4+i] = diff01 + diff23;
    }

    /* vertical pass with rounding */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = tmp[i*4+0] + tmp[i*4+1];
        int diff01 = tmp[i*4+0] - tmp[i*4+1];
        int sum23  = tmp[i*4+2] + tmp[i*4+3];
        int diff23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = ( sum01  + sum23  + 1 ) >> 1;
        d[i*4+1] = ( sum01  - sum23  + 1 ) >> 1;
        d[i*4+2] = ( diff01 - diff23 + 1 ) >> 1;
        d[i*4+3] = ( diff01 + diff23 + 1 ) >> 1;
    }
}
139
140
/* In-place inverse 4x4 Hadamard transform of the luma DC coefficients
 * (no rounding/scaling; the scale is folded into dequantization). */
static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    /* horizontal pass, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = d[i*4+0] + d[i*4+1];
        int diff01 = d[i*4+0] - d[i*4+1];
        int sum23  = d[i*4+2] + d[i*4+3];
        int diff23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = sum01 + sum23;
        tmp[1*4+i] = sum01 - sum23;
        tmp[2*4+i] = diff01 - diff23;
        tmp[3*4+i] = diff01 + diff23;
    }

    /* vertical pass */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = tmp[i*4+0] + tmp[i*4+1];
        int diff01 = tmp[i*4+0] - tmp[i*4+1];
        int sum23  = tmp[i*4+2] + tmp[i*4+3];
        int diff23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = sum01 + sum23;
        d[i*4+1] = sum01 - sum23;
        d[i*4+2] = diff01 - diff23;
        d[i*4+3] = diff01 + diff23;
    }
}
170
171
/* 2x4 Hadamard transform of the DC coefficients of eight 4x4 blocks
 * (used for 4:2:2 chroma).  The DCs are extracted into dct[] and
 * zeroed in the source blocks. */
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int s[4], d[4];

    /* first butterfly over DC pairs; clear each DC once consumed */
    for( int i = 0; i < 4; i++ )
    {
        s[i] = dct4x4[2*i][0] + dct4x4[2*i+1][0];
        d[i] = dct4x4[2*i][0] - dct4x4[2*i+1][0];
        dct4x4[2*i  ][0] = 0;
        dct4x4[2*i+1][0] = 0;
    }

    int b0 = s[0] + s[1];
    int b1 = s[2] + s[3];
    int b2 = d[0] + d[1];
    int b3 = d[2] + d[3];
    int b4 = s[0] - s[1];
    int b5 = s[2] - s[3];
    int b6 = d[0] - d[1];
    int b7 = d[2] - d[3];

    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
}
206
207
/* diff = pix1 - pix2 over an i_size x i_size block; each input has its
 * own stride, the output is densely packed row-major. */
static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int row = 0; row < i_size; row++ )
        for( int col = 0; col < i_size; col++ )
            diff[row*i_size + col] = pix1[row*i_pix1 + col] - pix2[row*i_pix2 + col];
}
218
219
/* 4x4 forward integer DCT of the residual pix1 - pix2. */
static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef resid[16];
    dctcoef tmp[16];

    pixel_sub_wxh( resid, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    /* horizontal 1D transform, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int s0 = resid[i*4+0] + resid[i*4+3];
        int s1 = resid[i*4+1] + resid[i*4+2];
        int d0 = resid[i*4+0] - resid[i*4+3];
        int d1 = resid[i*4+1] - resid[i*4+2];

        tmp[0*4+i] = s0 + s1;
        tmp[1*4+i] = 2*d0 + d1;
        tmp[2*4+i] = s0 - s1;
        tmp[3*4+i] = d0 - 2*d1;
    }

    /* vertical 1D transform */
    for( int i = 0; i < 4; i++ )
    {
        int s0 = tmp[i*4+0] + tmp[i*4+3];
        int s1 = tmp[i*4+1] + tmp[i*4+2];
        int d0 = tmp[i*4+0] - tmp[i*4+3];
        int d1 = tmp[i*4+1] - tmp[i*4+2];

        dct[i*4+0] = s0 + s1;
        dct[i*4+1] = 2*d0 + d1;
        dct[i*4+2] = s0 - s1;
        dct[i*4+3] = d0 - 2*d1;
    }
}
252
253
/* 8x8 residual DCT as four 4x4 DCTs in raster order. */
static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        sub4x4_dct( dct[i], &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }
}
260
261
/* 16x16 residual DCT as four 8x8 quadrants in raster order. */
static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        sub8x8_dct( &dct[i*4], &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }
}
268
269
/* Sum of the 4x4 residual block == its (unnormalized) DC coefficient. */
static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
    int sum = 0;
    for( int y = 0; y < 4; y++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
        for( int x = 0; x < 4; x++ )
            sum += pix1[x] - pix2[x];
    return sum;
}
277
278
/* DC-only 8x8 transform: per-4x4 residual sums followed by a
 * 2x2 Hadamard over the four DCs. */
static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        dct[i] = sub4x4_dct_dc( &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }

    /* 2x2 DC transform */
    int s0 = dct[0] + dct[1];
    int s1 = dct[2] + dct[3];
    int d0 = dct[0] - dct[1];
    int d1 = dct[2] - dct[3];
    dct[0] = s0 + s1;
    dct[1] = s0 - s1;
    dct[2] = d0 + d1;
    dct[3] = d0 - d1;
}
295
296
/* DC-only 8x16 transform (4:2:2 chroma): per-4x4 residual sums
 * followed by a 2x4 Hadamard over the eight DCs. */
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
{
    int a[8], b[8], c[8];

    /* eight 4x4 DCs in raster order (two columns, four rows) */
    for( int i = 0; i < 8; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        a[i] = sub4x4_dct_dc( &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }

    /* 2x4 DC transform: three butterfly stages */
    for( int i = 0; i < 4; i++ )
    {
        b[i]   = a[2*i] + a[2*i+1];
        b[i+4] = a[2*i] - a[2*i+1];
    }
    for( int i = 0; i < 4; i++ )
    {
        c[i]   = b[2*i] + b[2*i+1];
        c[i+4] = b[2*i] - b[2*i+1];
    }
    dct[0] = c[0] + c[1];
    dct[1] = c[2] + c[3];
    dct[2] = c[0] - c[1];
    dct[3] = c[2] - c[3];
    dct[4] = c[4] - c[5];
    dct[5] = c[6] - c[7];
    dct[6] = c[4] + c[5];
    dct[7] = c[6] + c[7];
}
333
334
/* 4x4 inverse integer DCT; the (rounded, >>6-scaled) result is added
 * to the destination pixels with clipping. */
static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
    dctcoef resid[16];
    dctcoef tmp[16];

    /* vertical 1D inverse transform, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int e0 = dct[0*4+i] + dct[2*4+i];
        int e1 = dct[0*4+i] - dct[2*4+i];
        int o0 = dct[1*4+i] + (dct[3*4+i]>>1);
        int o1 = (dct[1*4+i]>>1) - dct[3*4+i];

        tmp[i*4+0] = e0 + o0;
        tmp[i*4+1] = e1 + o1;
        tmp[i*4+2] = e1 - o1;
        tmp[i*4+3] = e0 - o0;
    }

    /* horizontal 1D inverse transform with rounding */
    for( int i = 0; i < 4; i++ )
    {
        int e0 = tmp[0*4+i] + tmp[2*4+i];
        int e1 = tmp[0*4+i] - tmp[2*4+i];
        int o0 = tmp[1*4+i] + (tmp[3*4+i]>>1);
        int o1 = (tmp[1*4+i]>>1) - tmp[3*4+i];

        resid[0*4+i] = ( e0 + o0 + 32 ) >> 6;
        resid[1*4+i] = ( e1 + o1 + 32 ) >> 6;
        resid[2*4+i] = ( e1 - o1 + 32 ) >> 6;
        resid[3*4+i] = ( e0 - o0 + 32 ) >> 6;
    }

    /* add to prediction and clip */
    for( int y = 0; y < 4; y++, p_dst += FDEC_STRIDE )
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_pixel( p_dst[x] + resid[y*4+x] );
}
373
374
/* 8x8 inverse DCT as four 4x4 iDCTs in raster order. */
static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        add4x4_idct( &p_dst[y*FDEC_STRIDE+x], dct[i] );
    }
}
381
382
/* 16x16 inverse DCT as four 8x8 quadrants in raster order. */
static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        add8x8_idct( &p_dst[y*FDEC_STRIDE+x], &dct[i*4] );
    }
}
389
390
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

/* One 8-point forward transform pass; SRC()/DST() select the axis. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
420
421
/* 8x8 forward DCT of the residual pix1 - pix2. */
static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    /* vertical pass, in place */
#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* horizontal pass, transposed into dct */
#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}
441
442
/* 16x16 residual transform as four 8x8 DCTs in raster order. */
static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        sub8x8_dct8( dct[i], &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }
}
449
450
/* One 8-point inverse transform pass; SRC() selects the axis and
 * DST(i, expr) stores (so the final pass can clip/scale inline). */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
476
477
/* 8x8 inverse DCT; the >>6-scaled result is added to dst with clipping. */
static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

    /* vertical pass, in place */
#define SRC(x) dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

    /* horizontal pass, scaled and written straight into the frame */
#define SRC(x) dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) )
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}
495
496
/* 16x16 inverse 8x8-DCT as four quadrants in raster order. */
static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        add8x8_idct8( &dst[y*FDEC_STRIDE+x], dct[i] );
    }
}
503
504
/* Add a single DC coefficient to a 4x4 pixel block, with the same
 * +32 rounding and >>6 scale as the full iDCT, clipping each pixel.
 * Note: specifier order fixed from `static void inline` to the
 * conventional `static inline void` (same semantics; avoids
 * -Wold-style-declaration warnings). */
static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc )
{
    dc = (dc + 32) >> 6;
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
    }
}
515
516
/* DC-only 8x8 inverse transform: one DC per 4x4 sub-block. */
static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        add4x4_idct_dc( &p_dst[y*FDEC_STRIDE+x], dct[i] );
    }
}
523
524
/* DC-only 16x16 inverse transform: 16 DCs, one per 4x4 sub-block. */
static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
{
    for( int y = 0; y < 4; y++, dct += 4, p_dst += 4*FDEC_STRIDE )
        for( int x = 0; x < 4; x++ )
            add4x4_idct_dc( &p_dst[x*4], dct[x] );
}
534
535
536
/****************************************************************************
537
* x264_dct_init:
538
****************************************************************************/
539
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
540
{
541
dctf->sub4x4_dct = sub4x4_dct;
542
dctf->add4x4_idct = add4x4_idct;
543
544
dctf->sub8x8_dct = sub8x8_dct;
545
dctf->sub8x8_dct_dc = sub8x8_dct_dc;
546
dctf->add8x8_idct = add8x8_idct;
547
dctf->add8x8_idct_dc = add8x8_idct_dc;
548
549
dctf->sub8x16_dct_dc = sub8x16_dct_dc;
550
551
dctf->sub16x16_dct = sub16x16_dct;
552
dctf->add16x16_idct = add16x16_idct;
553
dctf->add16x16_idct_dc = add16x16_idct_dc;
554
555
dctf->sub8x8_dct8 = sub8x8_dct8;
556
dctf->add8x8_idct8 = add8x8_idct8;
557
558
dctf->sub16x16_dct8 = sub16x16_dct8;
559
dctf->add16x16_idct8 = add16x16_idct8;
560
561
dctf->dct4x4dc = dct4x4dc;
562
dctf->idct4x4dc = idct4x4dc;
563
564
dctf->dct2x4dc = dct2x4dc;
565
566
#if HIGH_BIT_DEPTH
567
#if HAVE_MMX
568
if( cpu&X264_CPU_MMX )
569
{
570
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
571
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
572
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
573
}
574
if( cpu&X264_CPU_SSE2 )
575
{
576
dctf->add4x4_idct = x264_add4x4_idct_sse2;
577
dctf->dct4x4dc = x264_dct4x4dc_sse2;
578
dctf->idct4x4dc = x264_idct4x4dc_sse2;
579
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
580
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
581
dctf->add8x8_idct = x264_add8x8_idct_sse2;
582
dctf->add16x16_idct = x264_add16x16_idct_sse2;
583
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
584
dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
585
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
586
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
587
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
588
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
589
}
590
if( cpu&X264_CPU_SSE4 )
591
{
592
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
593
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
594
}
595
if( cpu&X264_CPU_AVX )
596
{
597
dctf->add4x4_idct = x264_add4x4_idct_avx;
598
dctf->dct4x4dc = x264_dct4x4dc_avx;
599
dctf->idct4x4dc = x264_idct4x4dc_avx;
600
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
601
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
602
dctf->add8x8_idct = x264_add8x8_idct_avx;
603
dctf->add16x16_idct = x264_add16x16_idct_avx;
604
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
605
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
606
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
607
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
608
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
609
}
610
#endif // HAVE_MMX
611
#else // !HIGH_BIT_DEPTH
612
#if HAVE_MMX
613
if( cpu&X264_CPU_MMX )
614
{
615
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
616
dctf->add4x4_idct = x264_add4x4_idct_mmx;
617
dctf->idct4x4dc = x264_idct4x4dc_mmx;
618
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
619
620
#if !ARCH_X86_64
621
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
622
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
623
dctf->add8x8_idct = x264_add8x8_idct_mmx;
624
dctf->add16x16_idct = x264_add16x16_idct_mmx;
625
626
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
627
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
628
dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
629
dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
630
#endif
631
}
632
633
if( cpu&X264_CPU_MMX2 )
634
{
635
dctf->dct4x4dc = x264_dct4x4dc_mmx2;
636
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
637
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
638
}
639
640
if( cpu&X264_CPU_SSE2 )
641
{
642
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
643
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
644
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
645
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
646
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
647
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
648
649
if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
650
{
651
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
652
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
653
dctf->add8x8_idct = x264_add8x8_idct_sse2;
654
dctf->add16x16_idct = x264_add16x16_idct_sse2;
655
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
656
}
657
}
658
659
if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
660
{
661
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
662
if( !(cpu&X264_CPU_SLOW_ATOM) )
663
{
664
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
665
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
666
dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
667
dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
668
dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
669
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
670
{
671
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
672
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
673
}
674
}
675
}
676
677
if( cpu&X264_CPU_SSE4 )
678
dctf->add4x4_idct = x264_add4x4_idct_sse4;
679
680
if( cpu&X264_CPU_AVX )
681
{
682
dctf->add4x4_idct = x264_add4x4_idct_avx;
683
dctf->add8x8_idct = x264_add8x8_idct_avx;
684
dctf->add16x16_idct = x264_add16x16_idct_avx;
685
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
686
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
687
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
688
dctf->sub8x8_dct = x264_sub8x8_dct_avx;
689
dctf->sub16x16_dct = x264_sub16x16_dct_avx;
690
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
691
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
692
}
693
694
if( cpu&X264_CPU_XOP )
695
{
696
dctf->sub8x8_dct = x264_sub8x8_dct_xop;
697
dctf->sub16x16_dct = x264_sub16x16_dct_xop;
698
}
699
700
if( cpu&X264_CPU_AVX2 )
701
{
702
dctf->add8x8_idct = x264_add8x8_idct_avx2;
703
dctf->add16x16_idct = x264_add16x16_idct_avx2;
704
dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
705
dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
706
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
707
#if ARCH_X86_64
708
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
709
#endif
710
}
711
#endif //HAVE_MMX
712
713
#if HAVE_ALTIVEC
714
if( cpu&X264_CPU_ALTIVEC )
715
{
716
dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
717
dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
718
dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
719
720
dctf->add4x4_idct = x264_add4x4_idct_altivec;
721
dctf->add8x8_idct = x264_add8x8_idct_altivec;
722
dctf->add16x16_idct = x264_add16x16_idct_altivec;
723
724
dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
725
dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
726
727
dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
728
dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
729
}
730
#endif
731
732
#if HAVE_ARMV6 || ARCH_AARCH64
733
if( cpu&X264_CPU_NEON )
734
{
735
dctf->sub4x4_dct = x264_sub4x4_dct_neon;
736
dctf->sub8x8_dct = x264_sub8x8_dct_neon;
737
dctf->sub16x16_dct = x264_sub16x16_dct_neon;
738
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
739
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
740
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
741
dctf->dct4x4dc = x264_dct4x4dc_neon;
742
dctf->idct4x4dc = x264_idct4x4dc_neon;
743
744
dctf->add4x4_idct = x264_add4x4_idct_neon;
745
dctf->add8x8_idct = x264_add8x8_idct_neon;
746
dctf->add16x16_idct = x264_add16x16_idct_neon;
747
748
dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
749
dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
750
751
dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
752
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
753
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
754
}
755
#endif
756
757
#if HAVE_MSA
758
if( cpu&X264_CPU_MSA )
759
{
760
dctf->sub4x4_dct = x264_sub4x4_dct_msa;
761
dctf->sub8x8_dct = x264_sub8x8_dct_msa;
762
dctf->sub16x16_dct = x264_sub16x16_dct_msa;
763
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa;
764
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa;
765
dctf->dct4x4dc = x264_dct4x4dc_msa;
766
dctf->idct4x4dc = x264_idct4x4dc_msa;
767
dctf->add4x4_idct = x264_add4x4_idct_msa;
768
dctf->add8x8_idct = x264_add8x8_idct_msa;
769
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa;
770
dctf->add16x16_idct = x264_add16x16_idct_msa;
771
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
772
dctf->add8x8_idct8 = x264_add8x8_idct8_msa;
773
dctf->add16x16_idct8 = x264_add16x16_idct8_msa;
774
}
775
#endif
776
777
#endif // HIGH_BIT_DEPTH
778
}
779
780
781
/* Zigzag scan orders.  ZIG(i,y,x) maps output index i to the (row y,
 * column x) coefficient; ZIGDC marks the DC slot so the AC-only
 * variants can redirect it. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
829
830
/* Progressive (frame) zigzag reorder of an 8x8 coefficient block. */
static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}
834
835
/* Interlaced (field) zigzag reorder of an 8x8 coefficient block. */
static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}
839
840
#undef ZIG
841
#define ZIG(i,y,x) level[i] = dct[x*4+y];
842
#define ZIGDC(i,y,x) ZIG(i,y,x)
843
844
static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
845
{
846
ZIGZAG4_FRAME
847
}
848
849
/* Interlaced (field) 4x4 scan.  Only indices 2..5 actually move;
 * the head and tail of the field scan coincide with raster order,
 * so they are bulk-copied. */
static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}
855
856
#undef ZIG
/* Fused subtract+scan: compute the residual for one coefficient while
 * scanning, and OR it into nz for a nonzero flag. */
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* After subtracting, the encoded block is copied into the reconstruction
 * (the prediction becomes the decoded pixels). */
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
878
879
/* Subtract, frame-scan and copy a 4x4 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
886
887
/* Subtract, field-scan and copy a 4x4 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
894
895
#undef ZIGDC
/* AC-only variant: the DC residual goes to *dc and level[0] is zeroed,
 * so the scanned block carries only AC coefficients. */
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
902
903
/* AC-only frame-scan subtract: DC goes to *dc, ACs to level[1..15];
 * returns 1 if any AC coefficient is nonzero. */
static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
910
911
/* AC-only field-scan subtract: DC goes to *dc, ACs to level[1..15];
 * returns 1 if any AC coefficient is nonzero. */
static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
918
919
/* Subtract, frame-scan and copy an 8x8 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}
926
/* Subtract, field-scan and copy an 8x8 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}

#undef ZIG
#undef COPY4x4
937
/* De-interleave an 8x8 coefficient block into four 4x4 runs for CAVLC,
 * recording a nonzero flag per 4x4 in the nnz map (2x2 layout,
 * stride 8). */
static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
{
    for( int blk = 0; blk < 4; blk++ )
    {
        int coeff_or = 0;
        for( int k = 0; k < 16; k++ )
        {
            dctcoef coeff = src[blk + k*4];
            coeff_or |= coeff;
            dst[blk*16 + k] = coeff;
        }
        nnz[(blk&1) + (blk>>1)*8] = !!coeff_or;
    }
}
950
951
/* Fill both zigzag function tables (progressive and interlaced) with
 * the C implementations, then override with the fastest SIMD versions
 * allowed by the cpu flags. */
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
#endif // HAVE_MMX
#else
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_mmx2;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_xop;
    }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    }
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#if ARCH_AARCH64
        pf_interlaced->scan_4x4   = x264_zigzag_scan_4x4_field_neon;
        pf_interlaced->scan_8x8   = x264_zigzag_scan_8x8_field_neon;
        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_neon;
        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_neon;
        pf_interlaced->sub_8x8    = x264_zigzag_sub_8x8_field_neon;
        pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_neon;
        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_neon;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
        pf_progressive->sub_8x8   = x264_zigzag_sub_8x8_frame_neon;
#endif // ARCH_AARCH64
    }
#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
    }
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
#if ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
    }
#endif // ARCH_AARCH64
#endif // !HIGH_BIT_DEPTH
#if !HIGH_BIT_DEPTH
#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
    }
#endif
#endif
}
1106
1107