/*****************************************************************************
 * dct.c: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Loren Merritt <[email protected]>
 *          Laurent Aimar <[email protected]>
 *          Henrik Gramner <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/
27
#include "common.h"
#if HAVE_MMX
#   include "x86/dct.h"
#endif
#if ARCH_PPC
#   include "ppc/dct.h"
#endif
#if ARCH_ARM
#   include "arm/dct.h"
#endif
#if ARCH_AARCH64
#   include "aarch64/dct.h"
#endif
#if ARCH_MIPS
#   include "mips/dct.h"
#endif
45
/* the inverse of the scaling factors introduced by 8x8 fdct */
46
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
47
#define W(i) (i==0 ? FIX8(1.0000) :\
48
i==1 ? FIX8(0.8859) :\
49
i==2 ? FIX8(1.6000) :\
50
i==3 ? FIX8(0.9415) :\
51
i==4 ? FIX8(1.2651) :\
52
i==5 ? FIX8(1.1910) :0)
53
const uint32_t x264_dct8_weight_tab[64] = {
54
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
55
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
56
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
57
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
58
59
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
60
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
61
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
62
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
63
};
64
#undef W
65
66
#define W(i) (i==0 ? FIX8(1.76777) :\
67
i==1 ? FIX8(1.11803) :\
68
i==2 ? FIX8(0.70711) :0)
69
const uint32_t x264_dct4_weight_tab[16] = {
70
W(0), W(1), W(0), W(1),
71
W(1), W(2), W(1), W(2),
72
W(0), W(1), W(0), W(1),
73
W(1), W(2), W(1), W(2)
74
};
75
#undef W
76
77
/* inverse squared */
78
#define W(i) (i==0 ? FIX8(3.125) :\
79
i==1 ? FIX8(1.25) :\
80
i==2 ? FIX8(0.5) :0)
81
const uint32_t x264_dct4_weight2_tab[16] = {
82
W(0), W(1), W(0), W(1),
83
W(1), W(2), W(1), W(2),
84
W(0), W(1), W(0), W(1),
85
W(1), W(2), W(1), W(2)
86
};
87
#undef W
88
89
#define W(i) (i==0 ? FIX8(1.00000) :\
90
i==1 ? FIX8(0.78487) :\
91
i==2 ? FIX8(2.56132) :\
92
i==3 ? FIX8(0.88637) :\
93
i==4 ? FIX8(1.60040) :\
94
i==5 ? FIX8(1.41850) :0)
95
const uint32_t x264_dct8_weight2_tab[64] = {
96
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
97
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
98
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
99
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
100
101
W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
102
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
103
W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
104
W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
105
};
106
#undef W
107
108
109
/* In-place 4x4 Hadamard transform of the luma DC coefficients,
 * with a +1 rounding and >>1 scale on the second pass. */
static void dct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    /* horizontal pass, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = d[i*4+0] + d[i*4+1];
        int diff01 = d[i*4+0] - d[i*4+1];
        int sum23  = d[i*4+2] + d[i*4+3];
        int diff23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = sum01 + sum23;
        tmp[1*4+i] = sum01 - sum23;
        tmp[2*4+i] = diff01 - diff23;
        tmp[3*4+i] = diff01 + diff23;
    }

    /* vertical pass with rounding */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = tmp[i*4+0] + tmp[i*4+1];
        int diff01 = tmp[i*4+0] - tmp[i*4+1];
        int sum23  = tmp[i*4+2] + tmp[i*4+3];
        int diff23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = ( sum01  + sum23  + 1 ) >> 1;
        d[i*4+1] = ( sum01  - sum23  + 1 ) >> 1;
        d[i*4+2] = ( diff01 - diff23 + 1 ) >> 1;
        d[i*4+3] = ( diff01 + diff23 + 1 ) >> 1;
    }
}
139
140
/* In-place inverse 4x4 Hadamard transform of the luma DC coefficients
 * (no rounding/scaling; the scale is folded into dequantization). */
static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    /* horizontal pass, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = d[i*4+0] + d[i*4+1];
        int diff01 = d[i*4+0] - d[i*4+1];
        int sum23  = d[i*4+2] + d[i*4+3];
        int diff23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = sum01 + sum23;
        tmp[1*4+i] = sum01 - sum23;
        tmp[2*4+i] = diff01 - diff23;
        tmp[3*4+i] = diff01 + diff23;
    }

    /* vertical pass */
    for( int i = 0; i < 4; i++ )
    {
        int sum01  = tmp[i*4+0] + tmp[i*4+1];
        int diff01 = tmp[i*4+0] - tmp[i*4+1];
        int sum23  = tmp[i*4+2] + tmp[i*4+3];
        int diff23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = sum01 + sum23;
        d[i*4+1] = sum01 - sum23;
        d[i*4+2] = diff01 - diff23;
        d[i*4+3] = diff01 + diff23;
    }
}
170
171
/* 2x4 Hadamard transform of the DC coefficients of eight 4x4 blocks
 * (used for 4:2:2 chroma).  The DCs are extracted into dct[] and
 * zeroed in the source blocks. */
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int s[4], d[4];

    /* first butterfly over DC pairs; clear each DC once consumed */
    for( int i = 0; i < 4; i++ )
    {
        s[i] = dct4x4[2*i][0] + dct4x4[2*i+1][0];
        d[i] = dct4x4[2*i][0] - dct4x4[2*i+1][0];
        dct4x4[2*i  ][0] = 0;
        dct4x4[2*i+1][0] = 0;
    }

    int b0 = s[0] + s[1];
    int b1 = s[2] + s[3];
    int b2 = d[0] + d[1];
    int b3 = d[2] + d[3];
    int b4 = s[0] - s[1];
    int b5 = s[2] - s[3];
    int b6 = d[0] - d[1];
    int b7 = d[2] - d[3];

    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
}
206
207
/* diff = pix1 - pix2 over an i_size x i_size block; each input has its
 * own stride, the output is densely packed row-major. */
static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int row = 0; row < i_size; row++ )
        for( int col = 0; col < i_size; col++ )
            diff[row*i_size + col] = pix1[row*i_pix1 + col] - pix2[row*i_pix2 + col];
}
218
219
/* 4x4 forward integer DCT of the residual pix1 - pix2. */
static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef resid[16];
    dctcoef tmp[16];

    pixel_sub_wxh( resid, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    /* horizontal 1D transform, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int s0 = resid[i*4+0] + resid[i*4+3];
        int s1 = resid[i*4+1] + resid[i*4+2];
        int d0 = resid[i*4+0] - resid[i*4+3];
        int d1 = resid[i*4+1] - resid[i*4+2];

        tmp[0*4+i] = s0 + s1;
        tmp[1*4+i] = 2*d0 + d1;
        tmp[2*4+i] = s0 - s1;
        tmp[3*4+i] = d0 - 2*d1;
    }

    /* vertical 1D transform */
    for( int i = 0; i < 4; i++ )
    {
        int s0 = tmp[i*4+0] + tmp[i*4+3];
        int s1 = tmp[i*4+1] + tmp[i*4+2];
        int d0 = tmp[i*4+0] - tmp[i*4+3];
        int d1 = tmp[i*4+1] - tmp[i*4+2];

        dct[i*4+0] = s0 + s1;
        dct[i*4+1] = 2*d0 + d1;
        dct[i*4+2] = s0 - s1;
        dct[i*4+3] = d0 - 2*d1;
    }
}
252
253
/* 8x8 residual DCT as four 4x4 DCTs in raster order. */
static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        sub4x4_dct( dct[i], &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }
}
260
261
/* 16x16 residual DCT as four 8x8 quadrants in raster order. */
static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        sub8x8_dct( &dct[i*4], &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }
}
268
269
/* Sum of the 4x4 residual block == its (unnormalized) DC coefficient. */
static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
    int sum = 0;
    for( int y = 0; y < 4; y++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
        for( int x = 0; x < 4; x++ )
            sum += pix1[x] - pix2[x];
    return sum;
}
277
278
/* DC-only 8x8 transform: per-4x4 residual sums followed by a
 * 2x2 Hadamard over the four DCs. */
static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        dct[i] = sub4x4_dct_dc( &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }

    /* 2x2 DC transform */
    int s0 = dct[0] + dct[1];
    int s1 = dct[2] + dct[3];
    int d0 = dct[0] - dct[1];
    int d1 = dct[2] - dct[3];
    dct[0] = s0 + s1;
    dct[1] = s0 - s1;
    dct[2] = d0 + d1;
    dct[3] = d0 - d1;
}
295
296
/* DC-only 8x16 transform (4:2:2 chroma): per-4x4 residual sums
 * followed by a 2x4 Hadamard over the eight DCs. */
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
{
    int a[8], b[8], c[8];

    /* eight 4x4 DCs in raster order (two columns, four rows) */
    for( int i = 0; i < 8; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        a[i] = sub4x4_dct_dc( &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }

    /* 2x4 DC transform: three butterfly stages */
    for( int i = 0; i < 4; i++ )
    {
        b[i]   = a[2*i] + a[2*i+1];
        b[i+4] = a[2*i] - a[2*i+1];
    }
    for( int i = 0; i < 4; i++ )
    {
        c[i]   = b[2*i] + b[2*i+1];
        c[i+4] = b[2*i] - b[2*i+1];
    }
    dct[0] = c[0] + c[1];
    dct[1] = c[2] + c[3];
    dct[2] = c[0] - c[1];
    dct[3] = c[2] - c[3];
    dct[4] = c[4] - c[5];
    dct[5] = c[6] - c[7];
    dct[6] = c[4] + c[5];
    dct[7] = c[6] + c[7];
}
333
334
/* 4x4 inverse integer DCT; the (rounded, >>6-scaled) result is added
 * to the destination pixels with clipping. */
static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
    dctcoef resid[16];
    dctcoef tmp[16];

    /* vertical 1D inverse transform, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int e0 = dct[0*4+i] + dct[2*4+i];
        int e1 = dct[0*4+i] - dct[2*4+i];
        int o0 = dct[1*4+i] + (dct[3*4+i]>>1);
        int o1 = (dct[1*4+i]>>1) - dct[3*4+i];

        tmp[i*4+0] = e0 + o0;
        tmp[i*4+1] = e1 + o1;
        tmp[i*4+2] = e1 - o1;
        tmp[i*4+3] = e0 - o0;
    }

    /* horizontal 1D inverse transform with rounding */
    for( int i = 0; i < 4; i++ )
    {
        int e0 = tmp[0*4+i] + tmp[2*4+i];
        int e1 = tmp[0*4+i] - tmp[2*4+i];
        int o0 = tmp[1*4+i] + (tmp[3*4+i]>>1);
        int o1 = (tmp[1*4+i]>>1) - tmp[3*4+i];

        resid[0*4+i] = ( e0 + o0 + 32 ) >> 6;
        resid[1*4+i] = ( e1 + o1 + 32 ) >> 6;
        resid[2*4+i] = ( e1 - o1 + 32 ) >> 6;
        resid[3*4+i] = ( e0 - o0 + 32 ) >> 6;
    }

    /* add to prediction and clip */
    for( int y = 0; y < 4; y++, p_dst += FDEC_STRIDE )
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_pixel( p_dst[x] + resid[y*4+x] );
}
373
374
/* 8x8 inverse DCT as four 4x4 iDCTs in raster order. */
static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        add4x4_idct( &p_dst[y*FDEC_STRIDE+x], dct[i] );
    }
}
381
382
/* 16x16 inverse DCT as four 8x8 quadrants in raster order. */
static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        add8x8_idct( &p_dst[y*FDEC_STRIDE+x], &dct[i*4] );
    }
}
389
390
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

/* One 8-point forward transform pass; SRC()/DST() select the axis. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
420
421
/* 8x8 forward DCT of the residual pix1 - pix2. */
static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    /* vertical pass, in place */
#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* horizontal pass, transposed into dct */
#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}
441
442
/* 16x16 residual transform as four 8x8 DCTs in raster order. */
static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        sub8x8_dct8( dct[i], &pix1[y*FENC_STRIDE+x], &pix2[y*FDEC_STRIDE+x] );
    }
}
449
450
/* One 8-point inverse transform pass; SRC() selects the axis and
 * DST(i, expr) stores (so the final pass can clip/scale inline). */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
476
477
/* 8x8 inverse DCT; the >>6-scaled result is added to dst with clipping. */
static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

    /* vertical pass, in place */
#define SRC(x) dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

    /* horizontal pass, scaled and written straight into the frame */
#define SRC(x) dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) )
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}
495
496
/* 16x16 inverse 8x8-DCT as four quadrants in raster order. */
static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 8;
        int y = (i>>1) * 8;
        add8x8_idct8( &dst[y*FDEC_STRIDE+x], dct[i] );
    }
}
503
504
/* Add a single DC coefficient to a 4x4 pixel block, with the same
 * +32 rounding and >>6 scale as the full iDCT, clipping each pixel.
 * Note: specifier order fixed from `static void inline` to the
 * conventional `static inline void` (same semantics; avoids
 * -Wold-style-declaration warnings). */
static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc )
{
    dc = (dc + 32) >> 6;
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
    }
}
515
516
/* DC-only 8x8 inverse transform: one DC per 4x4 sub-block. */
static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
{
    for( int i = 0; i < 4; i++ )
    {
        int x = (i&1)  * 4;
        int y = (i>>1) * 4;
        add4x4_idct_dc( &p_dst[y*FDEC_STRIDE+x], dct[i] );
    }
}
523
524
/* DC-only 16x16 inverse transform: 16 DCs, one per 4x4 sub-block. */
static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
{
    for( int y = 0; y < 4; y++, dct += 4, p_dst += 4*FDEC_STRIDE )
        for( int x = 0; x < 4; x++ )
            add4x4_idct_dc( &p_dst[x*4], dct[x] );
}
534
535
536
/****************************************************************************
537
* x264_dct_init:
538
****************************************************************************/
539
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
540
{
541
dctf->sub4x4_dct = sub4x4_dct;
542
dctf->add4x4_idct = add4x4_idct;
543
544
dctf->sub8x8_dct = sub8x8_dct;
545
dctf->sub8x8_dct_dc = sub8x8_dct_dc;
546
dctf->add8x8_idct = add8x8_idct;
547
dctf->add8x8_idct_dc = add8x8_idct_dc;
548
549
dctf->sub8x16_dct_dc = sub8x16_dct_dc;
550
551
dctf->sub16x16_dct = sub16x16_dct;
552
dctf->add16x16_idct = add16x16_idct;
553
dctf->add16x16_idct_dc = add16x16_idct_dc;
554
555
dctf->sub8x8_dct8 = sub8x8_dct8;
556
dctf->add8x8_idct8 = add8x8_idct8;
557
558
dctf->sub16x16_dct8 = sub16x16_dct8;
559
dctf->add16x16_idct8 = add16x16_idct8;
560
561
dctf->dct4x4dc = dct4x4dc;
562
dctf->idct4x4dc = idct4x4dc;
563
564
dctf->dct2x4dc = dct2x4dc;
565
566
#if HIGH_BIT_DEPTH
567
#if HAVE_MMX
568
if( cpu&X264_CPU_MMX )
569
{
570
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
571
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
572
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
573
}
574
if( cpu&X264_CPU_SSE2 )
575
{
576
dctf->add4x4_idct = x264_add4x4_idct_sse2;
577
dctf->dct4x4dc = x264_dct4x4dc_sse2;
578
dctf->idct4x4dc = x264_idct4x4dc_sse2;
579
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
580
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
581
dctf->add8x8_idct = x264_add8x8_idct_sse2;
582
dctf->add16x16_idct = x264_add16x16_idct_sse2;
583
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
584
dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
585
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
586
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
587
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
588
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
589
}
590
if( cpu&X264_CPU_SSE4 )
591
{
592
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
593
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
594
}
595
if( cpu&X264_CPU_AVX )
596
{
597
dctf->add4x4_idct = x264_add4x4_idct_avx;
598
dctf->dct4x4dc = x264_dct4x4dc_avx;
599
dctf->idct4x4dc = x264_idct4x4dc_avx;
600
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
601
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
602
dctf->add8x8_idct = x264_add8x8_idct_avx;
603
dctf->add16x16_idct = x264_add16x16_idct_avx;
604
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
605
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
606
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
607
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
608
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
609
}
610
#endif // HAVE_MMX
611
#else // !HIGH_BIT_DEPTH
612
#if HAVE_MMX
613
if( cpu&X264_CPU_MMX )
614
{
615
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
616
dctf->add4x4_idct = x264_add4x4_idct_mmx;
617
dctf->idct4x4dc = x264_idct4x4dc_mmx;
618
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
619
620
#if !ARCH_X86_64
621
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
622
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
623
dctf->add8x8_idct = x264_add8x8_idct_mmx;
624
dctf->add16x16_idct = x264_add16x16_idct_mmx;
625
626
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
627
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
628
dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
629
dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
630
#endif
631
}
632
633
if( cpu&X264_CPU_MMX2 )
634
{
635
dctf->dct4x4dc = x264_dct4x4dc_mmx2;
636
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
637
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
638
}
639
640
if( cpu&X264_CPU_SSE2 )
641
{
642
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
643
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
644
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
645
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
646
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
647
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
648
649
if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
650
{
651
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
652
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
653
dctf->add8x8_idct = x264_add8x8_idct_sse2;
654
dctf->add16x16_idct = x264_add16x16_idct_sse2;
655
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
656
}
657
}
658
659
if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
660
{
661
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
662
if( !(cpu&X264_CPU_SLOW_ATOM) )
663
{
664
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
665
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
666
dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
667
dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
668
dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
669
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
670
{
671
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
672
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
673
}
674
}
675
}
676
677
if( cpu&X264_CPU_SSE4 )
678
dctf->add4x4_idct = x264_add4x4_idct_sse4;
679
680
if( cpu&X264_CPU_AVX )
681
{
682
dctf->add4x4_idct = x264_add4x4_idct_avx;
683
dctf->add8x8_idct = x264_add8x8_idct_avx;
684
dctf->add16x16_idct = x264_add16x16_idct_avx;
685
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
686
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
687
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
688
dctf->sub8x8_dct = x264_sub8x8_dct_avx;
689
dctf->sub16x16_dct = x264_sub16x16_dct_avx;
690
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
691
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
692
}
693
694
if( cpu&X264_CPU_XOP )
695
{
696
dctf->sub8x8_dct = x264_sub8x8_dct_xop;
697
dctf->sub16x16_dct = x264_sub16x16_dct_xop;
698
}
699
700
if( cpu&X264_CPU_AVX2 )
701
{
702
dctf->add8x8_idct = x264_add8x8_idct_avx2;
703
dctf->add16x16_idct = x264_add16x16_idct_avx2;
704
dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
705
dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
706
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
707
#if ARCH_X86_64
708
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
709
#endif
710
}
711
#endif //HAVE_MMX
712
713
#if HAVE_ALTIVEC
714
if( cpu&X264_CPU_ALTIVEC )
715
{
716
dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
717
dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
718
dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
719
720
dctf->add4x4_idct = x264_add4x4_idct_altivec;
721
dctf->add8x8_idct = x264_add8x8_idct_altivec;
722
dctf->add16x16_idct = x264_add16x16_idct_altivec;
723
724
dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
725
dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
726
727
dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
728
dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
729
}
730
#endif
731
732
#if HAVE_ARMV6 || ARCH_AARCH64
733
if( cpu&X264_CPU_NEON )
734
{
735
dctf->sub4x4_dct = x264_sub4x4_dct_neon;
736
dctf->sub8x8_dct = x264_sub8x8_dct_neon;
737
dctf->sub16x16_dct = x264_sub16x16_dct_neon;
738
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
739
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
740
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
741
dctf->dct4x4dc = x264_dct4x4dc_neon;
742
dctf->idct4x4dc = x264_idct4x4dc_neon;
743
744
dctf->add4x4_idct = x264_add4x4_idct_neon;
745
dctf->add8x8_idct = x264_add8x8_idct_neon;
746
dctf->add16x16_idct = x264_add16x16_idct_neon;
747
748
dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
749
dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
750
751
dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
752
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
753
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
754
}
755
#endif
756
757
#if HAVE_MSA
758
if( cpu&X264_CPU_MSA )
759
{
760
dctf->sub4x4_dct = x264_sub4x4_dct_msa;
761
dctf->sub8x8_dct = x264_sub8x8_dct_msa;
762
dctf->sub16x16_dct = x264_sub16x16_dct_msa;
763
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa;
764
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa;
765
dctf->dct4x4dc = x264_dct4x4dc_msa;
766
dctf->idct4x4dc = x264_idct4x4dc_msa;
767
dctf->add4x4_idct = x264_add4x4_idct_msa;
768
dctf->add8x8_idct = x264_add8x8_idct_msa;
769
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa;
770
dctf->add16x16_idct = x264_add16x16_idct_msa;
771
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
772
dctf->add8x8_idct8 = x264_add8x8_idct8_msa;
773
dctf->add16x16_idct8 = x264_add16x16_idct8_msa;
774
}
775
#endif
776
777
#endif // HIGH_BIT_DEPTH
778
}
779
780
781
/* Zigzag scan orders.  ZIG(i,y,x) maps output index i to the (row y,
 * column x) coefficient; ZIGDC marks the DC slot so the AC-only
 * variants can redirect it. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
829
830
/* Progressive (frame) zigzag reorder of an 8x8 coefficient block. */
static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}
834
835
/* Interlaced (field) zigzag reorder of an 8x8 coefficient block. */
static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}
839
840
#undef ZIG
841
#define ZIG(i,y,x) level[i] = dct[x*4+y];
842
#define ZIGDC(i,y,x) ZIG(i,y,x)
843
844
static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
845
{
846
ZIGZAG4_FRAME
847
}
848
849
/* Interlaced (field) 4x4 scan.  Only indices 2..5 actually move;
 * the head and tail of the field scan coincide with raster order,
 * so they are bulk-copied. */
static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}
855
856
#undef ZIG
/* Fused subtract+scan: compute the residual for one coefficient while
 * scanning, and OR it into nz for a nonzero flag. */
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* After subtracting, the encoded block is copied into the reconstruction
 * (the prediction becomes the decoded pixels). */
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
878
879
/* Subtract, frame-scan and copy a 4x4 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
886
887
/* Subtract, field-scan and copy a 4x4 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
894
895
#undef ZIGDC
/* AC-only variant: the DC residual goes to *dc and level[0] is zeroed,
 * so the scanned block carries only AC coefficients. */
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
902
903
/* AC-only frame-scan subtract: DC goes to *dc, ACs to level[1..15];
 * returns 1 if any AC coefficient is nonzero. */
static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
910
911
/* AC-only field-scan subtract: DC goes to *dc, ACs to level[1..15];
 * returns 1 if any AC coefficient is nonzero. */
static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
918
919
/* Subtract, frame-scan and copy an 8x8 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}
926
/* Subtract, field-scan and copy an 8x8 block; returns 1 if any
 * coefficient is nonzero. */
static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}

#undef ZIG
#undef COPY4x4
937
/* De-interleave an 8x8 coefficient block into four 4x4 runs for CAVLC,
 * recording a nonzero flag per 4x4 in the nnz map (2x2 layout,
 * stride 8). */
static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
{
    for( int blk = 0; blk < 4; blk++ )
    {
        int coeff_or = 0;
        for( int k = 0; k < 16; k++ )
        {
            dctcoef coeff = src[blk + k*4];
            coeff_or |= coeff;
            dst[blk*16 + k] = coeff;
        }
        nnz[(blk&1) + (blk>>1)*8] = !!coeff_or;
    }
}
950
951
/* Fill both zigzag function tables (progressive and interlaced) with
 * the C implementations, then override with the fastest SIMD versions
 * allowed by the cpu flags. */
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
#endif // HAVE_MMX
#else
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_mmx2;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_xop;
    }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    }
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#if ARCH_AARCH64
        pf_interlaced->scan_4x4   = x264_zigzag_scan_4x4_field_neon;
        pf_interlaced->scan_8x8   = x264_zigzag_scan_8x8_field_neon;
        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_neon;
        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_neon;
        pf_interlaced->sub_8x8    = x264_zigzag_sub_8x8_field_neon;
        pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_neon;
        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_neon;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
        pf_progressive->sub_8x8   = x264_zigzag_sub_8x8_frame_neon;
#endif // ARCH_AARCH64
    }
#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
    }
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
#if ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
    }
#endif // ARCH_AARCH64
#endif // !HIGH_BIT_DEPTH
#if !HIGH_BIT_DEPTH
#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
    }
#endif
#endif
}
1106
1107