/*****************************************************************************
 * pixel.c: pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Loren Merritt <[email protected]>
 *          Laurent Aimar <[email protected]>
 *          Fiona Glaser <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common.h"

#if HAVE_MMX
# include "x86/pixel.h"
# include "x86/predict.h"
#endif
#if ARCH_PPC
# include "ppc/pixel.h"
#endif
#if ARCH_ARM
# include "arm/pixel.h"
# include "arm/predict.h"
#endif
#if ARCH_AARCH64
# include "aarch64/pixel.h"
# include "aarch64/predict.h"
#endif
#if ARCH_MIPS
# include "mips/pixel.h"
#endif

/****************************************************************************
 * pixel_sad_WxH
 ****************************************************************************/
#define PIXEL_SAD_C( name, lx, ly ) \
static int name( pixel *pix1, intptr_t i_stride_pix1,  \
                 pixel *pix2, intptr_t i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            i_sum += abs( pix1[x] - pix2[x] );      \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}


PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
PIXEL_SAD_C( x264_pixel_sad_4x16,   4, 16 )
PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )

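/* Illustrative note (added for exposition): SAD is the plain sum of absolute
 * differences over the block. For a hypothetical 2x2 pair {1,2;3,4} vs
 * {2,2;1,4} it would be |1-2| + |2-2| + |3-1| + |4-4| = 3; lower means a
 * better match. These C versions are the fallbacks that the asm dispatch in
 * x264_pixel_init() may overwrite. */
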
/****************************************************************************
 * pixel_ssd_WxH
 ****************************************************************************/
#define PIXEL_SSD_C( name, lx, ly ) \
static int name( pixel *pix1, intptr_t i_stride_pix1,  \
                 pixel *pix2, intptr_t i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            int d = pix1[x] - pix2[x];              \
            i_sum += d*d;                           \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}

PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
PIXEL_SSD_C( x264_pixel_ssd_4x16,   4, 16 )
PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )

uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1,
                             pixel *pix2, intptr_t i_pix2, int i_width, int i_height )
{
    uint64_t i_ssd = 0;
    int y;
    int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);

#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                          pix2 + y*i_pix2 + x, i_pix2 );
    for( y = 0; y < i_height-15; y += 16 )
    {
        int x = 0;
        if( align )
            for( ; x < i_width-15; x += 16 )
                SSD(PIXEL_16x16);
        for( ; x < i_width-7; x += 8 )
            SSD(PIXEL_8x16);
    }
    if( y < i_height-7 )
        for( int x = 0; x < i_width-7; x += 8 )
            SSD(PIXEL_8x8);
#undef SSD

#define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
    if( i_width & 7 )
    {
        for( y = 0; y < (i_height & ~7); y++ )
            for( int x = i_width & ~7; x < i_width; x++ )
                SSD1;
    }
    if( i_height & 7 )
    {
        for( y = i_height & ~7; y < i_height; y++ )
            for( int x = 0; x < i_width; x++ )
                SSD1;
    }
#undef SSD1

    return i_ssd;
}

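/* Illustrative note: x264_pixel_ssd_wxh tiles as much of the plane as it can
 * with the fast block SSD functions and mops up the ragged edges per pixel.
 * E.g. for a hypothetical 21x18 plane it would run 16x16 SSD (if everything
 * is 16-aligned) over columns 0-15 of rows 0-15, then SSD1 over the 5
 * rightmost columns of those rows and over the bottom 2 rows. */
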
static void pixel_ssd_nv12_core( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2,
                                 int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
{
    *ssd_u = 0, *ssd_v = 0;
    for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 )
        for( int x = 0; x < width; x++ )
        {
            int du = pixuv1[2*x]   - pixuv2[2*x];
            int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
            *ssd_u += du*du;
            *ssd_v += dv*dv;
        }
}

void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
                          int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v )
{
    pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v );
    if( i_width&7 )
    {
        uint64_t tmp[2];
        pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height, &tmp[0], &tmp[1] );
        *ssd_u += tmp[0];
        *ssd_v += tmp[1];
    }
}
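
/* Illustrative note: NV12 chroma is stored as interleaved UVUVUV... samples,
 * so in pixel_ssd_nv12_core pixuv[2*x] is a U sample and pixuv[2*x+1] its V
 * neighbour, and one pass accumulates both SSDs. The wrapper runs the
 * (possibly asm) core on the widest multiple of 8 and finishes any remaining
 * 1-7 columns with this C core. */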

/****************************************************************************
 * pixel_var_wxh
 ****************************************************************************/
#define PIXEL_VAR_C( name, w, h ) \
static uint64_t name( pixel *pix, intptr_t i_stride ) \
{                                             \
    uint32_t sum = 0, sqr = 0;                \
    for( int y = 0; y < h; y++ )              \
    {                                         \
        for( int x = 0; x < w; x++ )          \
        {                                     \
            sum += pix[x];                    \
            sqr += pix[x] * pix[x];           \
        }                                     \
        pix += i_stride;                      \
    }                                         \
    return sum + ((uint64_t)sqr << 32);       \
}

PIXEL_VAR_C( x264_pixel_var_16x16, 16, 16 )
PIXEL_VAR_C( x264_pixel_var_8x16,   8, 16 )
PIXEL_VAR_C( x264_pixel_var_8x8,    8,  8 )

/****************************************************************************
 * pixel_var2_wxh
 ****************************************************************************/
#define PIXEL_VAR2_C( name, w, h, shift ) \
static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
{ \
    int var = 0, sum = 0, sqr = 0; \
    for( int y = 0; y < h; y++ ) \
    { \
        for( int x = 0; x < w; x++ ) \
        { \
            int diff = pix1[x] - pix2[x]; \
            sum += diff; \
            sqr += diff * diff; \
        } \
        pix1 += i_stride1; \
        pix2 += i_stride2; \
    } \
    var = sqr - ((int64_t)sum * sum >> shift); \
    *ssd = sqr; \
    return var; \
}

PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8,  8,  8, 6 )

#if BIT_DEPTH > 8
typedef uint32_t sum_t;
typedef uint64_t sum2_t;
#else
typedef uint16_t sum_t;
typedef uint32_t sum2_t;
#endif
#define BITS_PER_SUM (8 * sizeof(sum_t))

#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    sum2_t t0 = s0 + s1;\
    sum2_t t1 = s0 - s1;\
    sum2_t t2 = s2 + s3;\
    sum2_t t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}

// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
static ALWAYS_INLINE sum2_t abs2( sum2_t a )
{
    sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<<BITS_PER_SUM)+1))*((sum_t)-1);
    return (a+s)^s;
}
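
/* Illustrative walk-through of abs2 (8-bit build, so BITS_PER_SUM == 16):
 * for x = -3, y = 5, a = x + (y<<16) = 0x0004FFFD (the borrow leaves 4 in the
 * high half). a>>15 = 0x9; masking with 0x10001 keeps the two sign bits (= 1)
 * and multiplying by 0xFFFF spreads them: s = 0x0000FFFF. Then
 * (a+s)^s = 0x0005FFFC ^ 0x0000FFFF = 0x00050003 = abs(-3) + (abs(5)<<16);
 * the carry out of the low half exactly cancels the earlier borrow. */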

/****************************************************************************
 * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
 ****************************************************************************/

static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }
    for( int i = 0; i < 2; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0>>BITS_PER_SUM);
    }
    return sum >> 1;
}
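
/* Unpacked reference sketch, for exposition only (satd_4x4_ref is not part of
 * x264): it computes the same value as x264_pixel_satd_4x4 above, without the
 * two-columns-per-word packing. */
#if 0
static int satd_4x4_ref( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    int d[4][4], sum = 0;
    for( int i = 0; i < 4; i++ )        /* difference block */
        for( int j = 0; j < 4; j++ )
            d[i][j] = pix1[i*i_pix1+j] - pix2[i*i_pix2+j];
    for( int i = 0; i < 4; i++ )        /* horizontal 4-point Hadamard */
        HADAMARD4( d[i][0], d[i][1], d[i][2], d[i][3],
                   d[i][0], d[i][1], d[i][2], d[i][3] );
    for( int j = 0; j < 4; j++ )        /* vertical 4-point Hadamard */
        HADAMARD4( d[0][j], d[1][j], d[2][j], d[3][j],
                   d[0][j], d[1][j], d[2][j], d[3][j] );
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
            sum += abs( d[i][j] );
    return sum >> 1;                    /* compensate the transform gain */
}
#endif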

static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}

#define PIXEL_SATD_C( w, h, sub )\
static int x264_pixel_satd_##w##x##h( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )\
{\
    int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
            + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
    if( w==16 )\
        sum+= sub( pix1+8, i_pix1, pix2+8, i_pix2 )\
            + sub( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );\
    if( h==16 )\
        sum+= sub( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )\
            + sub( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );\
    if( w==16 && h==16 )\
        sum+= sub( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )\
            + sub( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );\
    return sum;\
}
PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 16, 8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 4,  16, x264_pixel_satd_4x4 )
PIXEL_SATD_C( 4,  8,  x264_pixel_satd_4x4 )

static NOINLINE int sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;
    for( int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4+a5) + ((a4-a5)<<BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6+a7) + ((a6-a7)<<BITS_PER_SUM);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0,b1,b2,b3 );
    }
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        HADAMARD4( a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i] );
        b0  = abs2(a0+a4) + abs2(a0-a4);
        b0 += abs2(a1+a5) + abs2(a1-a5);
        b0 += abs2(a2+a6) + abs2(a2-a6);
        b0 += abs2(a3+a7) + abs2(a3-a7);
        sum += (sum_t)b0 + (b0>>BITS_PER_SUM);
    }
    return sum;
}

static int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 );
    return (sum+2)>>2;
}

static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 )
            + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 )
            + sa8d_8x8( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )
            + sa8d_8x8( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 );
    return (sum+2)>>2;
}
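
/* Illustrative note: sa8d_8x8 returns the raw sum of absolute 8x8 Hadamard
 * coefficients. The wrappers above apply (sum+2)>>2, a rounded divide by 4,
 * compensating the larger transform gain so that sa8d scores stay roughly on
 * the same scale as satd of the same area. */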

static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride )
{
    sum2_t tmp[32];
    sum2_t a0, a1, a2, a3, dc;
    sum2_t sum4 = 0, sum8 = 0;
    for( int i = 0; i < 8; i++, pix+=stride )
    {
        sum2_t *t = tmp + (i&3) + (i&4)*4;
        a0 = (pix[0]+pix[1]) + ((sum2_t)(pix[0]-pix[1])<<BITS_PER_SUM);
        a1 = (pix[2]+pix[3]) + ((sum2_t)(pix[2]-pix[3])<<BITS_PER_SUM);
        t[0] = a0 + a1;
        t[4] = a0 - a1;
        a2 = (pix[4]+pix[5]) + ((sum2_t)(pix[4]-pix[5])<<BITS_PER_SUM);
        a3 = (pix[6]+pix[7]) + ((sum2_t)(pix[6]-pix[7])<<BITS_PER_SUM);
        t[8] = a2 + a3;
        t[12] = a2 - a3;
    }
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[i*4+0], tmp[i*4+1], tmp[i*4+2], tmp[i*4+3] );
        tmp[i*4+0] = a0;
        tmp[i*4+1] = a1;
        tmp[i*4+2] = a2;
        tmp[i*4+3] = a3;
        sum4 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0,a1,a2,a3, tmp[i], tmp[8+i], tmp[16+i], tmp[24+i] );
        sum8 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    dc = (sum_t)(tmp[0] + tmp[8] + tmp[16] + tmp[24]);
    sum4 = (sum_t)sum4 + (sum4>>BITS_PER_SUM) - dc;
    sum8 = (sum_t)sum8 + (sum8>>BITS_PER_SUM) - dc;
    return ((uint64_t)sum8<<32) + sum4;
}

#define HADAMARD_AC(w,h) \
static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, intptr_t stride )\
{\
    uint64_t sum = pixel_hadamard_ac( pix, stride );\
    if( w==16 )\
        sum += pixel_hadamard_ac( pix+8, stride );\
    if( h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride, stride );\
    if( w==16 && h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride+8, stride );\
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);\
}
HADAMARD_AC( 16, 16 )
HADAMARD_AC( 16, 8 )
HADAMARD_AC( 8, 16 )
HADAMARD_AC( 8, 8 )

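/* Illustrative note on the packing: pixel_hadamard_ac returns (sum8<<32)+sum4,
 * i.e. the AC energy of the 8x8 transform in the high 32 bits and of the four
 * 4x4 sub-transforms in the low 32 bits, each with the DC term subtracted.
 * After accumulating up to four such pairs, ((sum>>34)<<32)+((uint32_t)sum>>1)
 * quarters the 8x8 half and halves the 4x4 half in one step, normalizing the
 * differing transform gains. */
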
/****************************************************************************
 * pixel_sad_x4
 ****************************************************************************/
#define SAD_X( size ) \
static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
                                      intptr_t i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\
                                      intptr_t i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
}

SAD_X( 16x16 )
SAD_X( 16x8 )
SAD_X( 8x16 )
SAD_X( 8x8 )
SAD_X( 8x4 )
SAD_X( 4x8 )
SAD_X( 4x4 )

/****************************************************************************
 * pixel_satd_x4
 * no faster than single satd, but needed for satd to be a drop-in replacement for sad
 ****************************************************************************/

#define SATD_X( size, cpu ) \
static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
                                            intptr_t i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\
                                            intptr_t i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
}
#define SATD_X_DECL6( cpu )\
SATD_X( 16x16, cpu )\
SATD_X( 16x8, cpu )\
SATD_X( 8x16, cpu )\
SATD_X( 8x8, cpu )\
SATD_X( 8x4, cpu )\
SATD_X( 4x8, cpu )
#define SATD_X_DECL7( cpu )\
SATD_X_DECL6( cpu )\
SATD_X( 4x4, cpu )

SATD_X_DECL7()
#if HAVE_MMX
SATD_X_DECL7( _mmx2 )
#if !HIGH_BIT_DEPTH
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL6( _ssse3_atom )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
SATD_X_DECL7( _xop )
#endif // !HIGH_BIT_DEPTH
#endif

#if !HIGH_BIT_DEPTH
#if HAVE_ARMV6 || ARCH_AARCH64
SATD_X_DECL7( _neon )
#endif
#endif // !HIGH_BIT_DEPTH

#define INTRA_MBCMP_8x8( mbcmp, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[36], int res[3] )\
{\
    ALIGNED_ARRAY_16( pixel, pix, [8*FDEC_STRIDE] );\
    x264_predict_8x8_v##cpu2( pix, edge );\
    res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_h##cpu2( pix, edge );\
    res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_dc##cpu2( pix, edge );\
    res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP_8x8( sad,, _c )
INTRA_MBCMP_8x8(sa8d,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
#define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse
INTRA_MBCMP_8x8( sad, _mmx2,  _c )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64)
INTRA_MBCMP_8x8( sad, _neon, _neon )
INTRA_MBCMP_8x8(sa8d, _neon, _neon )
#endif

#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
{\
    x264_predict_##size##chroma##_##pred1##cpu2( fdec );\
    res[0] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##chroma##_##pred2##cpu2( fdec );\
    res[1] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##chroma##_##pred3##cpu2( fdec );\
    res[2] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP( sad,  4x4,   v, h, dc,  ,, _c )
INTRA_MBCMP(satd,  4x4,   v, h, dc,  ,, _c )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c,, _c )
INTRA_MBCMP(satd,  8x8,  dc, h,  v, c,, _c )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c,, _c )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c,, _c )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  ,, _c )
INTRA_MBCMP(satd, 16x16,  v, h, dc,  ,, _c )

#if HAVE_MMX
#if HIGH_BIT_DEPTH
#define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _mmx2, _c )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _mmx2, _mmx2 )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _sse2, _sse2 )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _sse2, _sse2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _sse2, _sse2 )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _ssse3, _sse2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _ssse3, _sse2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse4, _sse2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _avx, _sse2 )
#else
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _ssse3, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _sse4, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _avx, _mmx2 )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _xop, _mmx2 )
#endif
#endif
#if !HIGH_BIT_DEPTH && HAVE_ARMV6
INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _armv6 )
INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _armv6 )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _c )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _neon, _c )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _neon, _neon )
INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _neon, _neon )
#endif
#if !HIGH_BIT_DEPTH && ARCH_AARCH64
INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _neon )
INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _neon )
INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _neon )
INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _neon, _neon )
INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _neon, _neon )
INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _neon, _neon )
#endif

// No C implementation of intra_satd_x9. See checkasm for its behavior,
// or see x264_mb_analyse_intra for the entirely different algorithm we
// use when lacking an asm implementation of it.


/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1,
                             const pixel *pix2, intptr_t stride2,
                             int sums[2][4] )
{
    for( int z = 0; z < 2; z++ )
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
            {
                int a = pix1[x+y*stride1];
                int b = pix2[x+y*stride2];
                s1  += a;
                s2  += b;
                ss  += a*a;
                ss  += b*b;
                s12 += a*b;
            }
        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}

static float ssim_end1( int s1, int s2, int ss, int s12 )
{
    /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
     * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
     * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
#if BIT_DEPTH > 9
#define type float
    static const float ssim_c1 = .01*.01*PIXEL_MAX*PIXEL_MAX*64;
    static const float ssim_c2 = .03*.03*PIXEL_MAX*PIXEL_MAX*64*63;
#else
#define type int
    static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
    static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
#endif
    type fs1 = s1;
    type fs2 = s2;
    type fss = ss;
    type fs12 = s12;
    type vars = fss*64 - fs1*fs1 - fs2*fs2;
    type covar = fs12*64 - fs1*fs2;
    return (float)(2*fs1*fs2 + ssim_c1) * (float)(2*covar + ssim_c2)
         / ((float)(fs1*fs1 + fs2*fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
}
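
/* Illustrative note: with s1 = sum(a), s2 = sum(b), ss = sum(a^2) + sum(b^2)
 * and s12 = sum(a*b) over a 64-pixel window, vars and covar above are scaled
 * versions of (var1 + var2) and the covariance, so the return value is the
 * usual SSIM term
 *     (2*mu1*mu2 + C1)*(2*cov + C2) / ((mu1^2 + mu2^2 + C1)*(var1 + var2 + C2))
 * with ssim_c1/ssim_c2 pre-scaled (by 64 and 64*63) to match the integer
 * sums. */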

static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
{
    float ssim = 0.0;
    for( int i = 0; i < width; i++ )
        ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],
                           sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],
                           sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],
                           sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] );
    return ssim;
}

float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                           pixel *pix1, intptr_t stride1,
                           pixel *pix2, intptr_t stride2,
                           int width, int height, void *buf, int *cnt )
{
    int z = 0;
    float ssim = 0.0;
    int (*sum0)[4] = buf;
    int (*sum1)[4] = sum0 + (width >> 2) + 3;
    width >>= 2;
    height >>= 2;
    for( int y = 1; y < height; y++ )
    {
        for( ; z <= y; z++ )
        {
            XCHG( void*, sum0, sum1 );
            for( int x = 0; x < width; x+=2 )
                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
        }
        for( int x = 0; x < width-1; x += 4 )
            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
    }
    *cnt = (height-1) * (width-1);
    return ssim;
}

static int pixel_vsad( pixel *src, intptr_t stride, int height )
{
    int score = 0;
    for( int i = 1; i < height; i++, src += stride )
        for( int j = 0; j < 16; j++ )
            score += abs(src[j] - src[j+stride]);
    return score;
}
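
/* Illustrative note: vsad sums |src[x,y] - src[x,y+1]| over a 16-pixel-wide
 * strip, i.e. the SAD between each line and the line below it. Computing it
 * once per frame (stride) and once per field (stride*2) is how
 * x264_field_vsad below decides whether a macroblock pair looks more like
 * frame or field content for interlace analysis. */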

int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
{
    int score_field, score_frame;
    int stride = h->fenc->i_stride[0];
    int mb_stride = h->mb.i_mb_stride;
    pixel *fenc = h->fenc->plane[0] + 16 * (mb_x + mb_y * stride);
    int mb_xy = mb_x + mb_y*mb_stride;

    /* We don't want to analyze pixels outside the frame, as it gives inaccurate results. */
    int mbpair_height = X264_MIN( h->param.i_height - mb_y * 16, 32 );
    score_frame  = h->pixf.vsad( fenc,          stride, mbpair_height );
    score_field  = h->pixf.vsad( fenc,        stride*2, mbpair_height >> 1 );
    score_field += h->pixf.vsad( fenc+stride, stride*2, mbpair_height >> 1 );

    if( mb_x > 0 )
        score_field += 512 - h->mb.field[mb_xy-1]*1024;
    if( mb_y > 0 )
        score_field += 512 - h->mb.field[mb_xy-mb_stride]*1024;

    return (score_field < score_frame);
}

static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height )
{
    int sum = 0;
    for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
        for( int x = 0; x < 8; x++ )
            sum += pix1[x] - pix2[x];
    return abs( sum );
}

/****************************************************************************
 * successive elimination
 ****************************************************************************/
static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[8] )
                + abs( enc_dc[2] - sums[delta] )
                + abs( enc_dc[3] - sums[delta+8] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[delta] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}
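
/* Illustrative note: "ads" is the successive-elimination prefilter for motion
 * search. enc_dc[] holds sub-block pixel sums of the source block and sums[]
 * the precomputed sums at each candidate position; since
 * SAD(a,b) >= |sum(a) - sum(b)|, any candidate whose lower bound plus MV cost
 * (cost_mvx[i]) already reaches thresh cannot beat the current best, so only
 * the indices written to mvs[] need a full SAD. ads4/ads2/ads1 differ only in
 * how many sub-block sums they compare. */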


/****************************************************************************
 * x264_pixel_init:
 ****************************************************************************/
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
    memset( pixf, 0, sizeof(*pixf) );

#define INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
    pixf->name1[PIXEL_16x8]  = x264_pixel_##name2##_16x8##cpu;
#define INIT4_NAME( name1, name2, cpu ) \
    INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x16]  = x264_pixel_##name2##_8x16##cpu;\
    pixf->name1[PIXEL_8x8]   = x264_pixel_##name2##_8x8##cpu;
#define INIT5_NAME( name1, name2, cpu ) \
    INIT4_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x4]   = x264_pixel_##name2##_8x4##cpu;
#define INIT6_NAME( name1, name2, cpu ) \
    INIT5_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_4x8]   = x264_pixel_##name2##_4x8##cpu;
#define INIT7_NAME( name1, name2, cpu ) \
    INIT6_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_4x4]   = x264_pixel_##name2##_4x4##cpu;
#define INIT8_NAME( name1, name2, cpu ) \
    INIT7_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_4x16]  = x264_pixel_##name2##_4x16##cpu;
#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )

#define INIT_ADS( cpu ) \
    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
    pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
    pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;

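/* Illustrative note: each INITn macro fills the n most common block sizes of
 * one function table in a single statement. For example, INIT2( sad, _mmx2 )
 * expands to
 *     pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmx2;
 *     pixf->sad[PIXEL_16x8]  = x264_pixel_sad_16x8_mmx2;
 * so the CPU-dispatch blocks below can overwrite the C defaults wholesale. */
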
    INIT8( sad, );
    INIT8_NAME( sad_aligned, sad, );
    INIT7( sad_x3, );
    INIT7( sad_x4, );
    INIT8( ssd, );
    INIT8( satd, );
    INIT7( satd_x3, );
    INIT7( satd_x4, );
    INIT4( hadamard_ac, );
    INIT_ADS( );

    pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
    pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
    pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16;
    pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16;
    pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8;
    pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16;
    pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8;

    pixf->ssd_nv12_core = pixel_ssd_nv12_core;
    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
    pixf->ssim_end4 = ssim_end4;
    pixf->vsad = pixel_vsad;
    pixf->asd8 = pixel_asd8;

    pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4;
    pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4;
    pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8;
    pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8;
    pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c;
    pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c;
    pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c;
    pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c;
    pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16;
    pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX2 )
    {
        INIT7( sad, _mmx2 );
        INIT7_NAME( sad_aligned, sad, _mmx2 );
        INIT7( sad_x3, _mmx2 );
        INIT7( sad_x4, _mmx2 );
        INIT8( satd, _mmx2 );
        INIT7( satd_x3, _mmx2 );
        INIT7( satd_x4, _mmx2 );
        INIT4( hadamard_ac, _mmx2 );
        INIT8( ssd, _mmx2 );
        INIT_ADS( _mmx2 );

        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_mmx2;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
#endif

        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmx2;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmx2;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmx2;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmx2;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmx2;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_mmx2;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmx2;
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        INIT4_NAME( sad_aligned, sad, _sse2_aligned );
        INIT5( ssd, _sse2 );
        INIT6( satd, _sse2 );
        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;

        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
#endif
        pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_sse2;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_sse2;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_sse2;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
    }
    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        INIT5( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
        INIT_ADS( _sse2 );

        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse2 );
        }
        pixf->vsad = x264_pixel_vsad_sse2;
        pixf->asd8 = x264_pixel_asd8_sse2;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_sse2;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_sse2;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_sse2;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_sse2;
    }
    if( cpu&X264_CPU_SSE2_IS_FAST )
    {
        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
        pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_sse2;
        pixf->sad_x3[PIXEL_8x4]  = x264_pixel_sad_x3_8x4_sse2;
        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
        pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_sse2;
        pixf->sad_x4[PIXEL_8x4]  = x264_pixel_sad_x4_8x4_sse2;
    }
    if( cpu&X264_CPU_SSSE3 )
    {
        INIT4_NAME( sad_aligned, sad, _ssse3_aligned );
        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3;
        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3;
        INIT7( sad, _ssse3 );
        INIT7( sad_x3, _ssse3 );
        INIT7( sad_x4, _ssse3 );
        INIT_ADS( _ssse3 );
        INIT6( satd, _ssse3 );
        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;

        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _ssse3 );
        }
        pixf->vsad = x264_pixel_vsad_ssse3;
        pixf->asd8 = x264_pixel_asd8_ssse3;
        pixf->intra_sad_x3_4x4  = x264_intra_sad_x3_4x4_ssse3;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_ssse3;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_ssse3;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
#endif
        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_ssse3;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_ssse3;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_ssse3;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        INIT6( satd, _sse4 );
        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4;
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse4 );
        }
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse4;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse4;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
#endif
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs don't benefit from an aligned version */
        INIT_ADS( _avx );
        INIT6( satd, _avx );
        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _avx );
        }
        pixf->intra_sad_x3_4x4  = x264_intra_sad_x3_4x4_avx;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_avx;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_avx;
        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_avx;
        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_avx;
        pixf->ssd_nv12_core     = x264_pixel_ssd_nv12_core_avx;
        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_avx;
        pixf->ssim_end4         = x264_pixel_ssim_end4_avx;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
#endif
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        INIT5( sad_x3, _xop );
        INIT5( sad_x4, _xop );
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
        pixf->vsad = x264_pixel_vsad_xop;
        pixf->asd8 = x264_pixel_asd8_xop;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
    }
    if( cpu&X264_CPU_AVX2 )
    {
        INIT2( ssd, _avx2 );
        INIT2( sad, _avx2 );
        INIT2_NAME( sad_aligned, sad, _avx2 );
        INIT2( sad_x3, _avx2 );
        INIT2( sad_x4, _avx2 );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
        pixf->vsad = x264_pixel_vsad_avx2;
        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        INIT8( ssd, _mmx );
    }

    if( cpu&X264_CPU_MMX2 )
    {
        INIT8( sad, _mmx2 );
        INIT8_NAME( sad_aligned, sad, _mmx2 );
        INIT7( sad_x3, _mmx2 );
        INIT7( sad_x4, _mmx2 );
        INIT8( satd, _mmx2 );
        INIT7( satd_x3, _mmx2 );
        INIT7( satd_x4, _mmx2 );
        INIT4( hadamard_ac, _mmx2 );
        INIT_ADS( _mmx2 );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_mmx2;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_mmx2;
#if ARCH_X86
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmx2;
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_mmx2;
        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_mmx2;
        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_mmx2;
        pixf->vsad = x264_pixel_vsad_mmx2;

        if( cpu&X264_CPU_CACHELINE_32 )
        {
            INIT5( sad, _cache32_mmx2 );
            INIT4( sad_x3, _cache32_mmx2 );
            INIT4( sad_x4, _cache32_mmx2 );
        }
        else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
        {
            INIT5( sad, _cache64_mmx2 );
            INIT4( sad_x3, _cache64_mmx2 );
            INIT4( sad_x4, _cache64_mmx2 );
        }
#else
        if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
        {
            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
            pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmx2;
            pixf->sad[PIXEL_8x4]  = x264_pixel_sad_8x4_cache64_mmx2;
            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmx2;
            pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_cache64_mmx2;
            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmx2;
            pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_cache64_mmx2;
        }
#endif
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmx2;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_mmx2;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmx2;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmx2;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmx2;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmx2;
        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmx2;
    }

    if( cpu&X264_CPU_SSE2 )
    {
        INIT5( ssd, _sse2slow );
        INIT2_NAME( sad_aligned, sad, _sse2_aligned );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_sse2;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
#endif
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_sse2;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
        pixf->vsad = x264_pixel_vsad_sse2;
        pixf->asd8 = x264_pixel_asd8_sse2;
    }

    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        INIT2( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
        INIT6( satd, _sse2 );
        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
        INIT6( satd_x3, _sse2 );
        INIT6( satd_x4, _sse2 );
        INIT4( hadamard_ac, _sse2 );
        INIT_ADS( _sse2 );
        pixf->var[PIXEL_8x8]  = x264_pixel_var_8x8_sse2;
        pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_sse2;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_sse2;
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( ssd, _sse2); /* faster for width 16 on p4 */
#if ARCH_X86
            INIT2( sad, _cache64_sse2 );
            INIT2( sad_x3, _cache64_sse2 );
            INIT2( sad_x4, _cache64_sse2 );
#endif
            if( cpu&X264_CPU_SSE2_IS_FAST )
            {
                pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
                pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
            }
        }
    }

    if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
    {
        pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
        pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_sse2;
        pixf->sad_x3[PIXEL_8x4]  = x264_pixel_sad_x3_8x4_sse2;
        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
        pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_sse2;
        pixf->sad_x4[PIXEL_8x4]  = x264_pixel_sad_x4_8x4_sse2;
    }

    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
    {
        INIT2( sad, _sse3 );
        INIT2( sad_x3, _sse3 );
        INIT2( sad_x4, _sse3 );
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        INIT4( hadamard_ac, _ssse3 );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_ssse3;
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3;
            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_ssse3;
#if ARCH_X86_64
            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
#endif
        }
        INIT_ADS( _ssse3 );
        if( cpu&X264_CPU_SLOW_ATOM )
        {
            pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_ssse3_atom;
            pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_ssse3_atom;
            INIT6( satd, _ssse3_atom );
            pixf->satd[PIXEL_4x16]  = x264_pixel_satd_4x16_ssse3_atom;
            INIT6( satd_x3, _ssse3_atom );
            INIT6( satd_x4, _ssse3_atom );
            INIT4( hadamard_ac, _ssse3_atom );
#if ARCH_X86_64
            pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom;
#endif
        }
        else
        {
            INIT8( ssd, _ssse3 );
            pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_ssse3;
            pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_ssse3;
            INIT8( satd, _ssse3 );
            INIT7( satd_x3, _ssse3 );
            INIT7( satd_x4, _ssse3 );
#if ARCH_X86_64
            pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
#endif
        }
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
        if( !(cpu&X264_CPU_SLOW_PSHUFB) )
            pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_ssse3;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
        pixf->asd8 = x264_pixel_asd8_ssse3;
        if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT2( sad, _cache64_ssse3 );
            INIT2( sad_x3, _cache64_ssse3 );
            INIT2( sad_x4, _cache64_ssse3 );
        }
        else
        {
            INIT2( sad_x3, _ssse3 );
            INIT5( sad_x4, _ssse3 );
        }
        if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
        {
            INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
        }
    }

    if( cpu&X264_CPU_SSE4 )
    {
        INIT8( satd, _sse4 );
        INIT7( satd_x3, _sse4 );
        INIT7( satd_x4, _sse4 );
        INIT4( hadamard_ac, _sse4 );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_sse4;
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4;
            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_sse4;
#if ARCH_X86_64
            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_sse4;
#endif
        }
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse4;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse4;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
#endif
    }

    if( cpu&X264_CPU_AVX )
    {
        INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs don't benefit from an aligned version */
        INIT2( sad_x3, _avx );
        INIT2( sad_x4, _avx );
        INIT8( satd, _avx );
        INIT7( satd_x3, _avx );
        INIT7( satd_x4, _avx );
        INIT_ADS( _avx );
        INIT4( hadamard_ac, _avx );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            pixf->intra_sad_x9_4x4  = x264_intra_sad_x9_4x4_avx;
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
            pixf->intra_sad_x9_8x8  = x264_intra_sad_x9_8x8_avx;
#if ARCH_X86_64
            pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_avx;
#endif
        }
        INIT5( ssd, _avx );
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_avx;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_avx;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
        pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
#endif
    }

    if( cpu&X264_CPU_XOP )
    {
        INIT7( satd, _xop );
        INIT7( satd_x3, _xop );
        INIT7( satd_x4, _xop );
        INIT4( hadamard_ac, _xop );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
        }
        INIT5( ssd, _xop );
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_xop;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_xop;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_xop;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
    }

    if( cpu&X264_CPU_AVX2 )
    {
        INIT2( ssd, _avx2 );
        INIT2( sad_x3, _avx2 );
        INIT2( sad_x4, _avx2 );
        INIT4( satd, _avx2 );
        INIT2( hadamard_ac, _avx2 );
        INIT_ADS( _avx2 );
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx2;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_avx2;
        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2;
        pixf->intra_sad_x9_8x8   = x264_intra_sad_x9_8x8_avx2;
        pixf->intra_sad_x3_8x8c  = x264_intra_sad_x3_8x8c_avx2;
        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
#if ARCH_X86_64
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
#endif
    }
#endif //HAVE_MMX

#if HAVE_ARMV6
    if( cpu&X264_CPU_ARMV6 )
    {
        pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
    }
    if( cpu&X264_CPU_NEON )
    {
        INIT5( sad, _neon );
        INIT5( sad_aligned, _neon );
        INIT7( sad_x3, _neon );
        INIT7( sad_x4, _neon );
        INIT7( ssd, _neon );
        INIT7( satd, _neon );
        INIT7( satd_x3, _neon );
        INIT7( satd_x4, _neon );
        INIT4( hadamard_ac, _neon );
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_neon;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_neon;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_neon;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
        pixf->vsad = x264_pixel_vsad_neon;
        pixf->asd8 = x264_pixel_asd8_neon;

        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_neon;
        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_neon;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_neon;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_neon;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_neon;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_neon;
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;

        pixf->ssd_nv12_core   = x264_pixel_ssd_nv12_core_neon;
        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
        pixf->ssim_end4       = x264_pixel_ssim_end4_neon;

        if( cpu&X264_CPU_FAST_NEON_MRC )
        {
            pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
            pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
            pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
            pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
        }
        else // really just scheduled for dual issue / A8
        {
            INIT5( sad_aligned, _neon_dual );
        }
    }
#endif

#if ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        INIT8( sad, _neon );
        // AArch64 has no distinct instructions for aligned load/store
        INIT8_NAME( sad_aligned, sad, _neon );
        INIT7( sad_x3, _neon );
        INIT7( sad_x4, _neon );
        INIT8( ssd, _neon );
        INIT8( satd, _neon );
        INIT7( satd_x3, _neon );
        INIT7( satd_x4, _neon );
        INIT4( hadamard_ac, _neon );

        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;

        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_neon;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_neon;
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_neon;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
        pixf->vsad = x264_pixel_vsad_neon;
        pixf->asd8 = x264_pixel_asd8_neon;

        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_neon;
        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_neon;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_neon;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_neon;
        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_neon;
        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_neon;
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;

        pixf->ssd_nv12_core   = x264_pixel_ssd_nv12_core_neon;
        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
        pixf->ssim_end4       = x264_pixel_ssim_end4_neon;
    }
#endif // ARCH_AARCH64

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        INIT8( sad, _msa );
        INIT8_NAME( sad_aligned, sad, _msa );
        INIT8( ssd, _msa );
        INIT7( sad_x3, _msa );
        INIT7( sad_x4, _msa );
        INIT8( satd, _msa );
        INIT4( hadamard_ac, _msa );

        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_msa;
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_msa;
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_msa;
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_msa;
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_msa;
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_msa;
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_msa;
        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_msa;

        pixf->ssim_4x4x2_core = x264_ssim_4x4x2_core_msa;

        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa;
        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_msa;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_msa;
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_msa;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
    }
#endif // HAVE_MSA

#endif // HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        x264_pixel_altivec_init( pixf );
    }
#endif

    pixf->ads[PIXEL_8x16] =
    pixf->ads[PIXEL_8x4] =
    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
}