/*****************************************************************************
 * mc-c.c: x86 motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Laurent Aimar <[email protected]>
 *          Loren Merritt <[email protected]>
 *          Fiona Glaser <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "common/common.h"
#include "mc.h"

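/* This file supplies the C-side glue for x264's x86 motion compensation:
 * prototypes for the hand-written assembly (pixel averaging, weighted
 * prediction, plane copies, half-pel filtering, lowres frame init, ...),
 * small dispatch tables and wrapper functions built around them, and
 * x264_mc_init_mmx(), which installs the best available variants into
 * x264_mc_functions_t based on the runtime CPU flags. */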
#define DECL_SUF( func, args )\
    void func##_mmx2 args;\
    void func##_sse2 args;\
    void func##_ssse3 args;\
    void func##_avx2 args;

DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_8x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_8x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_8x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x2, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))

#define MC_WEIGHT(w,type) \
    void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );

#define MC_WEIGHT_OFFSET(w,type) \
    void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
    void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
    MC_WEIGHT(w,type)

MC_WEIGHT_OFFSET( 4, mmx2 )
MC_WEIGHT_OFFSET( 8, mmx2 )
MC_WEIGHT_OFFSET( 12, mmx2 )
MC_WEIGHT_OFFSET( 16, mmx2 )
MC_WEIGHT_OFFSET( 20, mmx2 )
MC_WEIGHT_OFFSET( 12, sse2 )
MC_WEIGHT_OFFSET( 16, sse2 )
MC_WEIGHT_OFFSET( 20, sse2 )
#if HIGH_BIT_DEPTH
MC_WEIGHT_OFFSET( 8, sse2 )
#endif
MC_WEIGHT( 8, sse2 )
MC_WEIGHT( 4, ssse3 )
MC_WEIGHT( 8, ssse3 )
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
MC_WEIGHT( 8, avx2 )
MC_WEIGHT( 16, avx2 )
MC_WEIGHT( 20, avx2 )
#undef MC_WEIGHT_OFFSET
#undef MC_WEIGHT

void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
                                          pixel *srcu, intptr_t i_srcu,
                                          pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
                                   pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
                                       pixel *dstv, intptr_t i_dstv,
                                       pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
                                        pixel *dstv, intptr_t i_dstv,
                                        pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
                                         uint8_t *dstv, intptr_t i_dstv,
                                         uint8_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
                                       uint16_t *dstv, intptr_t i_dstv,
                                       uint16_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
                                              uint16_t *dstv, intptr_t i_dstv,
                                              uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
                                             uint16_t *dstv, intptr_t i_dstv,
                                             uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
                                              uint16_t *dstv, intptr_t i_dstv,
                                              uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fenc_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_memzero_aligned_avx( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );

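/* Per-ISA prototypes for the chroma MC kernel and for the lowres
 * (half-resolution) plane initialisation used by the lookahead. */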
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
                           int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(avx)
MC_CHROMA(avx2)

#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
LOWRES(mmx2)
LOWRES(cache32_mmx2)
LOWRES(sse2)
LOWRES(ssse3)
LOWRES(avx)
LOWRES(xop)
LOWRES(avx2)

#define PIXEL_AVG_W(width,cpu)\
void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);

PIXEL_AVG_WALL(mmx2)
PIXEL_AVG_WALL(cache32_mmx2)
PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(cache64_ssse3)
PIXEL_AVG_WALL(avx2)

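/* Function tables indexed by i_width>>2: entry 0 is unused and entries 1..5
 * dispatch to the 4/8/12/16/20-wide kernels declared above.  A few of the
 * referenced names don't exist as real functions and are #define'd to wider
 * or narrower variants below. */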
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
{\
    NULL,\
    x264_pixel_avg2_w4_##name1,\
    x264_pixel_avg2_w8_##name2,\
    x264_pixel_avg2_w12_##name3,\
    x264_pixel_avg2_w16_##name4,\
    x264_pixel_avg2_w20_##name5,\
};

#if HIGH_BIT_DEPTH
/* we can replace w12/w20 with w10/w18 as only 9/17 pixels in fact are important */
#define x264_pixel_avg2_w12_mmx2 x264_pixel_avg2_w10_mmx2
#define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2
#define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2
#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2
#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2
#else
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w16_sse2
#endif // HIGH_BIT_DEPTH

PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2)
#else // !HIGH_BIT_DEPTH
#if ARCH_X86
PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
#endif
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2)
#endif // HIGH_BIT_DEPTH

#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\
{\
    NULL,\
    x264_mc_copy_w4_##name1,\
    x264_mc_copy_w8_##name2,\
    NULL,\
    x264_mc_copy_w16_##name3,\
};

MC_COPY_WTAB(mmx,mmx,mmx,mmx)
#if HIGH_BIT_DEPTH
MC_COPY_WTAB(sse,mmx,sse,sse)
MC_COPY_WTAB(avx,mmx,sse,avx)
#else
MC_COPY_WTAB(sse,mmx,mmx,sse)
#endif

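/* Same scheme for weighted prediction: one table per operation
 * (weight/offsetadd/offsetsub) and instruction set, again indexed by
 * i_width>>2.  The w12version argument picks whether the width-12 slot uses a
 * dedicated 12-wide kernel or falls back to the 16-wide one. */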
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
{\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w8_##name2,\
    x264_mc_##function##_w##w12version##_##instr,\
    x264_mc_##function##_w16_##instr,\
    x264_mc_##function##_w20_##instr,\
};

#if HIGH_BIT_DEPTH
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)

static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        for( int i = 0; i < 8; i++ )
            w->cachea[i] = abs(w->i_offset<<(BIT_DEPTH-8));
        return;
    }
    w->weightfn = h->mc.weight;
    int den1 = 1<<w->i_denom;
    int den2 = w->i_scale<<1;
    int den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1));
    for( int i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = i&1 ? den3 : den2;
    }
}
#else
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16)

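/* Precompute the per-lane constants the weighting asm expects (8-bit depth).
 * With cachea[i] = i_scale and cacheb[i] = 1<<(i_denom-1) | i_offset<<i_denom,
 * a multiply-add of the form (pix*scale + cacheb) >> denom equals the H.264
 * weighting formula ((pix*scale + (1<<(denom-1))) >> denom) + offset.  When
 * scale == 1<<denom only the offset matters, so the cheaper offsetadd/offsetsub
 * kernels are used instead.  (How the asm actually consumes these lanes lives
 * in the .asm files; the identity above is just the intent of the packing.) */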
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    int i;
    int16_t den1;

    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = 1 << (w->i_denom - 1) | w->i_offset << w->i_denom;
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = w->i_scale;
        w->cacheb[i] = den1;
    }
}

static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
{
    int i, den1;
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;

        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = w->i_scale << (8 - w->i_denom);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = w->i_offset;
    }
}
#endif // !HIGH_BIT_DEPTH

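/* mc_luma: produce a luma prediction block for a quarter-pel motion vector.
 * qpel_idx = ((mvy&3)<<2) + (mvx&3) indexes the x264_hpel_ref0/ref1 tables,
 * which select among the four pre-filtered planes in src[] (full, h, v, c).
 * qpel_idx & 5 is nonzero exactly when mvx or mvy has an odd (quarter-pel)
 * fraction; e.g. mvx=1, mvy=2 gives qpel_idx = (2<<2)+1 = 9, so two half-pel
 * planes are averaged with pixel_avg2.  Otherwise the block is a plain copy,
 * and weighted prediction is applied on top when weight->weightfn is set. */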
#define MC_LUMA(name,instr1,instr2)\
static void mc_luma_##name( pixel *dst, intptr_t i_dst_stride,\
                            pixel *src[4], intptr_t i_src_stride,\
                            int mvx, int mvy,\
                            int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        x264_pixel_avg_wtab_##instr1[i_width>>2](\
                dst, i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
    }\
    else if( weight->weightfn )\
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
    else\
        x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
}

MC_LUMA(mmx2,mmx2,mmx)
MC_LUMA(sse2,sse2,sse)
#if HIGH_BIT_DEPTH
MC_LUMA(avx2,avx2,avx)
#else
#if ARCH_X86
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
#endif // !HIGH_BIT_DEPTH

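/* get_ref is the zero-copy variant of mc_luma: when no interpolation or
 * weighting is needed it returns a pointer straight into the reference plane
 * and reports that plane's stride through *i_dst_stride instead of copying. */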
#define GET_REF(name)\
static pixel *get_ref_##name( pixel *dst, intptr_t *i_dst_stride,\
                              pixel *src[4], intptr_t i_src_stride,\
                              int mvx, int mvy,\
                              int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        x264_pixel_avg_wtab_##name[i_width>>2](\
                dst, *i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
        return dst;\
    }\
    else if( weight->weightfn )\
    {\
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
        return dst;\
    }\
    else\
    {\
        *i_dst_stride = i_src_stride;\
        return src1;\
    }\
}

GET_REF(mmx2)
GET_REF(sse2)
GET_REF(avx2)
#if !HIGH_BIT_DEPTH
#if ARCH_X86
GET_REF(cache32_mmx2)
GET_REF(cache64_mmx2)
#endif
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
#endif // !HIGH_BIT_DEPTH

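/* hpel_filter: build the three half-pel planes (h, v, c) for a whole plane.
 * The wrapper rounds src down to the alignment the kernels require (widening
 * the row accordingly) and runs the vertical, centre and horizontal filters
 * row by row; the final x264_sfence() is presumably there to order the
 * kernels' non-temporal stores. */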
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width);\
void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\
void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\
static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
                                    intptr_t stride, int width, int height, int16_t *buf )\
{\
    intptr_t realign = (intptr_t)src & (align-1);\
    src -= realign;\
    dstv -= realign;\
    dstc -= realign;\
    dsth -= realign;\
    width += realign;\
    while( height-- )\
    {\
        x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\
        x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\
        x264_hpel_filter_h_##cpuh( dsth, src, width );\
        dsth += stride;\
        dstv += stride;\
        dstc += stride;\
        src += stride;\
    }\
    x264_sfence();\
}

HPEL(8, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
HPEL(16, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
HPEL(16, sse2_amd, mmx2, mmx2, sse2)
#if ARCH_X86_64
void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, avx)
HPEL(32, avx2, avx2, avx2, avx2)
#endif
#endif // HIGH_BIT_DEPTH

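/* plane_copy and friends: the SIMD cores need the width rounded up to a whole
 * number of vectors, which makes them overread/overwrite up to c_w pixels per
 * row.  That is fine for every row except the last one in memory order, so the
 * wrappers run the core on h-1 rows and finish the remaining row with memcpy,
 * a scalar loop, or the plain C fallback. */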
#define PLANE_COPY(align, cpu)\
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
    int c_w = (align) / sizeof(pixel) - 1;\
    if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
    else if( !(w&c_w) )\
        x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
    else\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
        memcpy( dst, src, w*sizeof(pixel) );\
    }\
}

PLANE_COPY(16, sse)
PLANE_COPY(32, avx)

#define PLANE_COPY_SWAP(align, cpu)\
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
    int c_w = (align>>1) / sizeof(pixel) - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
    else if( w > c_w )\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
        for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
        {\
            dst[x] = src[x+1];\
            dst[x+1] = src[x];\
        }\
    }\
    else\
        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}

PLANE_COPY_SWAP(16, ssse3)
PLANE_COPY_SWAP(32, avx2)

#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
                                              pixel *srcu, intptr_t i_srcu,\
                                              pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
    int c_w = 16 / sizeof(pixel) - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
    else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
    {\
        if( --h > 0 )\
        {\
            if( i_srcu > 0 )\
            {\
                x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                srcu += i_srcu * h;\
                srcv += i_srcv * h;\
            }\
            else\
                x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
    }\
    else\
        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
}

PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
PLANE_INTERLEAVE(avx)
#endif

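/* Saturating 16-bit adds via inline asm (paddsw).  MC_CLIP_ADD clips a single
 * value, MC_CLIP_ADD2 two adjacent int16_t at once; they are presumably picked
 * up by the PROPAGATE_LIST() expansions below for the mbtree propagate-list
 * functions. */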
#if HAVE_X86_INLINE_ASM
#undef MC_CLIP_ADD
#define MC_CLIP_ADD(s,x)\
do\
{\
    int temp;\
    asm("movd %0, %%xmm0 \n"\
        "movd %2, %%xmm1 \n"\
        "paddsw %%xmm1, %%xmm0 \n"\
        "movd %%xmm0, %1 \n"\
        :"+m"(s), "=&r"(temp)\
        :"m"(x)\
    );\
    s = temp;\
} while(0)

#undef MC_CLIP_ADD2
#define MC_CLIP_ADD2(s,x)\
do\
{\
    asm("movd %0, %%xmm0 \n"\
        "movd %1, %%xmm1 \n"\
        "paddsw %%xmm1, %%xmm0 \n"\
        "movd %%xmm0, %0 \n"\
        :"+m"(M32(s))\
        :"m"(M32(x))\
    );\
} while(0)
#endif

PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)

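/* Runtime dispatch: fill the mc function-pointer table according to the CPU
 * flags.  Assignments run from the least to the most capable instruction set,
 * with early returns once the detected feature level is exhausted, so later
 * (faster) implementations simply overwrite the earlier ones. */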
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
    if( !(cpu&X264_CPU_MMX) )
        return;

    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;

    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;

    pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
    pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
    pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
    pf->memcpy_aligned = x264_memcpy_aligned_mmx;
    pf->memzero_aligned = x264_memzero_aligned_mmx;
    pf->integral_init4v = x264_integral_init4v_mmx;
    pf->integral_init8v = x264_integral_init8v_mmx;

    if( !(cpu&X264_CPU_MMX2) )
        return;

    pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
    pf->prefetch_ref = x264_prefetch_ref_mmx2;

    pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
    pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
    pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmx2;
    pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmx2;
    pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmx2;
    pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmx2;
    pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_mmx2;
    pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmx2;
    pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmx2;
    pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmx2;

    pf->mc_luma = mc_luma_mmx2;
    pf->get_ref = get_ref_mmx2;
    pf->mc_chroma = x264_mc_chroma_mmx2;
    pf->hpel_filter = x264_hpel_filter_mmx2;
    pf->weight = x264_mc_weight_wtab_mmx2;
    pf->weight_cache = x264_weight_cache_mmx2;
    pf->offsetadd = x264_mc_offsetadd_wtab_mmx2;
    pf->offsetsub = x264_mc_offsetsub_wtab_mmx2;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;

    if( cpu&X264_CPU_SSE )
    {
        pf->memcpy_aligned = x264_memcpy_aligned_sse;
        pf->memzero_aligned = x264_memzero_aligned_sse;
        pf->plane_copy = x264_plane_copy_sse;
    }

#if HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
#endif

    if( !(cpu&X264_CPU_SSE2) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;

    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;

    pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;

    if( cpu&X264_CPU_SSE2_IS_FAST )
    {
        pf->get_ref = get_ref_sse2;
        pf->mc_luma = mc_luma_sse2;
        pf->hpel_filter = x264_hpel_filter_sse2;
    }

    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
    pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;

    if( cpu&X264_CPU_SSE2_IS_SLOW )
        return;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
    pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
    pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
    pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
    pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
    pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sse2;
    pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_sse2;
    pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sse2;
    pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sse2;

    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
    pf->weight = x264_mc_weight_wtab_sse2;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_sse2;

    if( !(cpu&X264_CPU_SSSE3) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;

    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
        pf->integral_init4v = x264_integral_init4v_ssse3;

    if( !(cpu&X264_CPU_AVX) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
    pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
    pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;

    if( cpu&X264_CPU_XOP )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;

    if( cpu&X264_CPU_AVX2 )
    {
        pf->mc_luma = mc_luma_avx2;
        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
    }
#else // !HIGH_BIT_DEPTH

#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    if( cpu&X264_CPU_CACHELINE_32 )
    {
        pf->mc_luma = mc_luma_cache32_mmx2;
        pf->get_ref = get_ref_cache32_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
    }
    else if( cpu&X264_CPU_CACHELINE_64 )
    {
        pf->mc_luma = mc_luma_cache64_mmx2;
        pf->get_ref = get_ref_cache64_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
    }
#endif

    if( !(cpu&X264_CPU_SSE2) )
        return;

    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
    pf->hpel_filter = x264_hpel_filter_sse2_amd;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;

    if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        pf->weight = x264_mc_weight_wtab_sse2;
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
            pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
        }

        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
        pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
        pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
        pf->hpel_filter = x264_hpel_filter_sse2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
        if( !(cpu&X264_CPU_STACK_MOD4) )
            pf->mc_chroma = x264_mc_chroma_sse2;

        if( cpu&X264_CPU_SSE2_IS_FAST )
        {
            pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
            pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
            pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
            pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
            pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
            pf->mc_luma = mc_luma_sse2;
            pf->get_ref = get_ref_sse2;
            if( cpu&X264_CPU_CACHELINE_64 )
            {
                pf->mc_luma = mc_luma_cache64_sse2;
                pf->get_ref = get_ref_cache64_sse2;
            }
        }
    }

    if( !(cpu&X264_CPU_SSSE3) )
        return;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
    pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_ssse3;
    pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_ssse3;
    pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_ssse3;
    pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_ssse3;
    pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_ssse3;
    pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3;
    pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
    pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;

    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
    {
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
    }

    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
    {
#if ARCH_X86_64
        if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */
#endif
            pf->hpel_filter = x264_hpel_filter_ssse3;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
    }
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_ssse3;

    if( cpu&X264_CPU_CACHELINE_64 )
    {
        if( !(cpu&X264_CPU_STACK_MOD4) )
            pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
        pf->mc_luma = mc_luma_cache64_ssse3;
        pf->get_ref = get_ref_cache64_ssse3;
        if( cpu&X264_CPU_SLOW_ATOM )
        {
            pf->mc_luma = mc_luma_cache64_ssse3_atom;
            pf->get_ref = get_ref_cache64_ssse3_atom;
        }
    }

    pf->weight_cache = x264_weight_cache_ssse3;
    pf->weight = x264_mc_weight_wtab_ssse3;

    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
        pf->integral_init4v = x264_integral_init4v_ssse3;

    if( !(cpu&X264_CPU_SSE4) )
        return;

    pf->integral_init4h = x264_integral_init4h_sse4;
    pf->integral_init8h = x264_integral_init8h_sse4;

    if( !(cpu&X264_CPU_AVX) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
    pf->integral_init8h = x264_integral_init8h_avx;
    pf->hpel_filter = x264_hpel_filter_avx;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;

    if( cpu&X264_CPU_XOP )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;

    if( cpu&X264_CPU_AVX2 )
    {
        pf->hpel_filter = x264_hpel_filter_avx2;
        pf->mc_chroma = x264_mc_chroma_avx2;
        pf->weight = x264_mc_weight_wtab_avx2;
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2;
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx2;
        pf->integral_init8v = x264_integral_init8v_avx2;
        pf->integral_init4v = x264_integral_init4v_avx2;
        pf->integral_init8h = x264_integral_init8h_avx2;
        pf->integral_init4h = x264_integral_init4h_avx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
    }
#endif // HIGH_BIT_DEPTH

    if( !(cpu&X264_CPU_AVX) )
        return;
    pf->memzero_aligned = x264_memzero_aligned_avx;
    pf->plane_copy = x264_plane_copy_avx;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;

    if( cpu&X264_CPU_FMA4 )
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;

    if( !(cpu&X264_CPU_AVX2) )
        return;
    pf->plane_copy_swap = x264_plane_copy_swap_avx2;
    pf->get_ref = get_ref_avx2;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
}