CoCalc -- mc-c.c

05. Matplotlib / ffmpeg-3.0 / libx264 / common / x86 / mc-c.c
⁵²⁸⁶⁷ views
1
/*****************************************************************************
 * mc-c.c: x86 motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Laurent Aimar <[email protected]>
 *          Loren Merritt <[email protected]>
 *          Fiona Glaser <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/
27

28
#include <stdlib.h>
29
#include <stdio.h>
30
#include <string.h>
31

32
#include "common/common.h"
33
#include "mc.h"
34

35
#define DECL_SUF( func, args )\
36
    void func##_mmx2 args;\
37
    void func##_sse2 args;\
38
    void func##_ssse3 args;\
39
    void func##_avx2 args;
40

41
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
42
DECL_SUF( x264_pixel_avg_16x8,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
43
DECL_SUF( x264_pixel_avg_8x16,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
44
DECL_SUF( x264_pixel_avg_8x8,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
45
DECL_SUF( x264_pixel_avg_8x4,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
46
DECL_SUF( x264_pixel_avg_4x16,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
47
DECL_SUF( x264_pixel_avg_4x8,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
48
DECL_SUF( x264_pixel_avg_4x4,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
49
DECL_SUF( x264_pixel_avg_4x2,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
50

51
#define MC_WEIGHT(w,type) \
52
    void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
53

54
#define MC_WEIGHT_OFFSET(w,type) \
55
    void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
56
    void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
57
    MC_WEIGHT(w,type)
58

59
MC_WEIGHT_OFFSET( 4, mmx2 )
60
MC_WEIGHT_OFFSET( 8, mmx2 )
61
MC_WEIGHT_OFFSET( 12, mmx2 )
62
MC_WEIGHT_OFFSET( 16, mmx2 )
63
MC_WEIGHT_OFFSET( 20, mmx2 )
64
MC_WEIGHT_OFFSET( 12, sse2 )
65
MC_WEIGHT_OFFSET( 16, sse2 )
66
MC_WEIGHT_OFFSET( 20, sse2 )
67
#if HIGH_BIT_DEPTH
68
MC_WEIGHT_OFFSET( 8, sse2 )
69
#endif
70
MC_WEIGHT( 8, sse2  )
71
MC_WEIGHT( 4, ssse3 )
72
MC_WEIGHT( 8, ssse3 )
73
MC_WEIGHT( 12, ssse3 )
74
MC_WEIGHT( 16, ssse3 )
75
MC_WEIGHT( 20, ssse3 )
76
MC_WEIGHT( 8, avx2 )
77
MC_WEIGHT( 16, avx2 )
78
MC_WEIGHT( 20, avx2 )
79
#undef MC_OFFSET
80
#undef MC_WEIGHT
81

82
/* Block-copy and prefetch routines, defined outside this file. */
void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
93
/* Whole-plane copy, byte-swap, interleave and deinterleave routines, defined
 * outside this file. The *_core_* variants are wrapped by the PLANE_COPY*,
 * PLANE_INTERLEAVE macros below; the *_c variants are the fallbacks those
 * wrappers call. */
void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_core_sse2( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_core_avx( pixel *dst,  intptr_t i_dst,
                                          pixel *srcu, intptr_t i_srcu,
                                          pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
                                   pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
                                       pixel *dstv, intptr_t i_dstv,
                                       pixel *src,  intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
                                        pixel *dstv, intptr_t i_dstv,
                                        pixel *src,  intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
                                         uint8_t *dstv, intptr_t i_dstv,
                                         uint8_t *src,  intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
                                       uint16_t *dstv, intptr_t i_dstv,
                                       uint16_t *src,  intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
                                              uint16_t *dstv, intptr_t i_dstv,
                                              uint32_t *src,  intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx  ( uint16_t *dstu, intptr_t i_dstu,
                                              uint16_t *dstv, intptr_t i_dstv,
                                              uint32_t *src,  intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
                                              uint16_t *dstv, intptr_t i_dstv,
                                              uint32_t *src,  intptr_t i_src, int w, int h );
140
/* Chroma interleaving store / deinterleaving load routines, defined outside
 * this file. */
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fenc_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
151
/* Aligned memcpy/memzero and integral-image initialisation routines, defined
 * outside this file. */
void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_memzero_aligned_avx( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4v_mmx  ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
168
/* MB-tree cost propagation routines, defined outside this file. */
void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
176

177
#define MC_CHROMA(cpu)\
178
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
179
                           int dx, int dy, int i_width, int i_height );
180
MC_CHROMA(mmx2)
181
MC_CHROMA(sse2)
182
MC_CHROMA(ssse3)
183
MC_CHROMA(ssse3_cache64)
184
MC_CHROMA(avx)
185
MC_CHROMA(avx2)
186

187
#define LOWRES(cpu)\
188
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
189
                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
190
LOWRES(mmx2)
191
LOWRES(cache32_mmx2)
192
LOWRES(sse2)
193
LOWRES(ssse3)
194
LOWRES(avx)
195
LOWRES(xop)
196
LOWRES(avx2)
197

198
#define PIXEL_AVG_W(width,cpu)\
199
void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
200
/* This declares some functions that don't exist, but that isn't a problem. */
201
#define PIXEL_AVG_WALL(cpu)\
202
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
203

204
PIXEL_AVG_WALL(mmx2)
205
PIXEL_AVG_WALL(cache32_mmx2)
206
PIXEL_AVG_WALL(cache64_mmx2)
207
PIXEL_AVG_WALL(cache64_sse2)
208
PIXEL_AVG_WALL(sse2)
209
PIXEL_AVG_WALL(cache64_ssse3)
210
PIXEL_AVG_WALL(avx2)
211

212
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
213
static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
214
{\
215
    NULL,\
216
    x264_pixel_avg2_w4_##name1,\
217
    x264_pixel_avg2_w8_##name2,\
218
    x264_pixel_avg2_w12_##name3,\
219
    x264_pixel_avg2_w16_##name4,\
220
    x264_pixel_avg2_w20_##name5,\
221
};
222

223
#if HIGH_BIT_DEPTH
224
/* we can replace w12/w20 with w10/w18 as only 9/17 pixels in fact are important */
225
#define x264_pixel_avg2_w12_mmx2       x264_pixel_avg2_w10_mmx2
226
#define x264_pixel_avg2_w20_mmx2       x264_pixel_avg2_w18_mmx2
227
#define x264_pixel_avg2_w12_sse2         x264_pixel_avg2_w10_sse2
228
#define x264_pixel_avg2_w20_sse2         x264_pixel_avg2_w18_sse2
229
#define x264_pixel_avg2_w12_avx2         x264_pixel_avg2_w16_avx2
230
#define x264_pixel_avg2_w20_avx2         x264_pixel_avg2_w18_avx2
231
#else
232
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
233
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
234
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
235
#define x264_pixel_avg2_w12_sse3         x264_pixel_avg2_w16_sse3
236
#define x264_pixel_avg2_w12_sse2         x264_pixel_avg2_w16_sse2
237
#endif // HIGH_BIT_DEPTH
238

239
PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
240
#if HIGH_BIT_DEPTH
241
PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
242
PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2)
243
#else // !HIGH_BIT_DEPTH
244
#if ARCH_X86
245
PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
246
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
247
#endif
248
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
249
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
250
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
251
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
252
PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2)
253
#endif // HIGH_BIT_DEPTH
254

255
#define MC_COPY_WTAB(instr, name1, name2, name3)\
256
static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\
257
{\
258
    NULL,\
259
    x264_mc_copy_w4_##name1,\
260
    x264_mc_copy_w8_##name2,\
261
    NULL,\
262
    x264_mc_copy_w16_##name3,\
263
};
264

265
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
266
#if HIGH_BIT_DEPTH
267
MC_COPY_WTAB(sse,mmx,sse,sse)
268
MC_COPY_WTAB(avx,mmx,sse,avx)
269
#else
270
MC_COPY_WTAB(sse,mmx,mmx,sse)
271
#endif
272

273
/* Defines x264_mc_<function>_wtab_<instr>: a six-entry table of weighting
 * routines. Slots 0 and 1 both hold the w4 routine (from name1), slot 2 the
 * w8 routine (from name2), slot 3 the routine of width `w12version`, and
 * slots 4 and 5 the w16 and w20 routines of `instr`. */
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
{\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w8_##name2,\
    x264_mc_##function##_w##w12version##_##instr,\
    x264_mc_##function##_w16_##instr,\
    x264_mc_##function##_w20_##instr,\
};
283

284
#if HIGH_BIT_DEPTH
285
/* High-bit-depth weight / offset tables. */
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)
291

292
/* High-bit-depth variant: selects the weighting function table for `w` and
 * precomputes the constants stored in w->cachea / w->cacheb.
 *
 * When the scale equals 1<<denom, only an offset is applied: weightfn becomes
 * h->mc.offsetsub for negative offsets and h->mc.offsetadd otherwise, and
 * cachea holds the offset magnitude scaled to the working bit depth.
 * Otherwise weightfn becomes h->mc.weight, cachea holds 1<<denom, and cacheb
 * alternates (2*scale, 1 + offset scaled by 2^(BIT_DEPTH-8+1)). */
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        /* Take abs() before shifting: left-shifting a negative value is
         * undefined behaviour in C. The result is unchanged. */
        for( int i = 0; i < 8; i++ )
            w->cachea[i] = abs(w->i_offset)<<(BIT_DEPTH-8);
        return;
    }
    w->weightfn = h->mc.weight;
    int den1 = 1<<w->i_denom;
    int den2 = w->i_scale<<1;
    /* Multiply instead of shifting, since i_offset may be negative. */
    int den3 = 1+w->i_offset*(1<<(BIT_DEPTH-8+1));
    for( int i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = i&1 ? den3 : den2;
    }
}
314
#else
315
/* 8-bit weight / offset tables. */
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16)
323

324
/* 8-bit variant: selects the weighting function table for `w` and
 * precomputes the constants stored in w->cachea / w->cacheb.
 *
 * When the scale equals 1<<denom, only an offset is applied: weightfn becomes
 * h->mc.offsetsub for negative offsets and h->mc.offsetadd otherwise, and
 * every byte of cachea is set to abs(offset).
 * Otherwise weightfn becomes h->mc.weight, cachea holds the scale, and cacheb
 * holds the rounding term combined with the offset scaled by 2^denom
 * (truncated to 16 bits). */
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    int i;
    int16_t den1;

    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        /* memset is deliberate: the value is replicated per byte. */
        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    /* Rounding term is 2^(denom-1), or 0 when denom is 0 (a shift by -1 would
     * be undefined). The offset is multiplied rather than shifted because it
     * may be negative. */
    den1 = (w->i_denom ? 1 << (w->i_denom - 1) : 0) | w->i_offset * (1 << w->i_denom);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = w->i_scale;
        w->cacheb[i] = den1;
    }
}
346

347
/* 8-bit SSSE3 variant of the weight cache setup.
 *
 * When the scale equals 1<<denom, only an offset is applied: weightfn becomes
 * h->mc.offsetsub for negative offsets and h->mc.offsetadd otherwise, and
 * every byte of cachea is set to abs(offset).
 * Otherwise weightfn becomes h->mc.weight, cachea holds scale << (8 - denom)
 * and cacheb holds the unscaled offset. */
static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
{
    int i, den1;
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;

        /* memset is deliberate: the value is replicated per byte. */
        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = w->i_scale << (8 - w->i_denom);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = w->i_offset;
    }
}
368
#endif // !HIGH_BIT_DEPTH
369

370
#define MC_LUMA(name,instr1,instr2)\
371
static void mc_luma_##name( pixel *dst,    intptr_t i_dst_stride,\
372
                            pixel *src[4], intptr_t i_src_stride,\
373
                            int mvx, int mvy,\
374
                            int i_width, int i_height, const x264_weight_t *weight )\
375
{\
376
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
377
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
378
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
379
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
380
    {\
381
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
382
        x264_pixel_avg_wtab_##instr1[i_width>>2](\
383
                dst, i_dst_stride, src1, i_src_stride,\
384
                src2, i_height );\
385
        if( weight->weightfn )\
386
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
387
    }\
388
    else if( weight->weightfn )\
389
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
390
    else\
391
        x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
392
}
393

394
MC_LUMA(mmx2,mmx2,mmx)
395
MC_LUMA(sse2,sse2,sse)
396
#if HIGH_BIT_DEPTH
397
MC_LUMA(avx2,avx2,avx)
398
#else
399
#if ARCH_X86
400
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
401
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
402
#endif
403
MC_LUMA(cache64_sse2,cache64_sse2,sse)
404
MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
405
MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
406
#endif // !HIGH_BIT_DEPTH
407

408
#define GET_REF(name)\
409
static pixel *get_ref_##name( pixel *dst,   intptr_t *i_dst_stride,\
410
                              pixel *src[4], intptr_t i_src_stride,\
411
                              int mvx, int mvy,\
412
                              int i_width, int i_height, const x264_weight_t *weight )\
413
{\
414
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
415
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
416
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
417
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
418
    {\
419
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
420
        x264_pixel_avg_wtab_##name[i_width>>2](\
421
                dst, *i_dst_stride, src1, i_src_stride,\
422
                src2, i_height );\
423
        if( weight->weightfn )\
424
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
425
        return dst;\
426
    }\
427
    else if( weight->weightfn )\
428
    {\
429
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
430
        return dst;\
431
    }\
432
    else\
433
    {\
434
        *i_dst_stride = i_src_stride;\
435
        return src1;\
436
    }\
437
}
438

439
GET_REF(mmx2)
440
GET_REF(sse2)
441
GET_REF(avx2)
442
#if !HIGH_BIT_DEPTH
443
#if ARCH_X86
444
GET_REF(cache32_mmx2)
445
GET_REF(cache64_mmx2)
446
#endif
447
GET_REF(cache64_sse2)
448
GET_REF(cache64_ssse3)
449
GET_REF(cache64_ssse3_atom)
450
#endif // !HIGH_BIT_DEPTH
451

452
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
453
void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width);\
454
void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\
455
void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\
456
static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
457
                                    intptr_t stride, int width, int height, int16_t *buf )\
458
{\
459
    intptr_t realign = (intptr_t)src & (align-1);\
460
    src -= realign;\
461
    dstv -= realign;\
462
    dstc -= realign;\
463
    dsth -= realign;\
464
    width += realign;\
465
    while( height-- )\
466
    {\
467
        x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\
468
        x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\
469
        x264_hpel_filter_h_##cpuh( dsth, src, width );\
470
        dsth += stride;\
471
        dstv += stride;\
472
        dstc += stride;\
473
        src  += stride;\
474
    }\
475
    x264_sfence();\
476
}
477

478
HPEL(8, mmx2, mmx2, mmx2, mmx2)
479
#if HIGH_BIT_DEPTH
480
HPEL(16, sse2, sse2, sse2, sse2)
481
#else // !HIGH_BIT_DEPTH
482
HPEL(16, sse2_amd, mmx2, mmx2, sse2)
483
#if ARCH_X86_64
484
void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
485
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
486
void x264_hpel_filter_avx  ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
487
void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
488
#else
489
HPEL(16, sse2, sse2, sse2, sse2)
490
HPEL(16, ssse3, ssse3, ssse3, ssse3)
491
HPEL(16, avx, avx, avx, avx)
492
HPEL(32, avx2, avx2, avx2, avx2)
493
#endif
494
#endif // HIGH_BIT_DEPTH
495

496
#define PLANE_COPY(align, cpu)\
497
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
498
{\
499
    int c_w = (align) / sizeof(pixel) - 1;\
500
    if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
501
        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
502
    else if( !(w&c_w) )\
503
        x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
504
    else\
505
    {\
506
        if( --h > 0 )\
507
        {\
508
            if( i_src > 0 )\
509
            {\
510
                x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
511
                dst += i_dst * h;\
512
                src += i_src * h;\
513
            }\
514
            else\
515
                x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
516
        }\
517
        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
518
        memcpy( dst, src, w*sizeof(pixel) );\
519
    }\
520
}
521

522
PLANE_COPY(16, sse)
523
PLANE_COPY(32, avx)
524

525
#define PLANE_COPY_SWAP(align, cpu)\
526
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
527
{\
528
    int c_w = (align>>1) / sizeof(pixel) - 1;\
529
    if( !(w&c_w) )\
530
        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
531
    else if( w > c_w )\
532
    {\
533
        if( --h > 0 )\
534
        {\
535
            if( i_src > 0 )\
536
            {\
537
                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
538
                dst += i_dst * h;\
539
                src += i_src * h;\
540
            }\
541
            else\
542
                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
543
        }\
544
        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
545
        for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
546
        {\
547
            dst[x]   = src[x+1];\
548
            dst[x+1] = src[x];\
549
        }\
550
    }\
551
    else\
552
        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
553
}
554

555
PLANE_COPY_SWAP(16, ssse3)
556
PLANE_COPY_SWAP(32, avx2)
557

558
#define PLANE_INTERLEAVE(cpu) \
559
static void x264_plane_copy_interleave_##cpu( pixel *dst,  intptr_t i_dst,\
560
                                              pixel *srcu, intptr_t i_srcu,\
561
                                              pixel *srcv, intptr_t i_srcv, int w, int h )\
562
{\
563
    int c_w = 16 / sizeof(pixel) - 1;\
564
    if( !(w&c_w) )\
565
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
566
    else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
567
    {\
568
        if( --h > 0 )\
569
        {\
570
            if( i_srcu > 0 )\
571
            {\
572
                x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
573
                dst  += i_dst  * h;\
574
                srcu += i_srcu * h;\
575
                srcv += i_srcv * h;\
576
            }\
577
            else\
578
                x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
579
        }\
580
        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
581
    }\
582
    else\
583
        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
584
}
585

586
PLANE_INTERLEAVE(mmx2)
587
PLANE_INTERLEAVE(sse2)
588
#if HIGH_BIT_DEPTH
589
PLANE_INTERLEAVE(avx)
590
#endif
591

592
#if HAVE_X86_INLINE_ASM
/* MC_CLIP_ADD(s,x): s = signed-saturating 16-bit add of s and x (paddsw).
 * The operands are passed through int temporaries in registers, so the
 * 32-bit movd never reads beyond an `s` that is narrower than 32 bits, and
 * `x` may be any expression. Only the low 16 bits of the sum are meaningful;
 * assigning back to a 16-bit `s` keeps exactly those.
 * xmm0/xmm1 are listed as clobbers because the asm overwrites them. */
#undef MC_CLIP_ADD
#define MC_CLIP_ADD(s,x)\
do\
{\
    int temp_s = (s);\
    int temp_x = (x);\
    asm("movd       %0, %%xmm0     \n"\
        "movd       %1, %%xmm1     \n"\
        "paddsw %%xmm1, %%xmm0     \n"\
        "movd   %%xmm0, %0         \n"\
        :"+r"(temp_s)\
        :"r"(temp_x)\
        :"xmm0", "xmm1"\
    );\
    (s) = temp_s;\
} while(0)

/* MC_CLIP_ADD2(s,x): the same saturating add applied to two adjacent 16-bit
 * values at once, accessed as one 32-bit word through M32().
 * xmm0/xmm1 are listed as clobbers because the asm overwrites them. */
#undef MC_CLIP_ADD2
#define MC_CLIP_ADD2(s,x)\
do\
{\
    asm("movd       %0, %%xmm0     \n"\
        "movd       %1, %%xmm1     \n"\
        "paddsw %%xmm1, %%xmm0     \n"\
        "movd   %%xmm0, %0         \n"\
        :"+m"(M32(s))\
        :"m"(M32(x))\
        :"xmm0", "xmm1"\
    );\
} while(0)
#endif
621

622
PROPAGATE_LIST(ssse3)
623
PROPAGATE_LIST(avx)
624

625
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
626
{
627
    if( !(cpu&X264_CPU_MMX) )
628
        return;
629

630
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
631
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;
632

633
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
634

635
    pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
636
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
637
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
638
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
639
    pf->memcpy_aligned  = x264_memcpy_aligned_mmx;
640
    pf->memzero_aligned = x264_memzero_aligned_mmx;
641
    pf->integral_init4v = x264_integral_init4v_mmx;
642
    pf->integral_init8v = x264_integral_init8v_mmx;
643

644
    if( !(cpu&X264_CPU_MMX2) )
645
        return;
646

647
    pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2;
648
    pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
649
    pf->prefetch_ref  = x264_prefetch_ref_mmx2;
650

651
    pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
652
    pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;
653

654
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
655
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmx2;
656
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmx2;
657
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmx2;
658
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmx2;
659
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_mmx2;
660
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmx2;
661
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmx2;
662
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmx2;
663

664
    pf->mc_luma = mc_luma_mmx2;
665
    pf->get_ref = get_ref_mmx2;
666
    pf->mc_chroma = x264_mc_chroma_mmx2;
667
    pf->hpel_filter = x264_hpel_filter_mmx2;
668
    pf->weight = x264_mc_weight_wtab_mmx2;
669
    pf->weight_cache = x264_weight_cache_mmx2;
670
    pf->offsetadd = x264_mc_offsetadd_wtab_mmx2;
671
    pf->offsetsub = x264_mc_offsetsub_wtab_mmx2;
672

673
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;
674

675
    if( cpu&X264_CPU_SSE )
676
    {
677
        pf->memcpy_aligned  = x264_memcpy_aligned_sse;
678
        pf->memzero_aligned = x264_memzero_aligned_sse;
679
        pf->plane_copy = x264_plane_copy_sse;
680
    }
681

682
#if HIGH_BIT_DEPTH
683
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
684
    if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
685
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
686
#endif
687

688
    if( !(cpu&X264_CPU_SSE2) )
689
        return;
690

691
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
692

693
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
694
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
695

696
    pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
697
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
698

699
    if( cpu&X264_CPU_SSE2_IS_FAST )
700
    {
701
        pf->get_ref = get_ref_sse2;
702
        pf->mc_luma = mc_luma_sse2;
703
        pf->hpel_filter = x264_hpel_filter_sse2;
704
    }
705

706
    pf->integral_init4v = x264_integral_init4v_sse2;
707
    pf->integral_init8v = x264_integral_init8v_sse2;
708
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
709
    pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
710
    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
711
    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
712

713
    if( cpu&X264_CPU_SSE2_IS_SLOW )
714
        return;
715

716
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
717
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
718
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_sse2;
719
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_sse2;
720
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_sse2;
721
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sse2;
722
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sse2;
723
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sse2;
724
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sse2;
725

726
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
727
    pf->weight = x264_mc_weight_wtab_sse2;
728

729
    if( !(cpu&X264_CPU_STACK_MOD4) )
730
        pf->mc_chroma = x264_mc_chroma_sse2;
731

732
    if( !(cpu&X264_CPU_SSSE3) )
733
        return;
734

735
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
736
    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
737
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
738
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
739

740
    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
741
        pf->integral_init4v = x264_integral_init4v_ssse3;
742

743
    if( !(cpu&X264_CPU_AVX) )
744
        return;
745

746
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
747
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
748
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
749
    pf->plane_copy_interleave        = x264_plane_copy_interleave_avx;
750
    pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_avx;
751
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
752
    pf->store_interleave_chroma      = x264_store_interleave_chroma_avx;
753
    pf->copy[PIXEL_16x16]            = x264_mc_copy_w16_aligned_avx;
754

755
    if( !(cpu&X264_CPU_STACK_MOD4) )
756
        pf->mc_chroma = x264_mc_chroma_avx;
757

758
    if( cpu&X264_CPU_XOP )
759
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
760

761
    if( cpu&X264_CPU_AVX2 )
762
    {
763
        pf->mc_luma = mc_luma_avx2;
764
        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
765
    }
766
#else // !HIGH_BIT_DEPTH
767

768
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
769
    if( cpu&X264_CPU_CACHELINE_32 )
770
    {
771
        pf->mc_luma = mc_luma_cache32_mmx2;
772
        pf->get_ref = get_ref_cache32_mmx2;
773
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
774
    }
775
    else if( cpu&X264_CPU_CACHELINE_64 )
776
    {
777
        pf->mc_luma = mc_luma_cache64_mmx2;
778
        pf->get_ref = get_ref_cache64_mmx2;
779
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
780
    }
781
#endif
782

783
    if( !(cpu&X264_CPU_SSE2) )
784
        return;
785

786
    pf->integral_init4v = x264_integral_init4v_sse2;
787
    pf->integral_init8v = x264_integral_init8v_sse2;
788
    pf->hpel_filter = x264_hpel_filter_sse2_amd;
789
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
790
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;
791

792
    if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
793
    {
794
        pf->weight = x264_mc_weight_wtab_sse2;
795
        if( !(cpu&X264_CPU_SLOW_ATOM) )
796
        {
797
            pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
798
            pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
799
        }
800

801
        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
802
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
803
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
804
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
805
        pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_sse2;
806
        pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_sse2;
807
        pf->hpel_filter = x264_hpel_filter_sse2;
808
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
809
        if( !(cpu&X264_CPU_STACK_MOD4) )
810
            pf->mc_chroma = x264_mc_chroma_sse2;
811

812
        if( cpu&X264_CPU_SSE2_IS_FAST )
813
        {
814
            pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
815
            pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
816
            pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
817
            pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
818
            pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
819
            pf->mc_luma = mc_luma_sse2;
820
            pf->get_ref = get_ref_sse2;
821
            if( cpu&X264_CPU_CACHELINE_64 )
822
            {
823
                pf->mc_luma = mc_luma_cache64_sse2;
824
                pf->get_ref = get_ref_cache64_sse2;
825
            }
826
        }
827
    }
828

829
    if( !(cpu&X264_CPU_SSSE3) )
830
        return;
831

832
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
833
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_ssse3;
834
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_ssse3;
835
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_ssse3;
836
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_ssse3;
837
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_ssse3;
838
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_ssse3;
839
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
840
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;
841
    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
842
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
843
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
844

845
    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
846
    {
847
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
848
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
849
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
850
    }
851

852
    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
853
    {
854
#if ARCH_X86_64
855
        if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */
856
#endif
857
            pf->hpel_filter = x264_hpel_filter_ssse3;
858
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
859
    }
860
    if( !(cpu&X264_CPU_STACK_MOD4) )
861
        pf->mc_chroma = x264_mc_chroma_ssse3;
862

863
    if( cpu&X264_CPU_CACHELINE_64 )
864
    {
865
        if( !(cpu&X264_CPU_STACK_MOD4) )
866
            pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
867
        pf->mc_luma = mc_luma_cache64_ssse3;
868
        pf->get_ref = get_ref_cache64_ssse3;
869
        if( cpu&X264_CPU_SLOW_ATOM )
870
        {
871
            pf->mc_luma = mc_luma_cache64_ssse3_atom;
872
            pf->get_ref = get_ref_cache64_ssse3_atom;
873
        }
874
    }
875

876
    pf->weight_cache = x264_weight_cache_ssse3;
877
    pf->weight = x264_mc_weight_wtab_ssse3;
878

879
    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
880
        pf->integral_init4v = x264_integral_init4v_ssse3;
881

882
    if( !(cpu&X264_CPU_SSE4) )
883
        return;
884

885
    pf->integral_init4h = x264_integral_init4h_sse4;
886
    pf->integral_init8h = x264_integral_init8h_sse4;
887

888
    if( !(cpu&X264_CPU_AVX) )
889
        return;
890

891
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
892
    pf->integral_init8h = x264_integral_init8h_avx;
893
    pf->hpel_filter = x264_hpel_filter_avx;
894

895
    if( !(cpu&X264_CPU_STACK_MOD4) )
896
        pf->mc_chroma = x264_mc_chroma_avx;
897

898
    if( cpu&X264_CPU_XOP )
899
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
900

901
    if( cpu&X264_CPU_AVX2 )
902
    {
903
        pf->hpel_filter = x264_hpel_filter_avx2;
904
        pf->mc_chroma = x264_mc_chroma_avx2;
905
        pf->weight = x264_mc_weight_wtab_avx2;
906
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2;
907
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_avx2;
908
        pf->integral_init8v = x264_integral_init8v_avx2;
909
        pf->integral_init4v = x264_integral_init4v_avx2;
910
        pf->integral_init8h = x264_integral_init8h_avx2;
911
        pf->integral_init4h = x264_integral_init4h_avx2;
912
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
913
    }
914
#endif // HIGH_BIT_DEPTH
915

916
    if( !(cpu&X264_CPU_AVX) )
917
        return;
918
    pf->memzero_aligned = x264_memzero_aligned_avx;
919
    pf->plane_copy = x264_plane_copy_avx;
920
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
921
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;
922

923
    if( cpu&X264_CPU_FMA4 )
924
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
925

926
    if( !(cpu&X264_CPU_AVX2) )
927
        return;
928
    pf->plane_copy_swap = x264_plane_copy_swap_avx2;
929
    pf->get_ref = get_ref_avx2;
930
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
931
}
932

933
Product

Resources

Company