1
;*****************************************************************************
2
;* mc-a.asm: x86 motion compensation
3
;*****************************************************************************
4
;* Copyright (C) 2003-2016 x264 project
5
;*
6
;* Authors: Loren Merritt <lorenm@u.washington.edu>
7
;* Fiona Glaser <fiona@x264.com>
8
;* Laurent Aimar <fenrir@via.ecp.fr>
9
;* Dylan Yudaken <dyudaken@gmail.com>
10
;* Holger Lubitz <holger@lubitz.org>
11
;* Min Chen <chenm001.163.com>
12
;* Oskar Arvidsson <oskar@irock.se>
13
;*
14
;* This program is free software; you can redistribute it and/or modify
15
;* it under the terms of the GNU General Public License as published by
16
;* the Free Software Foundation; either version 2 of the License, or
17
;* (at your option) any later version.
18
;*
19
;* This program is distributed in the hope that it will be useful,
20
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22
;* GNU General Public License for more details.
23
;*
24
;* You should have received a copy of the GNU General Public License
25
;* along with this program; if not, write to the Free Software
26
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
27
;*
28
;* This program is also available under a commercial proprietary license.
29
;* For more information, contact us at licensing@x264.com.
30
;*****************************************************************************
31
32
%include "x86inc.asm"
33
%include "x86util.asm"
34
35
SECTION_RODATA 32
36
37
ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
38
ch_shuf_adj: times 8 db 0
39
times 8 db 2
40
times 8 db 4
41
times 8 db 6
42
sq_1: times 1 dq 1
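; ch_shuf / ch_shuf_adj are consumed by the SSSE3/AVX2 mc_chroma below: the
; shuffle pairs each U sample with its right-hand neighbour in the low half
; and does the same for V in the high half of the register, so a single
; pmaddubsw can apply the horizontal (8-x, x) filter to both planes of the
; interleaved chroma. ch_shuf_adj (cache64 variant) roughly biases those
; indices by the sub-8-byte misalignment of the source pointer.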
43
44
SECTION .text
45
46
cextern pb_0
47
cextern pw_1
48
cextern pw_4
49
cextern pw_8
50
cextern pw_32
51
cextern pw_64
52
cextern pw_512
53
cextern pw_00ff
54
cextern pw_pixel_max
55
cextern sw_64
56
cextern pd_32
57
cextern deinterleave_shufd
58
59
;=============================================================================
60
; implicit weighted biprediction
61
;=============================================================================
62
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
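; Reference (scalar sketch): under those assumptions the per-pixel result is
;   dst[x] = clip( (src1[x]*weight1 + src2[x]*(64-weight1) + 32) >> 6 )
; which is what the BIWEIGHT* macros below compute with pmaddwd/pmaddubsw,
; the clip coming from packuswb or CLIPW against pw_pixel_max.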
63
%if WIN64
64
DECLARE_REG_TMP 0,1,2,3,4,5,4,5
65
%macro AVG_START 0-1 0
66
PROLOGUE 6,7,%1
67
%endmacro
68
%elif UNIX64
69
DECLARE_REG_TMP 0,1,2,3,4,5,7,8
70
%macro AVG_START 0-1 0
71
PROLOGUE 6,9,%1
72
%endmacro
73
%else
74
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
75
%macro AVG_START 0-1 0
76
PROLOGUE 0,7,%1
77
mov t0, r0m
78
mov t1, r1m
79
mov t2, r2m
80
mov t3, r3m
81
mov t4, r4m
82
mov t5, r5m
83
%endmacro
84
%endif
85
86
%macro AVG_END 0
87
lea t4, [t4+t5*2*SIZEOF_PIXEL]
88
lea t2, [t2+t3*2*SIZEOF_PIXEL]
89
lea t0, [t0+t1*2*SIZEOF_PIXEL]
90
sub eax, 2
91
jg .height_loop
92
RET
93
%endmacro
94
95
%if HIGH_BIT_DEPTH
96
97
%macro BIWEIGHT_MMX 2
98
movh m0, %1
99
movh m1, %2
100
punpcklwd m0, m1
101
pmaddwd m0, m3
102
paddd m0, m4
103
psrad m0, 6
104
%endmacro
105
106
%macro BIWEIGHT_START_MMX 0
107
movzx t6d, word r6m
108
mov t7d, 64
109
sub t7d, t6d
110
shl t7d, 16
111
add t6d, t7d
112
movd m3, t6d
113
SPLATD m3, m3
114
mova m4, [pd_32]
115
pxor m5, m5
116
%endmacro
117
118
%else ;!HIGH_BIT_DEPTH
119
%macro BIWEIGHT_MMX 2
120
movh m0, %1
121
movh m1, %2
122
punpcklbw m0, m5
123
punpcklbw m1, m5
124
pmullw m0, m2
125
pmullw m1, m3
126
paddw m0, m1
127
paddw m0, m4
128
psraw m0, 6
129
%endmacro
130
131
%macro BIWEIGHT_START_MMX 0
132
movd m2, r6m
133
SPLATW m2, m2 ; weight_dst
134
mova m3, [pw_64]
135
psubw m3, m2 ; weight_src
136
mova m4, [pw_32] ; rounding
137
pxor m5, m5
138
%endmacro
139
%endif ;HIGH_BIT_DEPTH
140
141
%macro BIWEIGHT_SSSE3 2
142
movh m0, %1
143
movh m1, %2
144
punpcklbw m0, m1
145
pmaddubsw m0, m3
146
pmulhrsw m0, m4
147
%endmacro
148
149
%macro BIWEIGHT_START_SSSE3 0
150
movzx t6d, byte r6m ; FIXME x86_64
151
mov t7d, 64
152
sub t7d, t6d
153
shl t7d, 8
154
add t6d, t7d
155
mova m4, [pw_512]
156
movd xm3, t6d
157
%if cpuflag(avx2)
158
vpbroadcastw m3, xm3
159
%else
160
SPLATW m3, m3 ; weight_dst,src
161
%endif
162
%endmacro
163
164
%if HIGH_BIT_DEPTH
165
%macro BIWEIGHT_ROW 4
166
BIWEIGHT [%2], [%3]
167
%if %4==mmsize/4
168
packssdw m0, m0
169
CLIPW m0, m5, m7
170
movh [%1], m0
171
%else
172
SWAP 0, 6
173
BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
174
packssdw m6, m0
175
CLIPW m6, m5, m7
176
mova [%1], m6
177
%endif
178
%endmacro
179
180
%else ;!HIGH_BIT_DEPTH
181
%macro BIWEIGHT_ROW 4
182
BIWEIGHT [%2], [%3]
183
%if %4==mmsize/2
184
packuswb m0, m0
185
movh [%1], m0
186
%else
187
SWAP 0, 6
188
BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
189
packuswb m6, m0
190
mova [%1], m6
191
%endif
192
%endmacro
193
194
%endif ;HIGH_BIT_DEPTH
195
196
;-----------------------------------------------------------------------------
197
; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
198
;-----------------------------------------------------------------------------
199
%macro AVG_WEIGHT 1-2 0
200
cglobal pixel_avg_weight_w%1
201
BIWEIGHT_START
202
AVG_START %2
203
%if HIGH_BIT_DEPTH
204
mova m7, [pw_pixel_max]
205
%endif
206
.height_loop:
207
%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
208
BIWEIGHT [t2], [t4]
209
SWAP 0, 6
210
BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
211
%if HIGH_BIT_DEPTH
212
packssdw m6, m0
213
CLIPW m6, m5, m7
214
%else ;!HIGH_BIT_DEPTH
215
packuswb m6, m0
216
%endif ;HIGH_BIT_DEPTH
217
movlps [t0], m6
218
movhps [t0+SIZEOF_PIXEL*t1], m6
219
%else
220
%assign x 0
221
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
222
BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
223
BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
224
%assign x x+mmsize
225
%endrep
226
%endif
227
AVG_END
228
%endmacro
229
230
%define BIWEIGHT BIWEIGHT_MMX
231
%define BIWEIGHT_START BIWEIGHT_START_MMX
232
INIT_MMX mmx2
233
AVG_WEIGHT 4
234
AVG_WEIGHT 8
235
AVG_WEIGHT 16
236
%if HIGH_BIT_DEPTH
237
INIT_XMM sse2
238
AVG_WEIGHT 4, 8
239
AVG_WEIGHT 8, 8
240
AVG_WEIGHT 16, 8
241
%else ;!HIGH_BIT_DEPTH
242
INIT_XMM sse2
243
AVG_WEIGHT 8, 7
244
AVG_WEIGHT 16, 7
245
%define BIWEIGHT BIWEIGHT_SSSE3
246
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
247
INIT_MMX ssse3
248
AVG_WEIGHT 4
249
INIT_XMM ssse3
250
AVG_WEIGHT 8, 7
251
AVG_WEIGHT 16, 7
252
253
INIT_YMM avx2
254
cglobal pixel_avg_weight_w16
255
BIWEIGHT_START
256
AVG_START 5
257
.height_loop:
258
movu xm0, [t2]
259
movu xm1, [t4]
260
vinserti128 m0, m0, [t2+t3], 1
261
vinserti128 m1, m1, [t4+t5], 1
262
SBUTTERFLY bw, 0, 1, 2
263
pmaddubsw m0, m3
264
pmaddubsw m1, m3
265
pmulhrsw m0, m4
266
pmulhrsw m1, m4
267
packuswb m0, m1
268
mova [t0], xm0
269
vextracti128 [t0+t1], m0, 1
270
AVG_END
271
%endif ;HIGH_BIT_DEPTH
272
273
;=============================================================================
274
; P frame explicit weighted prediction
275
;=============================================================================
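; Reference (scalar sketch): the scale/denom/offset triple loaded from the
; weight buffer at r4 implements
;   dst[x] = clip( ((src[x] * scale + (1 << (denom-1))) >> denom) + offset )
; The pre-SSSE3 path keeps the explicit psraw, with m4 holding
; 1<<(denom-1) + (offset<<denom); the SSSE3 path folds the shift into
; pmulhrsw and adds the offset separately.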
276
277
%if HIGH_BIT_DEPTH
278
; width
279
%macro WEIGHT_START 1
280
mova m0, [r4+ 0] ; 1<<denom
281
mova m3, [r4+16]
282
movd m2, [r4+32] ; denom
283
mova m4, [pw_pixel_max]
284
paddw m2, [sq_1] ; denom+1
285
%endmacro
286
287
; src1, src2
288
%macro WEIGHT 2
289
movh m5, [%1]
290
movh m6, [%2]
291
punpcklwd m5, m0
292
punpcklwd m6, m0
293
pmaddwd m5, m3
294
pmaddwd m6, m3
295
psrad m5, m2
296
psrad m6, m2
297
packssdw m5, m6
298
%endmacro
299
300
; src, dst, width
301
%macro WEIGHT_TWO_ROW 4
302
%assign x 0
303
%rep (%3+mmsize/2-1)/(mmsize/2)
304
%if %3-x/2 <= 4 && mmsize == 16
305
WEIGHT %1+x, %1+r3+x
306
CLIPW m5, [pb_0], m4
307
movh [%2+x], m5
308
movhps [%2+r1+x], m5
309
%else
310
WEIGHT %1+x, %1+x+mmsize/2
311
SWAP 5, 7
312
WEIGHT %1+r3+x, %1+r3+x+mmsize/2
313
CLIPW m5, [pb_0], m4
314
CLIPW m7, [pb_0], m4
315
mova [%2+x], m7
316
mova [%2+r1+x], m5
317
%endif
318
%assign x x+mmsize
319
%endrep
320
%endmacro
321
322
%else ; !HIGH_BIT_DEPTH
323
324
%macro WEIGHT_START 1
325
%if cpuflag(avx2)
326
vbroadcasti128 m3, [r4]
327
vbroadcasti128 m4, [r4+16]
328
%else
329
mova m3, [r4]
330
mova m4, [r4+16]
331
%if notcpuflag(ssse3)
332
movd m5, [r4+32]
333
%endif
334
%endif
335
pxor m2, m2
336
%endmacro
337
338
; src1, src2, dst1, dst2, fast
339
%macro WEIGHT_ROWx2 5
340
movh m0, [%1 ]
341
movh m1, [%1+mmsize/2]
342
movh m6, [%2 ]
343
movh m7, [%2+mmsize/2]
344
punpcklbw m0, m2
345
punpcklbw m1, m2
346
punpcklbw m6, m2
347
punpcklbw m7, m2
348
%if cpuflag(ssse3)
349
%if %5==0
350
psllw m0, 7
351
psllw m1, 7
352
psllw m6, 7
353
psllw m7, 7
354
%endif
355
pmulhrsw m0, m3
356
pmulhrsw m1, m3
357
pmulhrsw m6, m3
358
pmulhrsw m7, m3
359
paddw m0, m4
360
paddw m1, m4
361
paddw m6, m4
362
paddw m7, m4
363
%else
364
pmullw m0, m3
365
pmullw m1, m3
366
pmullw m6, m3
367
pmullw m7, m3
368
paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
369
paddsw m1, m4
370
paddsw m6, m4
371
paddsw m7, m4
372
psraw m0, m5
373
psraw m1, m5
374
psraw m6, m5
375
psraw m7, m5
376
%endif
377
packuswb m0, m1
378
packuswb m6, m7
379
mova [%3], m0
380
mova [%4], m6
381
%endmacro
382
383
; src1, src2, dst1, dst2, width, fast
384
%macro WEIGHT_COL 6
385
%if cpuflag(avx2)
386
%if %5==16
387
movu xm0, [%1]
388
vinserti128 m0, m0, [%2], 1
389
punpckhbw m1, m0, m2
390
punpcklbw m0, m0, m2
391
%if %6==0
392
psllw m0, 7
393
psllw m1, 7
394
%endif
395
pmulhrsw m0, m3
396
pmulhrsw m1, m3
397
paddw m0, m4
398
paddw m1, m4
399
packuswb m0, m1
400
mova [%3], xm0
401
vextracti128 [%4], m0, 1
402
%else
403
movq xm0, [%1]
404
vinserti128 m0, m0, [%2], 1
405
punpcklbw m0, m2
406
%if %6==0
407
psllw m0, 7
408
%endif
409
pmulhrsw m0, m3
410
paddw m0, m4
411
packuswb m0, m0
412
vextracti128 xm1, m0, 1
413
%if %5 == 8
414
movq [%3], xm0
415
movq [%4], xm1
416
%else
417
movd [%3], xm0
418
movd [%4], xm1
419
%endif
420
%endif
421
%else
422
movh m0, [%1]
423
movh m1, [%2]
424
punpcklbw m0, m2
425
punpcklbw m1, m2
426
%if cpuflag(ssse3)
427
%if %6==0
428
psllw m0, 7
429
psllw m1, 7
430
%endif
431
pmulhrsw m0, m3
432
pmulhrsw m1, m3
433
paddw m0, m4
434
paddw m1, m4
435
%else
436
pmullw m0, m3
437
pmullw m1, m3
438
paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
439
paddsw m1, m4
440
psraw m0, m5
441
psraw m1, m5
442
%endif
443
%if %5 == 8
444
packuswb m0, m1
445
movh [%3], m0
446
movhps [%4], m0
447
%else
448
packuswb m0, m0
449
packuswb m1, m1
450
movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
451
movd [%4], m1
452
%endif
453
%endif
454
%endmacro
455
; src, dst, width
456
%macro WEIGHT_TWO_ROW 4
457
%assign x 0
458
%rep %3
459
%if (%3-x) >= mmsize
460
WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
461
%assign x (x+mmsize)
462
%else
463
%assign w %3-x
464
%if w == 20
465
%assign w 16
466
%endif
467
WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
468
%assign x (x+w)
469
%endif
470
%if x >= %3
471
%exitrep
472
%endif
473
%endrep
474
%endmacro
475
476
%endif ; HIGH_BIT_DEPTH
477
478
;-----------------------------------------------------------------------------
479
;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
480
;-----------------------------------------------------------------------------
481
482
%macro WEIGHTER 1
483
cglobal mc_weight_w%1, 6,6,8
484
FIX_STRIDES r1, r3
485
WEIGHT_START %1
486
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
487
; we can merge the shift step into the scale factor
488
; if (m3<<7) doesn't overflow an int16_t
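; pmulhrsw computes (a*b + 0x4000) >> 15, so pre-shifting the scale once
; (psllw m3, 7 in .fast) replaces the per-pixel "psllw x, 7" in
; WEIGHT_ROWx2/WEIGHT_COL. The byte [r4+1] test checks that the 16-bit scale
; has an empty high byte (scale < 256), so scale<<7 still fits in an int16_t.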
489
cmp byte [r4+1], 0
490
jz .fast
491
%endif
492
.loop:
493
WEIGHT_TWO_ROW r2, r0, %1, 0
494
lea r0, [r0+r1*2]
495
lea r2, [r2+r3*2]
496
sub r5d, 2
497
jg .loop
498
RET
499
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
500
.fast:
501
psllw m3, 7
502
.fastloop:
503
WEIGHT_TWO_ROW r2, r0, %1, 1
504
lea r0, [r0+r1*2]
505
lea r2, [r2+r3*2]
506
sub r5d, 2
507
jg .fastloop
508
RET
509
%endif
510
%endmacro
511
512
INIT_MMX mmx2
513
WEIGHTER 4
514
WEIGHTER 8
515
WEIGHTER 12
516
WEIGHTER 16
517
WEIGHTER 20
518
INIT_XMM sse2
519
WEIGHTER 8
520
WEIGHTER 16
521
WEIGHTER 20
522
%if HIGH_BIT_DEPTH
523
WEIGHTER 12
524
%else
525
INIT_MMX ssse3
526
WEIGHTER 4
527
INIT_XMM ssse3
528
WEIGHTER 8
529
WEIGHTER 16
530
WEIGHTER 20
531
INIT_YMM avx2
532
WEIGHTER 8
533
WEIGHTER 16
534
WEIGHTER 20
535
%endif
536
537
%macro OFFSET_OP 7
538
mov%6 m0, [%1]
539
mov%6 m1, [%2]
540
%if HIGH_BIT_DEPTH
541
p%5usw m0, m2
542
p%5usw m1, m2
543
%ifidn %5,add
544
pminsw m0, m3
545
pminsw m1, m3
546
%endif
547
%else
548
p%5usb m0, m2
549
p%5usb m1, m2
550
%endif
551
mov%7 [%3], m0
552
mov%7 [%4], m1
553
%endmacro
554
555
%macro OFFSET_TWO_ROW 4
556
%assign x 0
557
%rep %3
558
%if (%3*SIZEOF_PIXEL-x) >= mmsize
559
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
560
%assign x (x+mmsize)
561
%else
562
%if HIGH_BIT_DEPTH
563
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
564
%else
565
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
566
%endif
567
%exitrep
568
%endif
569
%if x >= %3*SIZEOF_PIXEL
570
%exitrep
571
%endif
572
%endrep
573
%endmacro
574
575
;-----------------------------------------------------------------------------
576
;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
577
;-----------------------------------------------------------------------------
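; Reference (scalar sketch): mc_offset_add computes a saturating
;   dst[x] = min( src[x] + offset, pixel_max )
; and mc_offset_sub the matching saturating subtract towards 0, using
; padd/psub with unsigned saturation plus a pminsw clamp for high bit depth.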
578
%macro OFFSET 2
579
cglobal mc_offset%2_w%1, 6,6
580
FIX_STRIDES r1, r3
581
mova m2, [r4]
582
%if HIGH_BIT_DEPTH
583
%ifidn %2,add
584
mova m3, [pw_pixel_max]
585
%endif
586
%endif
587
.loop:
588
OFFSET_TWO_ROW r2, r0, %1, %2
589
lea r0, [r0+r1*2]
590
lea r2, [r2+r3*2]
591
sub r5d, 2
592
jg .loop
593
RET
594
%endmacro
595
596
%macro OFFSETPN 1
597
OFFSET %1, add
598
OFFSET %1, sub
599
%endmacro
600
INIT_MMX mmx2
601
OFFSETPN 4
602
OFFSETPN 8
603
OFFSETPN 12
604
OFFSETPN 16
605
OFFSETPN 20
606
INIT_XMM sse2
607
OFFSETPN 12
608
OFFSETPN 16
609
OFFSETPN 20
610
%if HIGH_BIT_DEPTH
611
INIT_XMM sse2
612
OFFSETPN 8
613
%endif
614
615
616
;=============================================================================
617
; pixel avg
618
;=============================================================================
619
620
;-----------------------------------------------------------------------------
621
; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
622
; pixel *src2, intptr_t src2_stride, int weight );
623
;-----------------------------------------------------------------------------
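; AVGH below only dispatches: a weight of 32 (the neutral value, weights sum
; to 64) selects the plain rounded average dst[x] = (src1[x]+src2[x]+1) >> 1,
; done with pavgb/pavgw in AVG_FUNC; any other weight tail-jumps to the
; matching pixel_avg_weight_wN above.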
624
%macro AVGH 2
625
cglobal pixel_avg_%1x%2
626
mov eax, %2
627
cmp dword r6m, 32
628
jne pixel_avg_weight_w%1 %+ SUFFIX
629
%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
630
jmp pixel_avg_w%1_avx2
631
%else
632
%if mmsize == 16 && %1 == 16
633
test dword r4m, 15
634
jz pixel_avg_w%1_sse2
635
%endif
636
jmp pixel_avg_w%1_mmx2
637
%endif
638
%endmacro
639
640
;-----------------------------------------------------------------------------
641
; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
642
; pixel *src2, intptr_t src2_stride, int height, int weight );
643
;-----------------------------------------------------------------------------
644
645
%macro AVG_FUNC 3
646
cglobal pixel_avg_w%1
647
AVG_START
648
.height_loop:
649
%assign x 0
650
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
651
%2 m0, [t2+x]
652
%2 m1, [t2+x+SIZEOF_PIXEL*t3]
653
%if HIGH_BIT_DEPTH
654
pavgw m0, [t4+x]
655
pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
656
%else ;!HIGH_BIT_DEPTH
657
pavgb m0, [t4+x]
658
pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
659
%endif
660
%3 [t0+x], m0
661
%3 [t0+x+SIZEOF_PIXEL*t1], m1
662
%assign x x+mmsize
663
%endrep
664
AVG_END
665
%endmacro
666
667
%if HIGH_BIT_DEPTH
668
669
INIT_MMX mmx2
670
AVG_FUNC 4, movq, movq
671
AVGH 4, 16
672
AVGH 4, 8
673
AVGH 4, 4
674
AVGH 4, 2
675
676
AVG_FUNC 8, movq, movq
677
AVGH 8, 16
678
AVGH 8, 8
679
AVGH 8, 4
680
681
AVG_FUNC 16, movq, movq
682
AVGH 16, 16
683
AVGH 16, 8
684
685
INIT_XMM sse2
686
AVG_FUNC 4, movq, movq
687
AVGH 4, 16
688
AVGH 4, 8
689
AVGH 4, 4
690
AVGH 4, 2
691
692
AVG_FUNC 8, movdqu, movdqa
693
AVGH 8, 16
694
AVGH 8, 8
695
AVGH 8, 4
696
697
AVG_FUNC 16, movdqu, movdqa
698
AVGH 16, 16
699
AVGH 16, 8
700
701
%else ;!HIGH_BIT_DEPTH
702
703
INIT_MMX mmx2
704
AVG_FUNC 4, movd, movd
705
AVGH 4, 16
706
AVGH 4, 8
707
AVGH 4, 4
708
AVGH 4, 2
709
710
AVG_FUNC 8, movq, movq
711
AVGH 8, 16
712
AVGH 8, 8
713
AVGH 8, 4
714
715
AVG_FUNC 16, movq, movq
716
AVGH 16, 16
717
AVGH 16, 8
718
719
INIT_XMM sse2
720
AVG_FUNC 16, movdqu, movdqa
721
AVGH 16, 16
722
AVGH 16, 8
723
AVGH 8, 16
724
AVGH 8, 8
725
AVGH 8, 4
726
INIT_XMM ssse3
727
AVGH 16, 16
728
AVGH 16, 8
729
AVGH 8, 16
730
AVGH 8, 8
731
AVGH 8, 4
732
INIT_MMX ssse3
733
AVGH 4, 16
734
AVGH 4, 8
735
AVGH 4, 4
736
AVGH 4, 2
737
INIT_XMM avx2
738
AVG_FUNC 16, movdqu, movdqa
739
AVGH 16, 16
740
AVGH 16, 8
741
742
%endif ;HIGH_BIT_DEPTH
743
744
745
746
;=============================================================================
747
; pixel avg2
748
;=============================================================================
749
750
%if HIGH_BIT_DEPTH
751
;-----------------------------------------------------------------------------
752
; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
753
; uint16_t *src1, intptr_t src_stride,
754
; uint16_t *src2, int height );
755
;-----------------------------------------------------------------------------
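; Note: src2 has no stride argument and shares src_stride with src1, so the
; code rebases it ("sub r4, r2") and addresses it as src1 plus a constant
; displacement; each output is the rounded average (a+b+1)>>1 via pavgw.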
756
%macro AVG2_W_ONE 1
757
cglobal pixel_avg2_w%1, 6,7,4
758
sub r4, r2
759
lea r6, [r4+r3*2]
760
.height_loop:
761
movu m0, [r2]
762
movu m1, [r2+r3*2]
763
%if cpuflag(avx) || mmsize == 8
764
pavgw m0, [r2+r4]
765
pavgw m1, [r2+r6]
766
%else
767
movu m2, [r2+r4]
768
movu m3, [r2+r6]
769
pavgw m0, m2
770
pavgw m1, m3
771
%endif
772
mova [r0], m0
773
mova [r0+r1*2], m1
774
lea r2, [r2+r3*4]
775
lea r0, [r0+r1*4]
776
sub r5d, 2
777
jg .height_loop
778
RET
779
%endmacro
780
781
%macro AVG2_W_TWO 3
782
cglobal pixel_avg2_w%1, 6,7,8
783
sub r4, r2
784
lea r6, [r4+r3*2]
785
.height_loop:
786
movu m0, [r2]
787
%2 m1, [r2+mmsize]
788
movu m2, [r2+r3*2]
789
%2 m3, [r2+r3*2+mmsize]
790
%if mmsize == 8
791
pavgw m0, [r2+r4]
792
pavgw m1, [r2+r4+mmsize]
793
pavgw m2, [r2+r6]
794
pavgw m3, [r2+r6+mmsize]
795
%else
796
movu m4, [r2+r4]
797
%2 m5, [r2+r4+mmsize]
798
movu m6, [r2+r6]
799
%2 m7, [r2+r6+mmsize]
800
pavgw m0, m4
801
pavgw m1, m5
802
pavgw m2, m6
803
pavgw m3, m7
804
%endif
805
mova [r0], m0
806
%3 [r0+mmsize], m1
807
mova [r0+r1*2], m2
808
%3 [r0+r1*2+mmsize], m3
809
lea r2, [r2+r3*4]
810
lea r0, [r0+r1*4]
811
sub r5d, 2
812
jg .height_loop
813
RET
814
%endmacro
815
816
INIT_MMX mmx2
817
AVG2_W_ONE 4
818
AVG2_W_TWO 8, movu, mova
819
INIT_XMM sse2
820
AVG2_W_ONE 8
821
AVG2_W_TWO 10, movd, movd
822
AVG2_W_TWO 16, movu, mova
823
INIT_YMM avx2
824
AVG2_W_ONE 16
825
826
INIT_MMX
827
cglobal pixel_avg2_w10_mmx2, 6,7
828
sub r4, r2
829
lea r6, [r4+r3*2]
830
.height_loop:
831
movu m0, [r2+ 0]
832
movu m1, [r2+ 8]
833
movh m2, [r2+16]
834
movu m3, [r2+r3*2+ 0]
835
movu m4, [r2+r3*2+ 8]
836
movh m5, [r2+r3*2+16]
837
pavgw m0, [r2+r4+ 0]
838
pavgw m1, [r2+r4+ 8]
839
pavgw m2, [r2+r4+16]
840
pavgw m3, [r2+r6+ 0]
841
pavgw m4, [r2+r6+ 8]
842
pavgw m5, [r2+r6+16]
843
mova [r0+ 0], m0
844
mova [r0+ 8], m1
845
movh [r0+16], m2
846
mova [r0+r1*2+ 0], m3
847
mova [r0+r1*2+ 8], m4
848
movh [r0+r1*2+16], m5
849
lea r2, [r2+r3*2*2]
850
lea r0, [r0+r1*2*2]
851
sub r5d, 2
852
jg .height_loop
853
RET
854
855
cglobal pixel_avg2_w16_mmx2, 6,7
856
sub r4, r2
857
lea r6, [r4+r3*2]
858
.height_loop:
859
movu m0, [r2+ 0]
860
movu m1, [r2+ 8]
861
movu m2, [r2+16]
862
movu m3, [r2+24]
863
movu m4, [r2+r3*2+ 0]
864
movu m5, [r2+r3*2+ 8]
865
movu m6, [r2+r3*2+16]
866
movu m7, [r2+r3*2+24]
867
pavgw m0, [r2+r4+ 0]
868
pavgw m1, [r2+r4+ 8]
869
pavgw m2, [r2+r4+16]
870
pavgw m3, [r2+r4+24]
871
pavgw m4, [r2+r6+ 0]
872
pavgw m5, [r2+r6+ 8]
873
pavgw m6, [r2+r6+16]
874
pavgw m7, [r2+r6+24]
875
mova [r0+ 0], m0
876
mova [r0+ 8], m1
877
mova [r0+16], m2
878
mova [r0+24], m3
879
mova [r0+r1*2+ 0], m4
880
mova [r0+r1*2+ 8], m5
881
mova [r0+r1*2+16], m6
882
mova [r0+r1*2+24], m7
883
lea r2, [r2+r3*2*2]
884
lea r0, [r0+r1*2*2]
885
sub r5d, 2
886
jg .height_loop
887
RET
888
889
cglobal pixel_avg2_w18_mmx2, 6,7
890
sub r4, r2
891
.height_loop:
892
movu m0, [r2+ 0]
893
movu m1, [r2+ 8]
894
movu m2, [r2+16]
895
movu m3, [r2+24]
896
movh m4, [r2+32]
897
pavgw m0, [r2+r4+ 0]
898
pavgw m1, [r2+r4+ 8]
899
pavgw m2, [r2+r4+16]
900
pavgw m3, [r2+r4+24]
901
pavgw m4, [r2+r4+32]
902
mova [r0+ 0], m0
903
mova [r0+ 8], m1
904
mova [r0+16], m2
905
mova [r0+24], m3
906
movh [r0+32], m4
907
lea r2, [r2+r3*2]
908
lea r0, [r0+r1*2]
909
dec r5d
910
jg .height_loop
911
RET
912
913
%macro PIXEL_AVG_W18 0
914
cglobal pixel_avg2_w18, 6,7
915
sub r4, r2
916
.height_loop:
917
movu m0, [r2+ 0]
918
movd xm2, [r2+32]
919
%if mmsize == 32
920
pavgw m0, [r2+r4+ 0]
921
movd xm1, [r2+r4+32]
922
pavgw xm2, xm1
923
%else
924
movu m1, [r2+16]
925
movu m3, [r2+r4+ 0]
926
movu m4, [r2+r4+16]
927
movd m5, [r2+r4+32]
928
pavgw m0, m3
929
pavgw m1, m4
930
pavgw m2, m5
931
mova [r0+16], m1
932
%endif
933
mova [r0+ 0], m0
934
movd [r0+32], xm2
935
lea r2, [r2+r3*2]
936
lea r0, [r0+r1*2]
937
dec r5d
938
jg .height_loop
939
RET
940
%endmacro
941
942
INIT_XMM sse2
943
PIXEL_AVG_W18
944
INIT_YMM avx2
945
PIXEL_AVG_W18
946
947
%endif ; HIGH_BIT_DEPTH
948
949
%if HIGH_BIT_DEPTH == 0
950
;-----------------------------------------------------------------------------
951
; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
952
; uint8_t *src1, intptr_t src_stride,
953
; uint8_t *src2, int height );
954
;-----------------------------------------------------------------------------
955
%macro AVG2_W8 2
956
cglobal pixel_avg2_w%1_mmx2, 6,7
957
sub r4, r2
958
lea r6, [r4+r3]
959
.height_loop:
960
%2 mm0, [r2]
961
%2 mm1, [r2+r3]
962
pavgb mm0, [r2+r4]
963
pavgb mm1, [r2+r6]
964
lea r2, [r2+r3*2]
965
%2 [r0], mm0
966
%2 [r0+r1], mm1
967
lea r0, [r0+r1*2]
968
sub r5d, 2
969
jg .height_loop
970
RET
971
%endmacro
972
973
INIT_MMX
974
AVG2_W8 4, movd
975
AVG2_W8 8, movq
976
977
%macro AVG2_W16 2
978
cglobal pixel_avg2_w%1_mmx2, 6,7
979
sub r2, r4
980
lea r6, [r2+r3]
981
.height_loop:
982
movq mm0, [r4]
983
%2 mm1, [r4+8]
984
movq mm2, [r4+r3]
985
%2 mm3, [r4+r3+8]
986
pavgb mm0, [r4+r2]
987
pavgb mm1, [r4+r2+8]
988
pavgb mm2, [r4+r6]
989
pavgb mm3, [r4+r6+8]
990
lea r4, [r4+r3*2]
991
movq [r0], mm0
992
%2 [r0+8], mm1
993
movq [r0+r1], mm2
994
%2 [r0+r1+8], mm3
995
lea r0, [r0+r1*2]
996
sub r5d, 2
997
jg .height_loop
998
RET
999
%endmacro
1000
1001
AVG2_W16 12, movd
1002
AVG2_W16 16, movq
1003
1004
cglobal pixel_avg2_w20_mmx2, 6,7
1005
sub r2, r4
1006
lea r6, [r2+r3]
1007
.height_loop:
1008
movq mm0, [r4]
1009
movq mm1, [r4+8]
1010
movd mm2, [r4+16]
1011
movq mm3, [r4+r3]
1012
movq mm4, [r4+r3+8]
1013
movd mm5, [r4+r3+16]
1014
pavgb mm0, [r4+r2]
1015
pavgb mm1, [r4+r2+8]
1016
pavgb mm2, [r4+r2+16]
1017
pavgb mm3, [r4+r6]
1018
pavgb mm4, [r4+r6+8]
1019
pavgb mm5, [r4+r6+16]
1020
lea r4, [r4+r3*2]
1021
movq [r0], mm0
1022
movq [r0+8], mm1
1023
movd [r0+16], mm2
1024
movq [r0+r1], mm3
1025
movq [r0+r1+8], mm4
1026
movd [r0+r1+16], mm5
1027
lea r0, [r0+r1*2]
1028
sub r5d, 2
1029
jg .height_loop
1030
RET
1031
1032
INIT_XMM
1033
cglobal pixel_avg2_w16_sse2, 6,7
1034
sub r4, r2
1035
lea r6, [r4+r3]
1036
.height_loop:
1037
movu m0, [r2]
1038
movu m2, [r2+r3]
1039
movu m1, [r2+r4]
1040
movu m3, [r2+r6]
1041
lea r2, [r2+r3*2]
1042
pavgb m0, m1
1043
pavgb m2, m3
1044
mova [r0], m0
1045
mova [r0+r1], m2
1046
lea r0, [r0+r1*2]
1047
sub r5d, 2
1048
jg .height_loop
1049
RET
1050
1051
cglobal pixel_avg2_w20_sse2, 6,7
1052
sub r2, r4
1053
lea r6, [r2+r3]
1054
.height_loop:
1055
movu m0, [r4]
1056
movu m2, [r4+r3]
1057
movu m1, [r4+r2]
1058
movu m3, [r4+r6]
1059
movd mm4, [r4+16]
1060
movd mm5, [r4+r3+16]
1061
pavgb m0, m1
1062
pavgb m2, m3
1063
pavgb mm4, [r4+r2+16]
1064
pavgb mm5, [r4+r6+16]
1065
lea r4, [r4+r3*2]
1066
mova [r0], m0
1067
mova [r0+r1], m2
1068
movd [r0+16], mm4
1069
movd [r0+r1+16], mm5
1070
lea r0, [r0+r1*2]
1071
sub r5d, 2
1072
jg .height_loop
1073
RET
1074
1075
INIT_YMM avx2
1076
cglobal pixel_avg2_w20, 6,7
1077
sub r2, r4
1078
lea r6, [r2+r3]
1079
.height_loop:
1080
movu m0, [r4]
1081
movu m1, [r4+r3]
1082
pavgb m0, [r4+r2]
1083
pavgb m1, [r4+r6]
1084
lea r4, [r4+r3*2]
1085
mova [r0], m0
1086
mova [r0+r1], m1
1087
lea r0, [r0+r1*2]
1088
sub r5d, 2
1089
jg .height_loop
1090
RET
1091
1092
; Cacheline split code for processors with high latencies for loads
1093
; split over cache lines. See sad-a.asm for a more detailed explanation.
1094
; This particular instance is complicated by the fact that src1 and src2
1095
; can have different alignments. For simplicity and code size, only the
1096
; MMX cacheline workaround is used. As a result, in the case of SSE2
1097
; pixel_avg, the cacheline check functions call the SSE2 version if there
1098
; is no cacheline split, and the MMX workaround if there is.
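; The workaround (AVG_CACHELINE_LOOP) emulates one unaligned 8-byte load with
; two aligned ones, roughly:
;   lo = load64(p & ~7);  hi = load64((p & ~7) + 8);
;   v  = (lo >> 8*(p&7)) | (hi << (64 - 8*(p&7)));
; (the MMX shifts yield 0 for a count of 64, so the aligned case falls out
; for free), then pavgb combines the two reconstructed sources.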
1099
1100
%macro INIT_SHIFT 2
1101
and eax, 7
1102
shl eax, 3
1103
movd %1, [sw_64]
1104
movd %2, eax
1105
psubw %1, %2
1106
%endmacro
1107
1108
%macro AVG_CACHELINE_START 0
1109
%assign stack_offset 0
1110
INIT_SHIFT mm6, mm7
1111
mov eax, r4m
1112
INIT_SHIFT mm4, mm5
1113
PROLOGUE 6,6
1114
and r2, ~7
1115
and r4, ~7
1116
sub r4, r2
1117
.height_loop:
1118
%endmacro
1119
1120
%macro AVG_CACHELINE_LOOP 2
1121
movq mm1, [r2+%1]
1122
movq mm0, [r2+8+%1]
1123
movq mm3, [r2+r4+%1]
1124
movq mm2, [r2+r4+8+%1]
1125
psrlq mm1, mm7
1126
psllq mm0, mm6
1127
psrlq mm3, mm5
1128
psllq mm2, mm4
1129
por mm0, mm1
1130
por mm2, mm3
1131
pavgb mm2, mm0
1132
%2 [r0+%1], mm2
1133
%endmacro
1134
1135
%macro AVG_CACHELINE_FUNC 2
1136
pixel_avg2_w%1_cache_mmx2:
1137
AVG_CACHELINE_START
1138
AVG_CACHELINE_LOOP 0, movq
1139
%if %1>8
1140
AVG_CACHELINE_LOOP 8, movq
1141
%if %1>16
1142
AVG_CACHELINE_LOOP 16, movd
1143
%endif
1144
%endif
1145
add r2, r3
1146
add r0, r1
1147
dec r5d
1148
jg .height_loop
1149
RET
1150
%endmacro
1151
1152
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
1153
%if %1 == 12
1154
;w12 isn't needed because w16 is just as fast if there's no cacheline split
1155
%define cachesplit pixel_avg2_w16_cache_mmx2
1156
%else
1157
%define cachesplit pixel_avg2_w%1_cache_mmx2
1158
%endif
1159
cglobal pixel_avg2_w%1_cache%2_%3
1160
mov eax, r2m
1161
and eax, %2-1
1162
cmp eax, (%2-%1-(%1 % 8))
1163
%if %1==12||%1==20
1164
jbe pixel_avg2_w%1_%3
1165
%else
1166
jb pixel_avg2_w%1_%3
1167
%endif
1168
%if 0 ; or %1==8 - but the extra branch seems too expensive
1169
ja cachesplit
1170
%if ARCH_X86_64
1171
test r4b, 1
1172
%else
1173
test byte r4m, 1
1174
%endif
1175
jz pixel_avg2_w%1_%3
1176
%else
1177
or eax, r4m
1178
and eax, 7
1179
jz pixel_avg2_w%1_%3
1180
mov eax, r2m
1181
%endif
1182
%if mmsize==16 || (%1==8 && %2==64)
1183
AVG_CACHELINE_FUNC %1, %2
1184
%else
1185
jmp cachesplit
1186
%endif
1187
%endmacro
1188
1189
INIT_MMX
1190
AVG_CACHELINE_CHECK 8, 64, mmx2
1191
AVG_CACHELINE_CHECK 12, 64, mmx2
1192
%if ARCH_X86_64 == 0
1193
AVG_CACHELINE_CHECK 16, 64, mmx2
1194
AVG_CACHELINE_CHECK 20, 64, mmx2
1195
AVG_CACHELINE_CHECK 8, 32, mmx2
1196
AVG_CACHELINE_CHECK 12, 32, mmx2
1197
AVG_CACHELINE_CHECK 16, 32, mmx2
1198
AVG_CACHELINE_CHECK 20, 32, mmx2
1199
%endif
1200
INIT_XMM
1201
AVG_CACHELINE_CHECK 16, 64, sse2
1202
AVG_CACHELINE_CHECK 20, 64, sse2
1203
1204
; computed jump assumes this loop is exactly 48 bytes
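; i.e. each avg_w16_align%1_%2_ssse3 body below is kept at exactly 48 bytes
; (3*16, see the nop padding), so pixel_avg2_w16_cache64_ssse3 can reach the
; right specialization as base + index*48 via "lea r6,[r6*3]" / "shl r6,4";
; growing a loop past 48 bytes breaks that computed jump.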
1205
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
1206
ALIGN 16
1207
avg_w16_align%1_%2_ssse3:
1208
%if %1==0 && %2==0
1209
movdqa xmm1, [r2]
1210
pavgb xmm1, [r2+r4]
1211
add r2, r3
1212
%elif %1==0
1213
movdqa xmm1, [r2+r4+16]
1214
palignr xmm1, [r2+r4], %2
1215
pavgb xmm1, [r2]
1216
add r2, r3
1217
%elif %2&15==0
1218
movdqa xmm1, [r2+16]
1219
palignr xmm1, [r2], %1
1220
pavgb xmm1, [r2+r4]
1221
add r2, r3
1222
%else
1223
movdqa xmm1, [r2+16]
1224
movdqa xmm2, [r2+r4+16]
1225
palignr xmm1, [r2], %1
1226
palignr xmm2, [r2+r4], %2&15
1227
add r2, r3
1228
pavgb xmm1, xmm2
1229
%endif
1230
movdqa [r0], xmm1
1231
add r0, r1
1232
dec r5d
1233
jg avg_w16_align%1_%2_ssse3
1234
ret
1235
%if %1==0
1236
; make sure the first ones don't end up short
1237
ALIGN 16
1238
times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
1239
%endif
1240
%endmacro
1241
1242
cglobal pixel_avg2_w16_cache64_ssse3
1243
%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
1244
mov eax, r2m
1245
and eax, 0x3f
1246
cmp eax, 0x30
1247
jb x264_pixel_avg2_w16_sse2
1248
or eax, r4m
1249
and eax, 7
1250
jz x264_pixel_avg2_w16_sse2
1251
%endif
1252
PROLOGUE 6, 8
1253
lea r6, [r4+r2]
1254
and r4, ~0xf
1255
and r6, 0x1f
1256
and r2, ~0xf
1257
lea r6, [r6*3] ;(offset + align*2)*3
1258
sub r4, r2
1259
shl r6, 4 ;jump = (offset + align*2)*48
1260
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
1261
%ifdef PIC
1262
lea r7, [avg_w16_addr]
1263
add r6, r7
1264
%else
1265
lea r6, [avg_w16_addr + r6]
1266
%endif
1267
TAIL_CALL r6, 1
1268
1269
%assign j 0
1270
%assign k 1
1271
%rep 16
1272
AVG16_CACHELINE_LOOP_SSSE3 j, j
1273
AVG16_CACHELINE_LOOP_SSSE3 j, k
1274
%assign j j+1
1275
%assign k k+1
1276
%endrep
1277
%endif ; !HIGH_BIT_DEPTH
1278
1279
;=============================================================================
1280
; pixel copy
1281
;=============================================================================
1282
1283
%macro COPY1 2
1284
movu m0, [r2]
1285
movu m1, [r2+r3]
1286
movu m2, [r2+r3*2]
1287
movu m3, [r2+%2]
1288
mova [r0], m0
1289
mova [r0+r1], m1
1290
mova [r0+r1*2], m2
1291
mova [r0+%1], m3
1292
%endmacro
1293
1294
%macro COPY2 2-4 0, 1
1295
movu m0, [r2+%3*mmsize]
1296
movu m1, [r2+%4*mmsize]
1297
movu m2, [r2+r3+%3*mmsize]
1298
movu m3, [r2+r3+%4*mmsize]
1299
mova [r0+%3*mmsize], m0
1300
mova [r0+%4*mmsize], m1
1301
mova [r0+r1+%3*mmsize], m2
1302
mova [r0+r1+%4*mmsize], m3
1303
movu m0, [r2+r3*2+%3*mmsize]
1304
movu m1, [r2+r3*2+%4*mmsize]
1305
movu m2, [r2+%2+%3*mmsize]
1306
movu m3, [r2+%2+%4*mmsize]
1307
mova [r0+r1*2+%3*mmsize], m0
1308
mova [r0+r1*2+%4*mmsize], m1
1309
mova [r0+%1+%3*mmsize], m2
1310
mova [r0+%1+%4*mmsize], m3
1311
%endmacro
1312
1313
%macro COPY4 2
1314
COPY2 %1, %2, 0, 1
1315
COPY2 %1, %2, 2, 3
1316
%endmacro
1317
1318
;-----------------------------------------------------------------------------
1319
; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
1320
; uint8_t *src, intptr_t i_src_stride, int i_height )
1321
;-----------------------------------------------------------------------------
1322
INIT_MMX
1323
cglobal mc_copy_w4_mmx, 4,6
1324
FIX_STRIDES r1, r3
1325
cmp dword r4m, 4
1326
lea r5, [r3*3]
1327
lea r4, [r1*3]
1328
je .end
1329
%if HIGH_BIT_DEPTH == 0
1330
%define mova movd
1331
%define movu movd
1332
%endif
1333
COPY1 r4, r5
1334
lea r2, [r2+r3*4]
1335
lea r0, [r0+r1*4]
1336
.end:
1337
COPY1 r4, r5
1338
RET
1339
1340
%macro MC_COPY 1
1341
%assign %%w %1*SIZEOF_PIXEL/mmsize
1342
%if %%w > 0
1343
cglobal mc_copy_w%1, 5,7
1344
FIX_STRIDES r1, r3
1345
lea r6, [r3*3]
1346
lea r5, [r1*3]
1347
.height_loop:
1348
COPY %+ %%w r5, r6
1349
lea r2, [r2+r3*4]
1350
lea r0, [r0+r1*4]
1351
sub r4d, 4
1352
jg .height_loop
1353
RET
1354
%endif
1355
%endmacro
1356
1357
INIT_MMX mmx
1358
MC_COPY 8
1359
MC_COPY 16
1360
INIT_XMM sse
1361
MC_COPY 8
1362
MC_COPY 16
1363
INIT_XMM aligned, sse
1364
MC_COPY 16
1365
%if HIGH_BIT_DEPTH
1366
INIT_YMM avx
1367
MC_COPY 16
1368
INIT_YMM aligned, avx
1369
MC_COPY 16
1370
%endif
1371
1372
;=============================================================================
1373
; prefetch
1374
;=============================================================================
1375
; assumes 64 byte cachelines
1376
; FIXME doesn't cover all pixels in high depth and/or 4:4:4
1377
1378
;-----------------------------------------------------------------------------
1379
; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
1380
; pixel *pix_uv, intptr_t stride_uv, int mb_x )
1381
;-----------------------------------------------------------------------------
1382
1383
%macro PREFETCH_FENC 1
1384
%if ARCH_X86_64
1385
cglobal prefetch_fenc_%1, 5,5
1386
FIX_STRIDES r1, r3
1387
and r4d, 3
1388
mov eax, r4d
1389
imul r4d, r1d
1390
lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
1391
prefetcht0 [r0]
1392
prefetcht0 [r0+r1]
1393
lea r0, [r0+r1*2]
1394
prefetcht0 [r0]
1395
prefetcht0 [r0+r1]
1396
1397
imul eax, r3d
1398
lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
1399
prefetcht0 [r2]
1400
prefetcht0 [r2+r3]
1401
%ifidn %1, 422
1402
lea r2, [r2+r3*2]
1403
prefetcht0 [r2]
1404
prefetcht0 [r2+r3]
1405
%endif
1406
RET
1407
1408
%else
1409
cglobal prefetch_fenc_%1, 0,3
1410
mov r2, r4m
1411
mov r1, r1m
1412
mov r0, r0m
1413
FIX_STRIDES r1
1414
and r2, 3
1415
imul r2, r1
1416
lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
1417
prefetcht0 [r0]
1418
prefetcht0 [r0+r1]
1419
lea r0, [r0+r1*2]
1420
prefetcht0 [r0]
1421
prefetcht0 [r0+r1]
1422
1423
mov r2, r4m
1424
mov r1, r3m
1425
mov r0, r2m
1426
FIX_STRIDES r1
1427
and r2, 3
1428
imul r2, r1
1429
lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
1430
prefetcht0 [r0]
1431
prefetcht0 [r0+r1]
1432
%ifidn %1, 422
1433
lea r0, [r0+r1*2]
1434
prefetcht0 [r0]
1435
prefetcht0 [r0+r1]
1436
%endif
1437
ret
1438
%endif ; ARCH_X86_64
1439
%endmacro
1440
1441
INIT_MMX mmx2
1442
PREFETCH_FENC 420
1443
PREFETCH_FENC 422
1444
1445
;-----------------------------------------------------------------------------
1446
; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
1447
;-----------------------------------------------------------------------------
1448
INIT_MMX mmx2
1449
cglobal prefetch_ref, 3,3
1450
FIX_STRIDES r1
1451
dec r2d
1452
and r2d, r1d
1453
lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
1454
lea r2, [r1*3]
1455
prefetcht0 [r0]
1456
prefetcht0 [r0+r1]
1457
prefetcht0 [r0+r1*2]
1458
prefetcht0 [r0+r2]
1459
lea r0, [r0+r1*4]
1460
prefetcht0 [r0]
1461
prefetcht0 [r0+r1]
1462
prefetcht0 [r0+r1*2]
1463
prefetcht0 [r0+r2]
1464
RET
1465
1466
1467
1468
;=============================================================================
1469
; chroma MC
1470
;=============================================================================
1471
1472
%if ARCH_X86_64
1473
DECLARE_REG_TMP 6,7,8
1474
%else
1475
DECLARE_REG_TMP 0,1,2
1476
%endif
1477
1478
%macro MC_CHROMA_START 1
1479
%if ARCH_X86_64
1480
PROLOGUE 0,9,%1
1481
%else
1482
PROLOGUE 0,6,%1
1483
%endif
1484
movifnidn r3, r3mp
1485
movifnidn r4d, r4m
1486
movifnidn r5d, r5m
1487
movifnidn t0d, r6m
1488
mov t2d, t0d
1489
mov t1d, r5d
1490
sar t0d, 3
1491
sar t1d, 3
1492
imul t0d, r4d
1493
lea t0d, [t0+t1*2]
1494
FIX_STRIDES t0d
1495
movsxdifnidn t0, t0d
1496
add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
1497
%endmacro
1498
1499
%if HIGH_BIT_DEPTH
1500
%macro UNPACK_UNALIGNED 4
1501
movu %1, [%4+0]
1502
movu %2, [%4+4]
1503
punpckhwd %3, %1, %2
1504
punpcklwd %1, %2
1505
%if mmsize == 8
1506
mova %2, %1
1507
punpcklwd %1, %3
1508
punpckhwd %2, %3
1509
%else
1510
shufps %2, %1, %3, q3131
1511
shufps %1, %3, q2020
1512
%endif
1513
%endmacro
1514
%else ; !HIGH_BIT_DEPTH
1515
%macro UNPACK_UNALIGNED 3
1516
%if mmsize == 8
1517
punpcklwd %1, %3
1518
%else
1519
movh %2, %3
1520
punpcklwd %1, %2
1521
%endif
1522
%endmacro
1523
%endif ; HIGH_BIT_DEPTH
1524
1525
;-----------------------------------------------------------------------------
1526
; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
1527
; uint8_t *src, intptr_t src_stride,
1528
; int dx, int dy,
1529
; int width, int height )
1530
;-----------------------------------------------------------------------------
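; dx/dy carry a 3-bit fraction; the integer part is folded into the src
; pointer in MC_CHROMA_START. Each output sample is the standard H.264 chroma
; bilinear filter, applied per interleaved U/V component:
;   d = ( (8-dx)*(8-dy)*A + dx*(8-dy)*B + (8-dx)*dy*C + dx*dy*D + 32 ) >> 6
; with A,B horizontal neighbours on the current row and C,D on the next row.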
1531
%macro MC_CHROMA 0
1532
cglobal mc_chroma
1533
MC_CHROMA_START 0
1534
FIX_STRIDES r4
1535
and r5d, 7
1536
%if ARCH_X86_64
1537
jz .mc1dy
1538
%endif
1539
and t2d, 7
1540
%if ARCH_X86_64
1541
jz .mc1dx
1542
%endif
1543
shl r5d, 16
1544
add t2d, r5d
1545
mov t0d, t2d
1546
shl t2d, 8
1547
sub t2d, t0d
1548
add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
1549
cmp dword r7m, 4
1550
%if mmsize==8
1551
.skip_prologue:
1552
%else
1553
jl mc_chroma_mmx2 %+ .skip_prologue
1554
WIN64_SPILL_XMM 9
1555
%endif
1556
movd m5, t2d
1557
movifnidn r0, r0mp
1558
movifnidn r1, r1mp
1559
movifnidn r2d, r2m
1560
movifnidn r5d, r8m
1561
pxor m6, m6
1562
punpcklbw m5, m6
1563
%if mmsize==8
1564
pshufw m7, m5, q3232
1565
pshufw m6, m5, q0000
1566
pshufw m5, m5, q1111
1567
jge .width4
1568
%else
1569
%if WIN64
1570
cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
1571
%endif
1572
pshufd m7, m5, q1111
1573
punpcklwd m5, m5
1574
pshufd m6, m5, q0000
1575
pshufd m5, m5, q1111
1576
jg .width8
1577
%endif
1578
%if HIGH_BIT_DEPTH
1579
add r2, r2
1580
UNPACK_UNALIGNED m0, m1, m2, r3
1581
%else
1582
movu m0, [r3]
1583
UNPACK_UNALIGNED m0, m1, [r3+2]
1584
mova m1, m0
1585
pand m0, [pw_00ff]
1586
psrlw m1, 8
1587
%endif ; HIGH_BIT_DEPTH
1588
pmaddwd m0, m7
1589
pmaddwd m1, m7
1590
packssdw m0, m1
1591
SWAP 3, 0
1592
ALIGN 4
1593
.loop2:
1594
%if HIGH_BIT_DEPTH
1595
UNPACK_UNALIGNED m0, m1, m2, r3+r4
1596
pmullw m3, m6
1597
%else ; !HIGH_BIT_DEPTH
1598
movu m0, [r3+r4]
1599
UNPACK_UNALIGNED m0, m1, [r3+r4+2]
1600
pmullw m3, m6
1601
mova m1, m0
1602
pand m0, [pw_00ff]
1603
psrlw m1, 8
1604
%endif ; HIGH_BIT_DEPTH
1605
pmaddwd m0, m7
1606
pmaddwd m1, m7
1607
mova m2, [pw_32]
1608
packssdw m0, m1
1609
paddw m2, m3
1610
mova m3, m0
1611
pmullw m0, m5
1612
paddw m0, m2
1613
psrlw m0, 6
1614
%if HIGH_BIT_DEPTH
1615
movh [r0], m0
1616
%if mmsize == 8
1617
psrlq m0, 32
1618
movh [r1], m0
1619
%else
1620
movhps [r1], m0
1621
%endif
1622
%else ; !HIGH_BIT_DEPTH
1623
packuswb m0, m0
1624
movd [r0], m0
1625
%if mmsize==8
1626
psrlq m0, 16
1627
%else
1628
psrldq m0, 4
1629
%endif
1630
movd [r1], m0
1631
%endif ; HIGH_BIT_DEPTH
1632
add r3, r4
1633
add r0, r2
1634
add r1, r2
1635
dec r5d
1636
jg .loop2
1637
RET
1638
1639
%if mmsize==8
1640
.width4:
1641
%if ARCH_X86_64
1642
mov t0, r0
1643
mov t1, r1
1644
mov t2, r3
1645
%if WIN64
1646
%define multy0 r4m
1647
%else
1648
%define multy0 [rsp-8]
1649
%endif
1650
mova multy0, m5
1651
%else
1652
mov r3m, r3
1653
%define multy0 r4m
1654
mova multy0, m5
1655
%endif
1656
%else
1657
.width8:
1658
%if ARCH_X86_64
1659
%define multy0 m8
1660
SWAP 8, 5
1661
%else
1662
%define multy0 r0m
1663
mova multy0, m5
1664
%endif
1665
%endif
1666
FIX_STRIDES r2
1667
.loopx:
1668
%if HIGH_BIT_DEPTH
1669
UNPACK_UNALIGNED m0, m2, m4, r3
1670
UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
1671
%else
1672
movu m0, [r3]
1673
movu m1, [r3+mmsize/2]
1674
UNPACK_UNALIGNED m0, m2, [r3+2]
1675
UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
1676
psrlw m2, m0, 8
1677
psrlw m3, m1, 8
1678
pand m0, [pw_00ff]
1679
pand m1, [pw_00ff]
1680
%endif
1681
pmaddwd m0, m7
1682
pmaddwd m2, m7
1683
pmaddwd m1, m7
1684
pmaddwd m3, m7
1685
packssdw m0, m2
1686
packssdw m1, m3
1687
SWAP 4, 0
1688
SWAP 5, 1
1689
add r3, r4
1690
ALIGN 4
1691
.loop4:
1692
%if HIGH_BIT_DEPTH
1693
UNPACK_UNALIGNED m0, m1, m2, r3
1694
pmaddwd m0, m7
1695
pmaddwd m1, m7
1696
packssdw m0, m1
1697
UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
1698
pmaddwd m1, m7
1699
pmaddwd m2, m7
1700
packssdw m1, m2
1701
%else ; !HIGH_BIT_DEPTH
1702
movu m0, [r3]
1703
movu m1, [r3+mmsize/2]
1704
UNPACK_UNALIGNED m0, m2, [r3+2]
1705
UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
1706
psrlw m2, m0, 8
1707
psrlw m3, m1, 8
1708
pand m0, [pw_00ff]
1709
pand m1, [pw_00ff]
1710
pmaddwd m0, m7
1711
pmaddwd m2, m7
1712
pmaddwd m1, m7
1713
pmaddwd m3, m7
1714
packssdw m0, m2
1715
packssdw m1, m3
1716
%endif ; HIGH_BIT_DEPTH
1717
pmullw m4, m6
1718
pmullw m5, m6
1719
mova m2, [pw_32]
1720
paddw m3, m2, m5
1721
paddw m2, m4
1722
mova m4, m0
1723
mova m5, m1
1724
pmullw m0, multy0
1725
pmullw m1, multy0
1726
paddw m0, m2
1727
paddw m1, m3
1728
psrlw m0, 6
1729
psrlw m1, 6
1730
%if HIGH_BIT_DEPTH
1731
movh [r0], m0
1732
movh [r0+mmsize/2], m1
1733
%if mmsize==8
1734
psrlq m0, 32
1735
psrlq m1, 32
1736
movh [r1], m0
1737
movh [r1+mmsize/2], m1
1738
%else
1739
movhps [r1], m0
1740
movhps [r1+mmsize/2], m1
1741
%endif
1742
%else ; !HIGH_BIT_DEPTH
1743
packuswb m0, m1
1744
%if mmsize==8
1745
pshufw m1, m0, q0020
1746
pshufw m0, m0, q0031
1747
movd [r0], m1
1748
movd [r1], m0
1749
%else
1750
pshufd m0, m0, q3120
1751
movq [r0], m0
1752
movhps [r1], m0
1753
%endif
1754
%endif ; HIGH_BIT_DEPTH
1755
add r3, r4
1756
add r0, r2
1757
add r1, r2
1758
dec r5d
1759
jg .loop4
1760
%if mmsize!=8
1761
RET
1762
%else
1763
sub dword r7m, 4
1764
jg .width8
1765
RET
1766
.width8:
1767
%if ARCH_X86_64
1768
lea r3, [t2+8*SIZEOF_PIXEL]
1769
lea r0, [t0+4*SIZEOF_PIXEL]
1770
lea r1, [t1+4*SIZEOF_PIXEL]
1771
%else
1772
mov r3, r3m
1773
mov r0, r0m
1774
mov r1, r1m
1775
add r3, 8*SIZEOF_PIXEL
1776
add r0, 4*SIZEOF_PIXEL
1777
add r1, 4*SIZEOF_PIXEL
1778
%endif
1779
mov r5d, r8m
1780
jmp .loopx
1781
%endif
1782
1783
%if ARCH_X86_64 ; too many regs for x86_32
1784
RESET_MM_PERMUTATION
1785
%if WIN64
1786
%assign stack_offset stack_offset - stack_size_padded
1787
%assign stack_size_padded 0
1788
%assign xmm_regs_used 0
1789
%endif
1790
.mc1dy:
1791
and t2d, 7
1792
movd m5, t2d
1793
mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
1794
jmp .mc1d
1795
.mc1dx:
1796
movd m5, r5d
1797
mov r6d, 2*SIZEOF_PIXEL
1798
.mc1d:
1799
%if HIGH_BIT_DEPTH && mmsize == 16
1800
WIN64_SPILL_XMM 8
1801
%endif
1802
mova m4, [pw_8]
1803
SPLATW m5, m5
1804
psubw m4, m5
1805
movifnidn r0, r0mp
1806
movifnidn r1, r1mp
1807
movifnidn r2d, r2m
1808
FIX_STRIDES r2
1809
movifnidn r5d, r8m
1810
cmp dword r7m, 4
1811
jg .mc1d_w8
1812
mov r7, r2
1813
mov r8, r4
1814
%if mmsize!=8
1815
shr r5d, 1
1816
%endif
1817
.loop1d_w4:
1818
%if HIGH_BIT_DEPTH
1819
%if mmsize == 8
1820
movq m0, [r3+0]
1821
movq m2, [r3+8]
1822
movq m1, [r3+r6+0]
1823
movq m3, [r3+r6+8]
1824
%else
1825
movu m0, [r3]
1826
movu m1, [r3+r6]
1827
add r3, r8
1828
movu m2, [r3]
1829
movu m3, [r3+r6]
1830
%endif
1831
SBUTTERFLY wd, 0, 2, 6
1832
SBUTTERFLY wd, 1, 3, 7
1833
SBUTTERFLY wd, 0, 2, 6
1834
SBUTTERFLY wd, 1, 3, 7
1835
%if mmsize == 16
1836
SBUTTERFLY wd, 0, 2, 6
1837
SBUTTERFLY wd, 1, 3, 7
1838
%endif
1839
%else ; !HIGH_BIT_DEPTH
1840
movq m0, [r3]
1841
movq m1, [r3+r6]
1842
%if mmsize!=8
1843
add r3, r8
1844
movhps m0, [r3]
1845
movhps m1, [r3+r6]
1846
%endif
1847
psrlw m2, m0, 8
1848
psrlw m3, m1, 8
1849
pand m0, [pw_00ff]
1850
pand m1, [pw_00ff]
1851
%endif ; HIGH_BIT_DEPTH
1852
pmullw m0, m4
1853
pmullw m1, m5
1854
pmullw m2, m4
1855
pmullw m3, m5
1856
paddw m0, [pw_4]
1857
paddw m2, [pw_4]
1858
paddw m0, m1
1859
paddw m2, m3
1860
psrlw m0, 3
1861
psrlw m2, 3
1862
%if HIGH_BIT_DEPTH
1863
%if mmsize == 8
1864
xchg r4, r8
1865
xchg r2, r7
1866
%endif
1867
movq [r0], m0
1868
movq [r1], m2
1869
%if mmsize == 16
1870
add r0, r7
1871
add r1, r7
1872
movhps [r0], m0
1873
movhps [r1], m2
1874
%endif
1875
%else ; !HIGH_BIT_DEPTH
1876
packuswb m0, m2
1877
%if mmsize==8
1878
xchg r4, r8
1879
xchg r2, r7
1880
movd [r0], m0
1881
psrlq m0, 32
1882
movd [r1], m0
1883
%else
1884
movhlps m1, m0
1885
movd [r0], m0
1886
movd [r1], m1
1887
add r0, r7
1888
add r1, r7
1889
psrldq m0, 4
1890
psrldq m1, 4
1891
movd [r0], m0
1892
movd [r1], m1
1893
%endif
1894
%endif ; HIGH_BIT_DEPTH
1895
add r3, r4
1896
add r0, r2
1897
add r1, r2
1898
dec r5d
1899
jg .loop1d_w4
1900
RET
1901
.mc1d_w8:
1902
sub r2, 4*SIZEOF_PIXEL
1903
sub r4, 8*SIZEOF_PIXEL
1904
mov r7, 4*SIZEOF_PIXEL
1905
mov r8, 8*SIZEOF_PIXEL
1906
%if mmsize==8
1907
shl r5d, 1
1908
%endif
1909
jmp .loop1d_w4
1910
%endif ; ARCH_X86_64
1911
%endmacro ; MC_CHROMA
1912
1913
%macro MC_CHROMA_SSSE3 0
1914
cglobal mc_chroma
1915
MC_CHROMA_START 10-cpuflag(avx2)
1916
and r5d, 7
1917
and t2d, 7
1918
mov t0d, r5d
1919
shl t0d, 8
1920
sub t0d, r5d
1921
mov r5d, 8
1922
add t0d, 8
1923
sub r5d, t2d
1924
imul t2d, t0d ; (x*255+8)*y
1925
imul r5d, t0d ; (x*255+8)*(8-y)
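; t0d = x*255 + 8 = (x<<8) + (8-x): both horizontal weights packed into one
; 16-bit value. Scaling it by y and by (8-y) then yields the four bilinear
; coefficients as two byte pairs, ((8-x)*(8-y), x*(8-y)) and ((8-x)*y, x*y),
; ready for pmaddubsw against the ch_shuf-paired samples.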
1926
movd xm6, t2d
1927
movd xm7, r5d
1928
%if cpuflag(cache64)
1929
mov t0d, r3d
1930
and t0d, 7
1931
%ifdef PIC
1932
lea t1, [ch_shuf_adj]
1933
movddup xm5, [t1 + t0*4]
1934
%else
1935
movddup xm5, [ch_shuf_adj + t0*4]
1936
%endif
1937
paddb xm5, [ch_shuf]
1938
and r3, ~7
1939
%else
1940
mova m5, [ch_shuf]
1941
%endif
1942
movifnidn r0, r0mp
1943
movifnidn r1, r1mp
1944
movifnidn r2d, r2m
1945
movifnidn r5d, r8m
1946
%if cpuflag(avx2)
1947
vpbroadcastw m6, xm6
1948
vpbroadcastw m7, xm7
1949
%else
1950
SPLATW m6, m6
1951
SPLATW m7, m7
1952
%endif
1953
%if ARCH_X86_64
1954
%define shiftround m8
1955
mova m8, [pw_512]
1956
%else
1957
%define shiftround [pw_512]
1958
%endif
1959
cmp dword r7m, 4
1960
jg .width8
1961
1962
%if cpuflag(avx2)
1963
.loop4:
1964
movu xm0, [r3]
1965
movu xm1, [r3+r4]
1966
vinserti128 m0, m0, [r3+r4], 1
1967
vinserti128 m1, m1, [r3+r4*2], 1
1968
pshufb m0, m5
1969
pshufb m1, m5
1970
pmaddubsw m0, m7
1971
pmaddubsw m1, m6
1972
paddw m0, m1
1973
pmulhrsw m0, shiftround
1974
packuswb m0, m0
1975
vextracti128 xm1, m0, 1
1976
movd [r0], xm0
1977
movd [r0+r2], xm1
1978
psrldq xm0, 4
1979
psrldq xm1, 4
1980
movd [r1], xm0
1981
movd [r1+r2], xm1
1982
lea r3, [r3+r4*2]
1983
lea r0, [r0+r2*2]
1984
lea r1, [r1+r2*2]
1985
sub r5d, 2
1986
jg .loop4
1987
RET
1988
.width8:
1989
movu xm0, [r3]
1990
vinserti128 m0, m0, [r3+8], 1
1991
pshufb m0, m5
1992
.loop8:
1993
movu xm3, [r3+r4]
1994
vinserti128 m3, m3, [r3+r4+8], 1
1995
pshufb m3, m5
1996
pmaddubsw m1, m0, m7
1997
pmaddubsw m2, m3, m6
1998
pmaddubsw m3, m3, m7
1999
2000
movu xm0, [r3+r4*2]
2001
vinserti128 m0, m0, [r3+r4*2+8], 1
2002
pshufb m0, m5
2003
pmaddubsw m4, m0, m6
2004
2005
paddw m1, m2
2006
paddw m3, m4
2007
pmulhrsw m1, shiftround
2008
pmulhrsw m3, shiftround
2009
packuswb m1, m3
2010
mova m2, [deinterleave_shufd]
2011
vpermd m1, m2, m1
2012
vextracti128 xm2, m1, 1
2013
movq [r0], xm1
2014
movhps [r1], xm1
2015
movq [r0+r2], xm2
2016
movhps [r1+r2], xm2
2017
%else
2018
movu m0, [r3]
2019
pshufb m0, m5
2020
.loop4:
2021
movu m1, [r3+r4]
2022
pshufb m1, m5
2023
movu m3, [r3+r4*2]
2024
pshufb m3, m5
2025
mova m4, m3
2026
pmaddubsw m0, m7
2027
pmaddubsw m2, m1, m7
2028
pmaddubsw m1, m6
2029
pmaddubsw m3, m6
2030
paddw m1, m0
2031
paddw m3, m2
2032
pmulhrsw m1, shiftround
2033
pmulhrsw m3, shiftround
2034
mova m0, m4
2035
packuswb m1, m3
2036
movd [r0], m1
2037
%if cpuflag(sse4)
2038
pextrd [r1], m1, 1
2039
pextrd [r0+r2], m1, 2
2040
pextrd [r1+r2], m1, 3
2041
%else
2042
movhlps m3, m1
2043
movd [r0+r2], m3
2044
psrldq m1, 4
2045
psrldq m3, 4
2046
movd [r1], m1
2047
movd [r1+r2], m3
2048
%endif
2049
lea r3, [r3+r4*2]
2050
lea r0, [r0+r2*2]
2051
lea r1, [r1+r2*2]
2052
sub r5d, 2
2053
jg .loop4
2054
RET
2055
.width8:
2056
movu m0, [r3]
2057
pshufb m0, m5
2058
movu m1, [r3+8]
2059
pshufb m1, m5
2060
%if ARCH_X86_64
2061
SWAP 9, 6
2062
%define mult1 m9
2063
%else
2064
mova r0m, m6
2065
%define mult1 r0m
2066
%endif
2067
.loop8:
2068
movu m2, [r3+r4]
2069
pshufb m2, m5
2070
movu m3, [r3+r4+8]
2071
pshufb m3, m5
2072
mova m4, m2
2073
mova m6, m3
2074
pmaddubsw m0, m7
2075
pmaddubsw m1, m7
2076
pmaddubsw m2, mult1
2077
pmaddubsw m3, mult1
2078
paddw m0, m2
2079
paddw m1, m3
2080
pmulhrsw m0, shiftround ; (x + 32) >> 6
2081
pmulhrsw m1, shiftround
2082
packuswb m0, m1
2083
pshufd m0, m0, q3120
2084
movq [r0], m0
2085
movhps [r1], m0
2086
2087
movu m2, [r3+r4*2]
2088
pshufb m2, m5
2089
movu m3, [r3+r4*2+8]
2090
pshufb m3, m5
2091
mova m0, m2
2092
mova m1, m3
2093
pmaddubsw m4, m7
2094
pmaddubsw m6, m7
2095
pmaddubsw m2, mult1
2096
pmaddubsw m3, mult1
2097
paddw m2, m4
2098
paddw m3, m6
2099
pmulhrsw m2, shiftround
2100
pmulhrsw m3, shiftround
2101
packuswb m2, m3
2102
pshufd m2, m2, q3120
2103
movq [r0+r2], m2
2104
movhps [r1+r2], m2
2105
%endif
2106
lea r3, [r3+r4*2]
2107
lea r0, [r0+r2*2]
2108
lea r1, [r1+r2*2]
2109
sub r5d, 2
2110
jg .loop8
2111
RET
2112
%endmacro
2113
2114
%if HIGH_BIT_DEPTH
2115
INIT_MMX mmx2
2116
MC_CHROMA
2117
INIT_XMM sse2
2118
MC_CHROMA
2119
INIT_XMM avx
2120
MC_CHROMA
2121
%else ; !HIGH_BIT_DEPTH
2122
INIT_MMX mmx2
2123
MC_CHROMA
2124
INIT_XMM sse2
2125
MC_CHROMA
2126
INIT_XMM ssse3
2127
MC_CHROMA_SSSE3
2128
INIT_XMM ssse3, cache64
2129
MC_CHROMA_SSSE3
2130
INIT_XMM avx
2131
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
2132
INIT_YMM avx2
2133
MC_CHROMA_SSSE3
2134
%endif ; HIGH_BIT_DEPTH
2135
2136