;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2016 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pw_1
cextern pw_4
cextern pw_8

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
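; Illustrative C sketch of what the SAD macros below compute; the helper name
; is hypothetical, N/M are the width/height macro parameters, and the strides
; are in pixels (hence the 2*stride byte addressing in the assembly):
;
;   static int sad_NxM( const uint16_t *pix1, intptr_t stride1,
;                       const uint16_t *pix2, intptr_t stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < M; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < N; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }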
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg .loop
%endif
%if %1*%2 == 256
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro

INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16, 8, 1
SAD_MMX 8, 16, 2
SAD_MMX 8, 8, 2
SAD_MMX 8, 4, 2
SAD_MMX 4, 8, 2
SAD_MMX 4, 4, 2
INIT_MMX ssse3
SAD_MMX 4, 8, 2
SAD_MMX 4, 4, 2

;=============================================================================
; SAD XMM
;=============================================================================

%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg .loop
%endif
    HADDW   m0, m1
    movd    eax, xm0
    RET
%endmacro

INIT_XMM sse2
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
SAD 8, 4
INIT_XMM sse2, aligned
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
INIT_XMM ssse3
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
SAD 8, 4
INIT_XMM ssse3, aligned
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
INIT_YMM avx2
SAD 16, 16
SAD 16, 8

;=============================================================================
; SAD x3/x4
;=============================================================================

%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
%endmacro

%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    paddw   m0, m3
    paddw   m1, m4
    paddw   m2, m5
%endmacro

%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m3
    HADDUW  m1, m4
    HADDUW  m2, m5
%else
    HADDW   m0, m3
    HADDW   m1, m4
    HADDW   m2, m5
%endif
%if UNIX64
    movd    [r5+0], xm0
    movd    [r5+4], xm1
    movd    [r5+8], xm2
%else
    mov     r0, r5mp
    movd    [r0+0], xm0
    movd    [r0+4], xm1
    movd    [r0+8], xm2
%endif
    RET
%endmacro

%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
%endmacro

%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m8
%elif cpuflag(ssse3)
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    paddw   m0, m5
    paddw   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    paddw   m2, m5
    paddw   m3, m6
%endif
%endmacro

%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m4
    HADDUW  m1, m5
    HADDUW  m2, m6
    HADDUW  m3, m7
%else
    HADDW   m0, m4
    HADDW   m1, m5
    HADDW   m2, m6
    HADDW   m3, m7
%endif
    mov     r0, r6mp
    movd    [r0+ 0], xm0
    movd    [r0+ 4], xm1
    movd    [r0+ 8], xm2
    movd    [r0+12], xm3
    RET
%endmacro

%macro SAD_X_2xNP 4
%assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
%assign x x+1
%endrep
%endmacro

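; pixel_vsad( src, stride, height ) below (argument names assumed, stride in
; pixels) returns the sum of absolute differences between each 16-pixel-wide
; row and the row directly above it, roughly:
;
;   int score = 0;
;   for( int y = 1; y < height; y++ )
;       for( int x = 0; x < 16; x++ )
;           score += abs( src[y*stride+x] - src[(y-1)*stride+x] );
;   return score;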
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova    m0, [r0]
    mova    m1, [r0+16]
    mova    m2, [r0+2*r1]
    mova    m3, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m0, m2
    psubw   m1, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    paddw   m0, m1
    sub     r2d, 2
    je .end
.loop:
    mova    m4, [r0]
    mova    m5, [r0+16]
    mova    m6, [r0+2*r1]
    mova    m7, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m2, m4
    psubw   m3, m5
    psubw   m4, m6
    psubw   m5, m7
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    ABSW    m4, m4, m1
    ABSW    m5, m5, m1
    paddw   m0, m2
    paddw   m0, m3
    paddw   m0, m4
    paddw   m0, m5
    mova    m2, m6
    mova    m3, m7
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW  m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd    eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD

INIT_YMM avx2
cglobal pixel_vsad, 3,3
    mova    m0, [r0]
    mova    m1, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m0, m1
    pabsw   m0, m0
    sub     r2d, 2
    je .end
.loop:
    mova    m2, [r0]
    mova    m3, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m1, m2
    psubw   m2, m3
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m0, m2
    mova    m1, m3
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1
%else
    HADDUW  m0, m1
%endif
    movd    eax, xm0
    RET

;-----------------------------------------------------------------------------
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
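; Illustrative C sketch of the x3 variant (the x4 variant adds a pix3 pointer
; and a fourth score); fenc uses the fixed FENC_STRIDE and the strides are in
; pixel units:
;
;   const uint16_t *pix[3] = { pix0, pix1, pix2 };
;   for( int i = 0; i < 3; i++ )
;   {
;       int sum = 0;
;       for( int y = 0; y < H; y++ )
;           for( int x = 0; x < W; x++ )
;               sum += abs( fenc[y*FENC_STRIDE+x] - pix[i][y*i_stride+x] );
;       scores[i] = sum;
;   }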
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
%assign regnum %1+1
%xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro

INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4
INIT_MMX ssse3
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 11
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
INIT_XMM xop
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8

;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
;-----------------------------------------------------------------------------

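; The macro below scores the 4x4 fenc block against three intra predictions
; built from the reconstructed neighbours in fdec: vertical (top row repeated
; down), horizontal (left column repeated across) and DC. Results are stored
; as res[0]=V, res[1]=H, res[2]=DC, matching the comments in the scalar tail.
; Illustrative sketch, with top[4]/left[4] standing in for the fdec neighbours:
;
;   int dc = ( top[0]+top[1]+top[2]+top[3] +
;              left[0]+left[1]+left[2]+left[3] + 4 ) >> 3;
;   int sad_v = 0, sad_h = 0, sad_dc = 0;
;   for( int y = 0; y < 4; y++ )
;       for( int x = 0; x < 4; x++ )
;       {
;           int p = fenc[y*FENC_STRIDE+x];
;           sad_v  += abs( p - top[x] );
;           sad_h  += abs( p - left[y] );
;           sad_dc += abs( p - dc );
;       }
;   res[0] = sad_v;  res[1] = sad_h;  res[2] = sad_dc;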
%macro INTRA_SAD_X3_4x4 0
cglobal intra_sad_x3_4x4, 3,3,7
%if cpuflag(ssse3)
    movddup m0, [r1-1*FDEC_STRIDEB]
%else
    movq    m0, [r1-1*FDEC_STRIDEB]
    punpcklqdq m0, m0
%endif
    movq    m1, [r0+0*FENC_STRIDEB]
    movq    m2, [r0+2*FENC_STRIDEB]
    pshuflw m6, m0, q1032
    paddw   m6, m0
    pshuflw m5, m6, q2301
    paddw   m6, m5
    punpcklqdq m6, m6 ; A+B+C+D 8 times
    movhps  m1, [r0+1*FENC_STRIDEB]
    movhps  m2, [r0+3*FENC_STRIDEB]
    psubw   m3, m1, m0
    psubw   m0, m2
    ABSW2   m3, m0, m3, m0, m4, m5
    paddw   m0, m3
    movd    m3, [r1+0*FDEC_STRIDEB-4]
    movd    m4, [r1+2*FDEC_STRIDEB-4]
    movhps  m3, [r1+1*FDEC_STRIDEB-8]
    movhps  m4, [r1+3*FDEC_STRIDEB-8]
    pshufhw m3, m3, q3333
    pshufhw m4, m4, q3333
    pshuflw m3, m3, q1111 ; FF FF EE EE
    pshuflw m4, m4, q1111 ; HH HH GG GG
    paddw   m5, m3, m4
    paddw   m6, [pw_4]
    paddw   m6, m5
    pshufd  m5, m5, q1032
    paddw   m5, m6
    psrlw   m5, 3
    psubw   m6, m5, m2
    psubw   m5, m1
    psubw   m1, m3
    psubw   m2, m4
    ABSW2   m5, m6, m5, m6, m3, m4
    ABSW2   m1, m2, m1, m2, m3, m4
    paddw   m5, m6
    paddw   m1, m2
%if cpuflag(ssse3)
    phaddw  m0, m1
    movhlps m3, m5
    paddw   m5, m3
    phaddw  m0, m5
    pmaddwd m0, [pw_1]
    mova    [r2], m0
%else
    HADDW   m0, m3
    HADDW   m1, m3
    HADDW   m5, m3
    movd    [r2], m0 ; V prediction cost
    movd    [r2+4], m1 ; H prediction cost
    movd    [r2+8], m5 ; DC prediction cost
%endif
    RET
%endmacro

INIT_XMM sse2
INTRA_SAD_X3_4x4
INIT_XMM ssse3
INTRA_SAD_X3_4x4
INIT_XMM avx
INTRA_SAD_X3_4x4

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
;-----------------------------------------------------------------------------

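; Same idea as the 4x4 version above, but on an 8x8 block using the filtered
; edge[] array: the loads below take the left neighbours from edge[7..14] and
; the top row from edge[16..23], the DC level is (sum of 8 top + 8 left + 8)
; >> 4, and the scores are again stored as res[0]=V, res[1]=H, res[2]=DC.
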
;m0 = DC
;m6 = V
;m7 = H
;m1 = DC score
;m2 = V score
;m3 = H score
;m5 = temp
;m4 = pixel row

%macro INTRA_SAD_HVDC_ITER 2
    mova    m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw   m4, m0
    ABSW    m4, m4, m5
    ACCUM   paddw, 1, 4, %1
    mova    m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw   m4, m6
    ABSW    m4, m4, m5
    ACCUM   paddw, 2, 4, %1
    pshufd  m5, m7, %2
    psubw   m5, [r0+(%1-4)*FENC_STRIDEB]
    ABSW    m5, m5, m4
    ACCUM   paddw, 3, 5, %1
%endmacro

%macro INTRA_SAD_X3_8x8 0
cglobal intra_sad_x3_8x8, 3,3,8
    add     r0, 4*FENC_STRIDEB
    movu    m0, [r1+7*SIZEOF_PIXEL]
    mova    m6, [r1+16*SIZEOF_PIXEL] ;V prediction
    mova    m7, m0
    paddw   m0, m6
    punpckhwd m7, m7
    HADDW   m0, m4
    paddw   m0, [pw_8]
    psrlw   m0, 4
    SPLATW  m0, m0
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    movq    m7, [r1+7*SIZEOF_PIXEL]
    punpcklwd m7, m7
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000
%if cpuflag(ssse3)
    phaddw  m2, m3 ; 2 2 2 2 3 3 3 3
    movhlps m3, m1
    paddw   m1, m3 ; 1 1 1 1 _ _ _ _
    phaddw  m2, m1 ; 2 2 3 3 1 1 _ _
    pmaddwd m2, [pw_1] ; 2 3 1 _
    mova    [r2], m2
%else
    HADDW   m2, m4
    HADDW   m3, m4
    HADDW   m1, m4
    movd    [r2+0], m2
    movd    [r2+4], m3
    movd    [r2+8], m1
%endif
    RET
%endmacro

INIT_XMM sse2
INTRA_SAD_X3_8x8
INIT_XMM ssse3
INTRA_SAD_X3_8x8

%macro INTRA_SAD_HVDC_ITER_YMM 2
    mova    xm4, [r0+(%1-4)*FENC_STRIDEB]
    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
    pshufd  m5, m7, %2
    psubw   m5, m4
    pabsw   m5, m5
    ACCUM   paddw, 2, 5, %1 ; H
    psubw   m5, m4, m6
    psubw   m4, m0
    pabsw   m5, m5
    pabsw   m4, m4
    ACCUM   paddw, 1, 5, %1 ; V
    ACCUM   paddw, 3, 4, %1 ; DC
%endmacro

INIT_YMM avx2
cglobal intra_sad_x3_8x8, 3,3,8
    add     r0, 4*FENC_STRIDEB
    movu    xm0, [r1+7*SIZEOF_PIXEL]
    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
    vpermq  m7, m0, q0011
    paddw   xm0, xm6
    paddw   xm0, [pw_1] ; equal to +8 after HADDW
    HADDW   xm0, xm4
    psrld   xm0, 4
    vpbroadcastw m0, xm0
    punpcklwd m7, m7
    INTRA_SAD_HVDC_ITER_YMM 0, q3333
    INTRA_SAD_HVDC_ITER_YMM 1, q2222
    INTRA_SAD_HVDC_ITER_YMM 2, q1111
    INTRA_SAD_HVDC_ITER_YMM 3, q0000
    phaddw  m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
    punpckhqdq m2, m3, m3
    paddw   m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
    phaddw  m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
    vextracti128 xm2, m1, 1
    paddw   xm1, xm2 ; 1 1 2 2 3 3 _ _
    pmaddwd xm1, [pw_1] ; 1 2 3 _
    mova    [r2], xm1
    RET