;*****************************************************************************
;* sad-a.asm: x86 sad functions
;*****************************************************************************
;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
hpred_shuf:   db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11

SECTION .text

cextern pb_3
cextern pb_shuf8x8c
cextern pw_8
cextern sw_64

;=============================================================================
; SAD MMX
;=============================================================================
47
48
%macro SAD_INC_2x16P 0
49
movq mm1, [r0]
50
movq mm2, [r0+8]
51
movq mm3, [r0+r1]
52
movq mm4, [r0+r1+8]
53
psadbw mm1, [r2]
54
psadbw mm2, [r2+8]
55
psadbw mm3, [r2+r3]
56
psadbw mm4, [r2+r3+8]
57
lea r0, [r0+2*r1]
58
paddw mm1, mm2
59
paddw mm3, mm4
60
lea r2, [r2+2*r3]
61
paddw mm0, mm1
62
paddw mm0, mm3
63
%endmacro
64
65
%macro SAD_INC_2x8P 0
66
movq mm1, [r0]
67
movq mm2, [r0+r1]
68
psadbw mm1, [r2]
69
psadbw mm2, [r2+r3]
70
lea r0, [r0+2*r1]
71
paddw mm0, mm1
72
paddw mm0, mm2
73
lea r2, [r2+2*r3]
74
%endmacro
75
76
%macro SAD_INC_2x4P 0
77
movd mm1, [r0]
78
movd mm2, [r2]
79
punpckldq mm1, [r0+r1]
80
punpckldq mm2, [r2+r3]
81
psadbw mm1, mm2
82
paddw mm0, mm1
83
lea r0, [r0+2*r1]
84
lea r2, [r2+2*r3]
85
%endmacro
86
87
;-----------------------------------------------------------------------------
88
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
89
;-----------------------------------------------------------------------------
90
%macro SAD 2
91
cglobal pixel_sad_%1x%2_mmx2, 4,4
92
pxor mm0, mm0
93
%rep %2/2
94
SAD_INC_2x%1P
95
%endrep
96
movd eax, mm0
97
RET
98
%endmacro
99
100
SAD 16, 16
101
SAD 16, 8
102
SAD 8, 16
103
SAD 8, 8
104
SAD 8, 4
105
SAD 4, 16
106
SAD 4, 8
107
SAD 4, 4
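
; For reference, each pixel_sad_WxH instantiated above computes a plain sum of
; absolute differences between two WxH blocks of pixels with independent
; strides, accumulated via PSADBW. A rough C sketch of the same result
; (illustrative only, not part of the build; names are hypothetical):
;
;   static int sad_WxH( const uint8_t *pix1, intptr_t stride1,
;                       const uint8_t *pix2, intptr_t stride2, int w, int h )
;   {
;       int sum = 0;
;       for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < w; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }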
108
109
110
111
;=============================================================================
112
; SAD XMM
113
;=============================================================================
114
115
%macro SAD_END_SSE2 0
116
MOVHL m1, m0
117
paddw m0, m1
118
movd eax, m0
119
RET
120
%endmacro
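
; Note: PSADBW on an XMM register produces two partial sums, one in the low
; word of each 64-bit half, so SAD_END_SSE2 folds the high half onto the low
; half (MOVHL + paddw) before extracting the scalar result with movd.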
121
122
%macro SAD_W16 0
123
;-----------------------------------------------------------------------------
124
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
125
;-----------------------------------------------------------------------------
126
cglobal pixel_sad_16x16, 4,4,8
127
movu m0, [r2]
128
movu m1, [r2+r3]
129
lea r2, [r2+2*r3]
130
movu m2, [r2]
131
movu m3, [r2+r3]
132
lea r2, [r2+2*r3]
133
psadbw m0, [r0]
134
psadbw m1, [r0+r1]
135
lea r0, [r0+2*r1]
136
movu m4, [r2]
137
paddw m0, m1
138
psadbw m2, [r0]
139
psadbw m3, [r0+r1]
140
lea r0, [r0+2*r1]
141
movu m5, [r2+r3]
142
lea r2, [r2+2*r3]
143
paddw m2, m3
144
movu m6, [r2]
145
movu m7, [r2+r3]
146
lea r2, [r2+2*r3]
147
paddw m0, m2
148
psadbw m4, [r0]
149
psadbw m5, [r0+r1]
150
lea r0, [r0+2*r1]
151
movu m1, [r2]
152
paddw m4, m5
153
psadbw m6, [r0]
154
psadbw m7, [r0+r1]
155
lea r0, [r0+2*r1]
156
movu m2, [r2+r3]
157
lea r2, [r2+2*r3]
158
paddw m6, m7
159
movu m3, [r2]
160
paddw m0, m4
161
movu m4, [r2+r3]
162
lea r2, [r2+2*r3]
163
paddw m0, m6
164
psadbw m1, [r0]
165
psadbw m2, [r0+r1]
166
lea r0, [r0+2*r1]
167
movu m5, [r2]
168
paddw m1, m2
169
psadbw m3, [r0]
170
psadbw m4, [r0+r1]
171
lea r0, [r0+2*r1]
172
movu m6, [r2+r3]
173
lea r2, [r2+2*r3]
174
paddw m3, m4
175
movu m7, [r2]
176
paddw m0, m1
177
movu m1, [r2+r3]
178
paddw m0, m3
179
psadbw m5, [r0]
180
psadbw m6, [r0+r1]
181
lea r0, [r0+2*r1]
182
paddw m5, m6
183
psadbw m7, [r0]
184
psadbw m1, [r0+r1]
185
paddw m7, m1
186
paddw m0, m5
187
paddw m0, m7
188
SAD_END_SSE2
189
190
;-----------------------------------------------------------------------------
191
; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
192
;-----------------------------------------------------------------------------
193
cglobal pixel_sad_16x8, 4,4
194
movu m0, [r2]
195
movu m2, [r2+r3]
196
lea r2, [r2+2*r3]
197
movu m3, [r2]
198
movu m4, [r2+r3]
199
psadbw m0, [r0]
200
psadbw m2, [r0+r1]
201
lea r0, [r0+2*r1]
202
psadbw m3, [r0]
203
psadbw m4, [r0+r1]
204
lea r0, [r0+2*r1]
205
lea r2, [r2+2*r3]
206
paddw m0, m2
207
paddw m3, m4
208
paddw m0, m3
209
movu m1, [r2]
210
movu m2, [r2+r3]
211
lea r2, [r2+2*r3]
212
movu m3, [r2]
213
movu m4, [r2+r3]
214
psadbw m1, [r0]
215
psadbw m2, [r0+r1]
216
lea r0, [r0+2*r1]
217
psadbw m3, [r0]
218
psadbw m4, [r0+r1]
219
lea r0, [r0+2*r1]
220
lea r2, [r2+2*r3]
221
paddw m1, m2
222
paddw m3, m4
223
paddw m0, m1
224
paddw m0, m3
225
SAD_END_SSE2
226
%endmacro
227
228
INIT_XMM sse2
229
SAD_W16
230
INIT_XMM sse3
231
SAD_W16
232
INIT_XMM sse2, aligned
233
SAD_W16
234
235
%macro SAD_INC_4x8P_SSE 1
236
movq m1, [r0]
237
movq m2, [r0+r1]
238
lea r0, [r0+2*r1]
239
movq m3, [r2]
240
movq m4, [r2+r3]
241
lea r2, [r2+2*r3]
242
movhps m1, [r0]
243
movhps m2, [r0+r1]
244
movhps m3, [r2]
245
movhps m4, [r2+r3]
246
lea r0, [r0+2*r1]
247
psadbw m1, m3
248
psadbw m2, m4
249
lea r2, [r2+2*r3]
250
ACCUM paddw, 0, 1, %1
251
paddw m0, m2
252
%endmacro
253
254
INIT_XMM
255
; Even on Nehalem, no sizes other than 8x16 benefit from this method.
256
cglobal pixel_sad_8x16_sse2, 4,4
257
SAD_INC_4x8P_SSE 0
258
SAD_INC_4x8P_SSE 1
259
SAD_INC_4x8P_SSE 1
260
SAD_INC_4x8P_SSE 1
261
SAD_END_SSE2
262
RET
263
264
;-----------------------------------------------------------------------------
265
; int pixel_vsad( pixel *src, intptr_t stride, int height );
266
;-----------------------------------------------------------------------------
267
268
%if ARCH_X86_64 == 0
269
INIT_MMX
270
cglobal pixel_vsad_mmx2, 3,3
271
mova m0, [r0]
272
mova m1, [r0+8]
273
mova m2, [r0+r1]
274
mova m3, [r0+r1+8]
275
lea r0, [r0+r1*2]
276
psadbw m0, m2
277
psadbw m1, m3
278
paddw m0, m1
279
sub r2d, 2
280
je .end
281
.loop:
282
mova m4, [r0]
283
mova m5, [r0+8]
284
mova m6, [r0+r1]
285
mova m7, [r0+r1+8]
286
lea r0, [r0+r1*2]
287
psadbw m2, m4
288
psadbw m3, m5
289
psadbw m4, m6
290
psadbw m5, m7
291
;max sum: 31*16*255(pixel_max)=126480
292
paddd m0, m2
293
paddd m0, m3
294
paddd m0, m4
295
paddd m0, m5
296
mova m2, m6
297
mova m3, m7
298
sub r2d, 2
299
jg .loop
300
.end:
301
movd eax, m0
302
RET
303
%endif
304
305
INIT_XMM
306
cglobal pixel_vsad_sse2, 3,3
307
mova m0, [r0]
308
mova m1, [r0+r1]
309
lea r0, [r0+r1*2]
310
psadbw m0, m1
311
sub r2d, 2
312
je .end
313
.loop:
314
mova m2, [r0]
315
mova m3, [r0+r1]
316
lea r0, [r0+r1*2]
317
psadbw m1, m2
318
psadbw m2, m3
319
paddw m0, m1
320
paddw m0, m2
321
mova m1, m3
322
sub r2d, 2
323
jg .loop
324
.end:
325
MOVHL m1, m0
326
;max sum: 31*16*255(pixel_max)=126480
327
paddd m0, m1
328
movd eax, m0
329
RET
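
; For reference, pixel_vsad sums the absolute differences between every pair
; of vertically adjacent rows over a width of 16 pixels. A rough C equivalent
; (illustrative sketch only, not part of the build):
;
;   static int vsad( const uint8_t *src, intptr_t stride, int height )
;   {
;       int sum = 0;
;       for( int y = 0; y < height-1; y++ )
;           for( int x = 0; x < 16; x++ )
;               sum += abs( src[y*stride+x] - src[(y+1)*stride+x] );
;       return sum;
;   }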
330
331
;-----------------------------------------------------------------------------
332
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
333
;-----------------------------------------------------------------------------
334
335
cglobal intra_sad_x3_4x4_mmx2, 3,3
336
pxor mm7, mm7
337
movd mm0, [r1-FDEC_STRIDE]
338
movd mm1, [r0+FENC_STRIDE*0]
339
movd mm2, [r0+FENC_STRIDE*2]
340
punpckldq mm0, mm0
341
punpckldq mm1, [r0+FENC_STRIDE*1]
342
punpckldq mm2, [r0+FENC_STRIDE*3]
343
movq mm6, mm0
344
movq mm3, mm1
345
psadbw mm3, mm0
346
psadbw mm0, mm2
347
paddw mm0, mm3
348
movd [r2], mm0 ;V prediction cost
349
movd mm3, [r1+FDEC_STRIDE*0-4]
350
movd mm0, [r1+FDEC_STRIDE*1-4]
351
movd mm4, [r1+FDEC_STRIDE*2-4]
352
movd mm5, [r1+FDEC_STRIDE*3-4]
353
punpcklbw mm3, mm0
354
punpcklbw mm4, mm5
355
movq mm5, mm3
356
punpckhwd mm5, mm4
357
punpckhdq mm5, mm6
358
psadbw mm5, mm7
359
punpckhbw mm3, mm3
360
punpckhbw mm4, mm4
361
punpckhwd mm3, mm3
362
punpckhwd mm4, mm4
363
psraw mm5, 2
364
pavgw mm5, mm7
365
punpcklbw mm5, mm5
366
pshufw mm5, mm5, 0 ;DC prediction
367
movq mm6, mm5
368
psadbw mm5, mm1
369
psadbw mm6, mm2
370
psadbw mm1, mm3
371
psadbw mm2, mm4
372
paddw mm5, mm6
373
paddw mm1, mm2
374
movd [r2+8], mm5 ;DC prediction cost
375
movd [r2+4], mm1 ;H prediction cost
376
RET
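
; For reference, this returns the 4x4 SADs against the three intra predictions
; in the order res[0]=V, res[1]=H, res[2]=DC, where DC is the usual average of
; the 4 top and 4 left neighbours, (sum+4)>>3. A rough C sketch (illustrative
; only, not part of the build):
;
;   static void intra_sad_x3_4x4_c( const uint8_t *fenc, const uint8_t *fdec, int res[3] )
;   {
;       const uint8_t *top = fdec - FDEC_STRIDE;
;       int v = 0, h = 0, dc = 0, sum = 0;
;       for( int i = 0; i < 4; i++ )
;           sum += top[i] + fdec[i*FDEC_STRIDE-1];
;       int pred_dc = (sum+4) >> 3;
;       for( int y = 0; y < 4; y++ )
;           for( int x = 0; x < 4; x++ )
;           {
;               int p = fenc[y*FENC_STRIDE+x];
;               v  += abs( p - top[x] );
;               h  += abs( p - fdec[y*FDEC_STRIDE-1] );
;               dc += abs( p - pred_dc );
;           }
;       res[0] = v; res[1] = h; res[2] = dc;
;   }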
377
378
;-----------------------------------------------------------------------------
379
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
380
;-----------------------------------------------------------------------------
381
382
;m0 = DC
383
;m6 = V
384
;m7 = H
385
;m1 = DC score
386
;m2 = V score
387
;m3 = H score
388
;m5 = pixel row
389
;m4 = temp
390
391
%macro INTRA_SAD_HVDC_ITER 2
392
movq m5, [r0+FENC_STRIDE*%1]
393
movq m4, m5
394
psadbw m4, m0
395
ACCUM paddw, 1, 4, %1
396
movq m4, m5
397
psadbw m4, m6
398
ACCUM paddw, 2, 4, %1
399
pshufw m4, m7, %2
400
psadbw m5, m4
401
ACCUM paddw, 3, 5, %1
402
%endmacro
403
404
INIT_MMX
405
cglobal intra_sad_x3_8x8_mmx2, 3,3
406
movq m7, [r1+7]
407
pxor m0, m0
408
movq m6, [r1+16] ;V prediction
409
pxor m1, m1
410
psadbw m0, m7
411
psadbw m1, m6
412
paddw m0, m1
413
paddw m0, [pw_8]
414
psrlw m0, 4
415
punpcklbw m0, m0
416
pshufw m0, m0, q0000 ;DC prediction
417
punpckhbw m7, m7
418
INTRA_SAD_HVDC_ITER 0, q3333
419
INTRA_SAD_HVDC_ITER 1, q2222
420
INTRA_SAD_HVDC_ITER 2, q1111
421
INTRA_SAD_HVDC_ITER 3, q0000
422
movq m7, [r1+7]
423
punpcklbw m7, m7
424
INTRA_SAD_HVDC_ITER 4, q3333
425
INTRA_SAD_HVDC_ITER 5, q2222
426
INTRA_SAD_HVDC_ITER 6, q1111
427
INTRA_SAD_HVDC_ITER 7, q0000
428
movd [r2+0], m2
429
movd [r2+4], m3
430
movd [r2+8], m1
431
RET
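
; Output layout matches the 4x4 version: res[0]=V, res[1]=H, res[2]=DC, with
; DC = (sum of the 16 neighbouring edge[] pixels + 8) >> 4 (the paddw/psrlw
; pair above). m7 is reloaded and re-unpacked halfway through because one
; PSHUFW can only address 4 of the 8 left-neighbour bytes at a time.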
432
433
;-----------------------------------------------------------------------------
434
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
435
;-----------------------------------------------------------------------------
436
437
%macro INTRA_SAD_HV_ITER 1
438
%if cpuflag(ssse3)
439
movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
440
movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
441
pshufb m1, m7
442
pshufb m3, m7
443
%else
444
movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
445
movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
446
punpckhbw m1, m1
447
punpckhbw m3, m3
448
pshufw m1, m1, q3333
449
pshufw m3, m3, q3333
450
%endif
451
movq m4, [r0 + FENC_STRIDE*(%1+0)]
452
movq m5, [r0 + FENC_STRIDE*(%1+1)]
453
psadbw m1, m4
454
psadbw m3, m5
455
psadbw m4, m6
456
psadbw m5, m6
457
paddw m1, m3
458
paddw m4, m5
459
ACCUM paddw, 0, 1, %1
460
ACCUM paddw, 2, 4, %1
461
%endmacro
462
463
%macro INTRA_SAD_8x8C 0
464
cglobal intra_sad_x3_8x8c, 3,3
465
movq m6, [r1 - FDEC_STRIDE]
466
add r1, FDEC_STRIDE*4
467
%if cpuflag(ssse3)
468
movq m7, [pb_3]
469
%endif
470
INTRA_SAD_HV_ITER 0
471
INTRA_SAD_HV_ITER 2
472
INTRA_SAD_HV_ITER 4
473
INTRA_SAD_HV_ITER 6
474
movd [r2+4], m0
475
movd [r2+8], m2
476
pxor m7, m7
477
movq m2, [r1 + FDEC_STRIDE*-4 - 8]
478
movq m4, [r1 + FDEC_STRIDE*-2 - 8]
479
movq m3, [r1 + FDEC_STRIDE* 0 - 8]
480
movq m5, [r1 + FDEC_STRIDE* 2 - 8]
481
punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
482
punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
483
punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
484
punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
485
punpckhbw m2, m4
486
punpckhbw m3, m5
487
psrlq m2, 32
488
psrlq m3, 32
489
psadbw m2, m7 ; s2
490
psadbw m3, m7 ; s3
491
movq m1, m6
492
SWAP 0, 6
493
punpckldq m0, m7
494
punpckhdq m1, m7
495
psadbw m0, m7 ; s0
496
psadbw m1, m7 ; s1
497
punpcklwd m0, m1
498
punpcklwd m2, m3
499
punpckldq m0, m2 ;s0 s1 s2 s3
500
pshufw m3, m0, q3312 ;s2,s1,s3,s3
501
pshufw m0, m0, q1310 ;s0,s1,s3,s1
502
paddw m0, m3
503
psrlw m0, 2
504
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
505
%if cpuflag(ssse3)
506
movq2dq xmm0, m0
507
pshufb xmm0, [pb_shuf8x8c]
508
movq xmm1, [r0+FENC_STRIDE*0]
509
movq xmm2, [r0+FENC_STRIDE*1]
510
movq xmm3, [r0+FENC_STRIDE*2]
511
movq xmm4, [r0+FENC_STRIDE*3]
512
movhps xmm1, [r0+FENC_STRIDE*4]
513
movhps xmm2, [r0+FENC_STRIDE*5]
514
movhps xmm3, [r0+FENC_STRIDE*6]
515
movhps xmm4, [r0+FENC_STRIDE*7]
516
psadbw xmm1, xmm0
517
psadbw xmm2, xmm0
518
psadbw xmm3, xmm0
519
psadbw xmm4, xmm0
520
paddw xmm1, xmm2
521
paddw xmm1, xmm3
522
paddw xmm1, xmm4
523
MOVHL xmm0, xmm1
524
paddw xmm1, xmm0
525
movd [r2], xmm1
526
%else
527
packuswb m0, m0
528
punpcklbw m0, m0
529
movq m1, m0
530
punpcklbw m0, m0 ; 4x dc0 4x dc1
531
punpckhbw m1, m1 ; 4x dc2 4x dc3
532
movq m2, [r0+FENC_STRIDE*0]
533
movq m3, [r0+FENC_STRIDE*1]
534
movq m4, [r0+FENC_STRIDE*2]
535
movq m5, [r0+FENC_STRIDE*3]
536
movq m6, [r0+FENC_STRIDE*4]
537
movq m7, [r0+FENC_STRIDE*5]
538
psadbw m2, m0
539
psadbw m3, m0
540
psadbw m4, m0
541
psadbw m5, m0
542
movq m0, [r0+FENC_STRIDE*6]
543
psadbw m6, m1
544
psadbw m7, m1
545
psadbw m0, m1
546
psadbw m1, [r0+FENC_STRIDE*7]
547
paddw m2, m3
548
paddw m4, m5
549
paddw m6, m7
550
paddw m0, m1
551
paddw m2, m4
552
paddw m6, m0
553
paddw m2, m6
554
movd [r2], m2
555
%endif
556
RET
557
%endmacro
558
559
INIT_MMX mmx2
560
INTRA_SAD_8x8C
561
INIT_MMX ssse3
562
INTRA_SAD_8x8C
563
564
INIT_YMM avx2
565
cglobal intra_sad_x3_8x8c, 3,3,7
566
vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred
567
add r1, FDEC_STRIDE*4-1
568
pxor xm5, xm5
569
punpckldq xm3, xm2, xm5 ; V0 _ V1 _
570
movd xm0, [r1 + FDEC_STRIDE*-1 - 3]
571
movd xm1, [r1 + FDEC_STRIDE* 3 - 3]
572
pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0
573
pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0
574
pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1
575
pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1
576
pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2
577
pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2
578
punpcklqdq xm0, xm1 ; H0 _ H1 _
579
vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1
580
pshufb xm0, [hpred_shuf] ; H00224466 H11335577
581
psadbw m3, m5 ; s0 s1 s2 s3
582
vpermq m4, m3, q3312 ; s2 s1 s3 s3
583
vpermq m3, m3, q1310 ; s0 s1 s3 s1
584
paddw m3, m4
585
psrlw m3, 2
586
pavgw m3, m5 ; s0+s2 s1 s3 s1+s3
587
pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _
588
vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V
589
vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V
590
vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V
591
vpermq m0, m0, q3120 ; H00224466 _ H11335577 _
592
movddup m2, [r0+FENC_STRIDE*0]
593
movddup m4, [r0+FENC_STRIDE*2]
594
pshuflw m3, m0, q0000
595
psadbw m3, m2
596
psadbw m2, m1
597
pshuflw m5, m0, q1111
598
psadbw m5, m4
599
psadbw m4, m1
600
paddw m2, m4
601
paddw m3, m5
602
movddup m4, [r0+FENC_STRIDE*4]
603
pshuflw m5, m0, q2222
604
psadbw m5, m4
605
psadbw m4, m6
606
paddw m2, m4
607
paddw m3, m5
608
movddup m4, [r0+FENC_STRIDE*6]
609
pshuflw m5, m0, q3333
610
psadbw m5, m4
611
psadbw m4, m6
612
paddw m2, m4
613
paddw m3, m5
614
vextracti128 xm0, m2, 1
615
vextracti128 xm1, m3, 1
616
paddw xm2, xm0 ; DC V
617
paddw xm3, xm1 ; H
618
pextrd [r2+8], xm2, 2 ; V
619
movd [r2+4], xm3 ; H
620
movd [r2+0], xm2 ; DC
621
RET
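
; Note that the chroma 8x8 functions store their results in a different order
; from the luma ones: res[0]=DC, res[1]=H, res[2]=V. The DC prediction is
; built from four per-quadrant DC values (derived from the s0..s3 half-row and
; half-column sums computed above), as 8x8 chroma DC prediction requires.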
622
623
624
;-----------------------------------------------------------------------------
625
; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
626
;-----------------------------------------------------------------------------
627
628
;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
629
;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
630
%macro INTRA_SAD16 0
631
cglobal intra_sad_x3_16x16, 3,5,8
632
pxor mm0, mm0
633
pxor mm1, mm1
634
psadbw mm0, [r1-FDEC_STRIDE+0]
635
psadbw mm1, [r1-FDEC_STRIDE+8]
636
paddw mm0, mm1
637
movd r3d, mm0
638
%if cpuflag(ssse3)
639
mova m1, [pb_3]
640
%endif
641
%assign x 0
642
%rep 16
643
movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
644
%if (x&3)==3 && x!=15
645
add r1, FDEC_STRIDE*4
646
%endif
647
add r3d, r4d
648
%assign x x+1
649
%endrep
650
sub r1, FDEC_STRIDE*12
651
add r3d, 16
652
shr r3d, 5
653
imul r3d, 0x01010101
654
movd m7, r3d
655
mova m5, [r1-FDEC_STRIDE]
656
%if mmsize==16
657
pshufd m7, m7, 0
658
%else
659
mova m1, [r1-FDEC_STRIDE+8]
660
punpckldq m7, m7
661
%endif
662
pxor m4, m4
663
pxor m3, m3
664
pxor m2, m2
665
mov r3d, 15*FENC_STRIDE
666
.vloop:
667
SPLATB_LOAD m6, r1+r3*2-1, m1
668
mova m0, [r0+r3]
669
psadbw m0, m7
670
paddw m4, m0
671
mova m0, [r0+r3]
672
psadbw m0, m5
673
paddw m2, m0
674
%if mmsize==8
675
mova m0, [r0+r3]
676
psadbw m0, m6
677
paddw m3, m0
678
mova m0, [r0+r3+8]
679
psadbw m0, m7
680
paddw m4, m0
681
mova m0, [r0+r3+8]
682
psadbw m0, m1
683
paddw m2, m0
684
psadbw m6, [r0+r3+8]
685
paddw m3, m6
686
%else
687
psadbw m6, [r0+r3]
688
paddw m3, m6
689
%endif
690
add r3d, -FENC_STRIDE
691
jge .vloop
692
%if mmsize==16
693
pslldq m3, 4
694
por m3, m2
695
MOVHL m1, m3
696
paddw m3, m1
697
movq [r2+0], m3
698
MOVHL m1, m4
699
paddw m4, m1
700
%else
701
movd [r2+0], m2
702
movd [r2+4], m3
703
%endif
704
movd [r2+8], m4
705
RET
706
%endmacro
707
708
INIT_MMX mmx2
709
INTRA_SAD16
710
INIT_XMM sse2
711
INTRA_SAD16
712
INIT_XMM ssse3
713
INTRA_SAD16
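
; The INTRA_SAD16 implementations derive the DC predictor as
;   dc = ( sum(16 top neighbours) + sum(16 left neighbours) + 16 ) >> 5
; (the imul by 0x01010101 merely broadcasts that byte across the register),
; then accumulate PSADBW results row by row against the V, H and DC
; predictions and store the scores as res[0]=V, res[1]=H, res[2]=DC, the same
; layout as the 4x4 and 8x8 versions.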
714
715
INIT_YMM avx2
716
cglobal intra_sad_x3_16x16, 3,5,6
717
pxor xm0, xm0
718
psadbw xm0, [r1-FDEC_STRIDE]
719
MOVHL xm1, xm0
720
paddw xm0, xm1
721
movd r3d, xm0
722
%assign x 0
723
%rep 16
724
movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
725
%if (x&3)==3 && x!=15
726
add r1, FDEC_STRIDE*4
727
%endif
728
add r3d, r4d
729
%assign x x+1
730
%endrep
731
sub r1, FDEC_STRIDE*12
732
add r3d, 16
733
shr r3d, 5
734
movd xm5, r3d
735
vpbroadcastb xm5, xm5
736
vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction
737
738
pxor m4, m4 ; DC / V accumulator
739
pxor xm3, xm3 ; H accumulator
740
mov r3d, 15*FENC_STRIDE
741
.vloop:
742
vpbroadcastb xm2, [r1+r3*2-1]
743
vbroadcasti128 m0, [r0+r3]
744
psadbw m1, m0, m5
745
psadbw xm0, xm2
746
paddw m4, m1
747
paddw xm3, xm0
748
add r3d, -FENC_STRIDE
749
jge .vloop
750
punpckhqdq m5, m4, m4
751
MOVHL xm2, xm3
752
paddw m4, m5 ; DC / V
753
paddw xm3, xm2 ; H
754
vextracti128 xm2, m4, 1
755
movd [r2+0], xm2
756
movd [r2+4], xm3
757
movd [r2+8], xm4
758
RET
759
760
;=============================================================================
761
; SAD x3/x4 MMX
762
;=============================================================================
763
764
%macro SAD_X3_START_1x8P 0
765
movq mm3, [r0]
766
movq mm0, [r1]
767
movq mm1, [r2]
768
movq mm2, [r3]
769
psadbw mm0, mm3
770
psadbw mm1, mm3
771
psadbw mm2, mm3
772
%endmacro
773
774
%macro SAD_X3_1x8P 2
775
movq mm3, [r0+%1]
776
movq mm4, [r1+%2]
777
movq mm5, [r2+%2]
778
movq mm6, [r3+%2]
779
psadbw mm4, mm3
780
psadbw mm5, mm3
781
psadbw mm6, mm3
782
paddw mm0, mm4
783
paddw mm1, mm5
784
paddw mm2, mm6
785
%endmacro
786
787
%macro SAD_X3_START_2x4P 3
788
movd mm3, [r0]
789
movd %1, [r1]
790
movd %2, [r2]
791
movd %3, [r3]
792
punpckldq mm3, [r0+FENC_STRIDE]
793
punpckldq %1, [r1+r4]
794
punpckldq %2, [r2+r4]
795
punpckldq %3, [r3+r4]
796
psadbw %1, mm3
797
psadbw %2, mm3
798
psadbw %3, mm3
799
%endmacro
800
801
%macro SAD_X3_2x16P 1
802
%if %1
803
SAD_X3_START_1x8P
804
%else
805
SAD_X3_1x8P 0, 0
806
%endif
807
SAD_X3_1x8P 8, 8
808
SAD_X3_1x8P FENC_STRIDE, r4
809
SAD_X3_1x8P FENC_STRIDE+8, r4+8
810
add r0, 2*FENC_STRIDE
811
lea r1, [r1+2*r4]
812
lea r2, [r2+2*r4]
813
lea r3, [r3+2*r4]
814
%endmacro
815
816
%macro SAD_X3_2x8P 1
817
%if %1
818
SAD_X3_START_1x8P
819
%else
820
SAD_X3_1x8P 0, 0
821
%endif
822
SAD_X3_1x8P FENC_STRIDE, r4
823
add r0, 2*FENC_STRIDE
824
lea r1, [r1+2*r4]
825
lea r2, [r2+2*r4]
826
lea r3, [r3+2*r4]
827
%endmacro
828
829
%macro SAD_X3_2x4P 1
830
%if %1
831
SAD_X3_START_2x4P mm0, mm1, mm2
832
%else
833
SAD_X3_START_2x4P mm4, mm5, mm6
834
paddw mm0, mm4
835
paddw mm1, mm5
836
paddw mm2, mm6
837
%endif
838
add r0, 2*FENC_STRIDE
839
lea r1, [r1+2*r4]
840
lea r2, [r2+2*r4]
841
lea r3, [r3+2*r4]
842
%endmacro
843
844
%macro SAD_X4_START_1x8P 0
845
movq mm7, [r0]
846
movq mm0, [r1]
847
movq mm1, [r2]
848
movq mm2, [r3]
849
movq mm3, [r4]
850
psadbw mm0, mm7
851
psadbw mm1, mm7
852
psadbw mm2, mm7
853
psadbw mm3, mm7
854
%endmacro
855
856
%macro SAD_X4_1x8P 2
857
movq mm7, [r0+%1]
858
movq mm4, [r1+%2]
859
movq mm5, [r2+%2]
860
movq mm6, [r3+%2]
861
psadbw mm4, mm7
862
psadbw mm5, mm7
863
psadbw mm6, mm7
864
psadbw mm7, [r4+%2]
865
paddw mm0, mm4
866
paddw mm1, mm5
867
paddw mm2, mm6
868
paddw mm3, mm7
869
%endmacro
870
871
%macro SAD_X4_START_2x4P 0
872
movd mm7, [r0]
873
movd mm0, [r1]
874
movd mm1, [r2]
875
movd mm2, [r3]
876
movd mm3, [r4]
877
punpckldq mm7, [r0+FENC_STRIDE]
878
punpckldq mm0, [r1+r5]
879
punpckldq mm1, [r2+r5]
880
punpckldq mm2, [r3+r5]
881
punpckldq mm3, [r4+r5]
882
psadbw mm0, mm7
883
psadbw mm1, mm7
884
psadbw mm2, mm7
885
psadbw mm3, mm7
886
%endmacro
887
888
%macro SAD_X4_INC_2x4P 0
889
movd mm7, [r0]
890
movd mm4, [r1]
891
movd mm5, [r2]
892
punpckldq mm7, [r0+FENC_STRIDE]
893
punpckldq mm4, [r1+r5]
894
punpckldq mm5, [r2+r5]
895
psadbw mm4, mm7
896
psadbw mm5, mm7
897
paddw mm0, mm4
898
paddw mm1, mm5
899
movd mm4, [r3]
900
movd mm5, [r4]
901
punpckldq mm4, [r3+r5]
902
punpckldq mm5, [r4+r5]
903
psadbw mm4, mm7
904
psadbw mm5, mm7
905
paddw mm2, mm4
906
paddw mm3, mm5
907
%endmacro
908
909
%macro SAD_X4_2x16P 1
910
%if %1
911
SAD_X4_START_1x8P
912
%else
913
SAD_X4_1x8P 0, 0
914
%endif
915
SAD_X4_1x8P 8, 8
916
SAD_X4_1x8P FENC_STRIDE, r5
917
SAD_X4_1x8P FENC_STRIDE+8, r5+8
918
add r0, 2*FENC_STRIDE
919
lea r1, [r1+2*r5]
920
lea r2, [r2+2*r5]
921
lea r3, [r3+2*r5]
922
lea r4, [r4+2*r5]
923
%endmacro
924
925
%macro SAD_X4_2x8P 1
926
%if %1
927
SAD_X4_START_1x8P
928
%else
929
SAD_X4_1x8P 0, 0
930
%endif
931
SAD_X4_1x8P FENC_STRIDE, r5
932
add r0, 2*FENC_STRIDE
933
lea r1, [r1+2*r5]
934
lea r2, [r2+2*r5]
935
lea r3, [r3+2*r5]
936
lea r4, [r4+2*r5]
937
%endmacro
938
939
%macro SAD_X4_2x4P 1
940
%if %1
941
SAD_X4_START_2x4P
942
%else
943
SAD_X4_INC_2x4P
944
%endif
945
add r0, 2*FENC_STRIDE
946
lea r1, [r1+2*r5]
947
lea r2, [r2+2*r5]
948
lea r3, [r3+2*r5]
949
lea r4, [r4+2*r5]
950
%endmacro
951
952
%macro SAD_X3_END 0
953
%if UNIX64
954
movd [r5+0], mm0
955
movd [r5+4], mm1
956
movd [r5+8], mm2
957
%else
958
mov r0, r5mp
959
movd [r0+0], mm0
960
movd [r0+4], mm1
961
movd [r0+8], mm2
962
%endif
963
RET
964
%endmacro
965
966
%macro SAD_X4_END 0
967
mov r0, r6mp
968
movd [r0+0], mm0
969
movd [r0+4], mm1
970
movd [r0+8], mm2
971
movd [r0+12], mm3
972
RET
973
%endmacro
974
975
;-----------------------------------------------------------------------------
976
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
977
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
978
;-----------------------------------------------------------------------------
979
%macro SAD_X 3
980
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
981
SAD_X%1_2x%2P 1
982
%rep %3/2-1
983
SAD_X%1_2x%2P 0
984
%endrep
985
SAD_X%1_END
986
%endmacro
987
988
INIT_MMX
989
SAD_X 3, 16, 16
990
SAD_X 3, 16, 8
991
SAD_X 3, 8, 16
992
SAD_X 3, 8, 8
993
SAD_X 3, 8, 4
994
SAD_X 3, 4, 8
995
SAD_X 3, 4, 4
996
SAD_X 4, 16, 16
997
SAD_X 4, 16, 8
998
SAD_X 4, 8, 16
999
SAD_X 4, 8, 8
1000
SAD_X 4, 8, 4
1001
SAD_X 4, 4, 8
1002
SAD_X 4, 4, 4
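
; For reference, pixel_sad_x3 scores one fenc block (fixed stride FENC_STRIDE)
; against three candidate reference blocks that share a single stride, which
; is why only one stride register is needed; pixel_sad_x4 does the same with
; four candidates. A rough C sketch of the x3 variant (illustrative only, not
; part of the build):
;
;   static void sad_x3_WxH( const uint8_t *fenc, const uint8_t *pix0,
;                           const uint8_t *pix1, const uint8_t *pix2,
;                           intptr_t stride, int scores[3], int w, int h )
;   {
;       const uint8_t *p[3] = { pix0, pix1, pix2 };
;       for( int i = 0; i < 3; i++ )
;       {
;           int sum = 0;
;           for( int y = 0; y < h; y++ )
;               for( int x = 0; x < w; x++ )
;                   sum += abs( fenc[y*FENC_STRIDE+x] - p[i][y*stride+x] );
;           scores[i] = sum;
;       }
;   }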
1003
1004
1005
1006
;=============================================================================
1007
; SAD x3/x4 XMM
1008
;=============================================================================
1009
1010
%macro SAD_X3_START_1x16P_SSE2 0
1011
mova m2, [r0]
1012
%if cpuflag(avx)
1013
psadbw m0, m2, [r1]
1014
psadbw m1, m2, [r2]
1015
psadbw m2, [r3]
1016
%else
1017
movu m0, [r1]
1018
movu m1, [r2]
1019
movu m3, [r3]
1020
psadbw m0, m2
1021
psadbw m1, m2
1022
psadbw m2, m3
1023
%endif
1024
%endmacro
1025
1026
%macro SAD_X3_1x16P_SSE2 2
1027
mova m3, [r0+%1]
1028
%if cpuflag(avx)
1029
psadbw m4, m3, [r1+%2]
1030
psadbw m5, m3, [r2+%2]
1031
psadbw m3, [r3+%2]
1032
%else
1033
movu m4, [r1+%2]
1034
movu m5, [r2+%2]
1035
movu m6, [r3+%2]
1036
psadbw m4, m3
1037
psadbw m5, m3
1038
psadbw m3, m6
1039
%endif
1040
paddw m0, m4
1041
paddw m1, m5
1042
paddw m2, m3
1043
%endmacro
1044
1045
%if ARCH_X86_64
1046
DECLARE_REG_TMP 6
1047
%else
1048
DECLARE_REG_TMP 5
1049
%endif
1050
1051
%macro SAD_X3_4x16P_SSE2 2
1052
%if %1==0
1053
lea t0, [r4*3]
1054
SAD_X3_START_1x16P_SSE2
1055
%else
1056
SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
1057
%endif
1058
SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
1059
SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
1060
SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
1061
%if %1 != %2-1
1062
%if (%1&1) != 0
1063
add r0, 8*FENC_STRIDE
1064
%endif
1065
lea r1, [r1+4*r4]
1066
lea r2, [r2+4*r4]
1067
lea r3, [r3+4*r4]
1068
%endif
1069
%endmacro
1070
1071
%macro SAD_X3_START_2x8P_SSE2 0
1072
movq m3, [r0]
1073
movq m0, [r1]
1074
movq m1, [r2]
1075
movq m2, [r3]
1076
movhps m3, [r0+FENC_STRIDE]
1077
movhps m0, [r1+r4]
1078
movhps m1, [r2+r4]
1079
movhps m2, [r3+r4]
1080
psadbw m0, m3
1081
psadbw m1, m3
1082
psadbw m2, m3
1083
%endmacro
1084
1085
%macro SAD_X3_2x8P_SSE2 4
1086
movq m6, [r0+%1]
1087
movq m3, [r1+%2]
1088
movq m4, [r2+%2]
1089
movq m5, [r3+%2]
1090
movhps m6, [r0+%3]
1091
movhps m3, [r1+%4]
1092
movhps m4, [r2+%4]
1093
movhps m5, [r3+%4]
1094
psadbw m3, m6
1095
psadbw m4, m6
1096
psadbw m5, m6
1097
paddw m0, m3
1098
paddw m1, m4
1099
paddw m2, m5
1100
%endmacro
1101
1102
%macro SAD_X4_START_2x8P_SSE2 0
1103
movq m4, [r0]
1104
movq m0, [r1]
1105
movq m1, [r2]
1106
movq m2, [r3]
1107
movq m3, [r4]
1108
movhps m4, [r0+FENC_STRIDE]
1109
movhps m0, [r1+r5]
1110
movhps m1, [r2+r5]
1111
movhps m2, [r3+r5]
1112
movhps m3, [r4+r5]
1113
psadbw m0, m4
1114
psadbw m1, m4
1115
psadbw m2, m4
1116
psadbw m3, m4
1117
%endmacro
1118
1119
%macro SAD_X4_2x8P_SSE2 4
1120
movq m6, [r0+%1]
1121
movq m4, [r1+%2]
1122
movq m5, [r2+%2]
1123
movhps m6, [r0+%3]
1124
movhps m4, [r1+%4]
1125
movhps m5, [r2+%4]
1126
psadbw m4, m6
1127
psadbw m5, m6
1128
paddw m0, m4
1129
paddw m1, m5
1130
movq m4, [r3+%2]
1131
movq m5, [r4+%2]
1132
movhps m4, [r3+%4]
1133
movhps m5, [r4+%4]
1134
psadbw m4, m6
1135
psadbw m5, m6
1136
paddw m2, m4
1137
paddw m3, m5
1138
%endmacro
1139
1140
%macro SAD_X4_START_1x16P_SSE2 0
1141
mova m3, [r0]
1142
%if cpuflag(avx)
1143
psadbw m0, m3, [r1]
1144
psadbw m1, m3, [r2]
1145
psadbw m2, m3, [r3]
1146
psadbw m3, [r4]
1147
%else
1148
movu m0, [r1]
1149
movu m1, [r2]
1150
movu m2, [r3]
1151
movu m4, [r4]
1152
psadbw m0, m3
1153
psadbw m1, m3
1154
psadbw m2, m3
1155
psadbw m3, m4
1156
%endif
1157
%endmacro
1158
1159
%macro SAD_X4_1x16P_SSE2 2
1160
mova m6, [r0+%1]
1161
%if cpuflag(avx)
1162
psadbw m4, m6, [r1+%2]
1163
psadbw m5, m6, [r2+%2]
1164
%else
1165
movu m4, [r1+%2]
1166
movu m5, [r2+%2]
1167
psadbw m4, m6
1168
psadbw m5, m6
1169
%endif
1170
paddw m0, m4
1171
paddw m1, m5
1172
%if cpuflag(avx)
1173
psadbw m4, m6, [r3+%2]
1174
psadbw m5, m6, [r4+%2]
1175
%else
1176
movu m4, [r3+%2]
1177
movu m5, [r4+%2]
1178
psadbw m4, m6
1179
psadbw m5, m6
1180
%endif
1181
paddw m2, m4
1182
paddw m3, m5
1183
%endmacro
1184
1185
%macro SAD_X4_4x16P_SSE2 2
1186
%if %1==0
1187
lea r6, [r5*3]
1188
SAD_X4_START_1x16P_SSE2
1189
%else
1190
SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
1191
%endif
1192
SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
1193
SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
1194
SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
1195
%if %1 != %2-1
1196
%if (%1&1) != 0
1197
add r0, 8*FENC_STRIDE
1198
%endif
1199
lea r1, [r1+4*r5]
1200
lea r2, [r2+4*r5]
1201
lea r3, [r3+4*r5]
1202
lea r4, [r4+4*r5]
1203
%endif
1204
%endmacro
1205
1206
%macro SAD_X3_4x8P_SSE2 2
1207
%if %1==0
1208
lea t0, [r4*3]
1209
SAD_X3_START_2x8P_SSE2
1210
%else
1211
SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
1212
%endif
1213
SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
1214
%if %1 != %2-1
1215
%if (%1&1) != 0
1216
add r0, 8*FENC_STRIDE
1217
%endif
1218
lea r1, [r1+4*r4]
1219
lea r2, [r2+4*r4]
1220
lea r3, [r3+4*r4]
1221
%endif
1222
%endmacro
1223
1224
%macro SAD_X4_4x8P_SSE2 2
1225
%if %1==0
1226
lea r6, [r5*3]
1227
SAD_X4_START_2x8P_SSE2
1228
%else
1229
SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
1230
%endif
1231
SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
1232
%if %1 != %2-1
1233
%if (%1&1) != 0
1234
add r0, 8*FENC_STRIDE
1235
%endif
1236
lea r1, [r1+4*r5]
1237
lea r2, [r2+4*r5]
1238
lea r3, [r3+4*r5]
1239
lea r4, [r4+4*r5]
1240
%endif
1241
%endmacro
1242
1243
%macro SAD_X3_END_SSE2 0
1244
movifnidn r5, r5mp
1245
%if cpuflag(ssse3)
1246
packssdw m0, m1
1247
packssdw m2, m2
1248
phaddd m0, m2
1249
mova [r5], m0
1250
%else
1251
movhlps m3, m0
1252
movhlps m4, m1
1253
movhlps m5, m2
1254
paddw m0, m3
1255
paddw m1, m4
1256
paddw m2, m5
1257
movd [r5+0], m0
1258
movd [r5+4], m1
1259
movd [r5+8], m2
1260
%endif
1261
RET
1262
%endmacro
1263
1264
%macro SAD_X4_END_SSE2 0
1265
mov r0, r6mp
1266
%if cpuflag(ssse3)
1267
packssdw m0, m1
1268
packssdw m2, m3
1269
phaddd m0, m2
1270
mova [r0], m0
1271
%else
1272
psllq m1, 32
1273
psllq m3, 32
1274
paddw m0, m1
1275
paddw m2, m3
1276
movhlps m1, m0
1277
movhlps m3, m2
1278
paddw m0, m1
1279
paddw m2, m3
1280
movq [r0+0], m0
1281
movq [r0+8], m2
1282
%endif
1283
RET
1284
%endmacro
1285
1286
%macro SAD_X4_START_2x8P_SSSE3 0
1287
movddup m4, [r0]
1288
movq m0, [r1]
1289
movq m1, [r3]
1290
movhps m0, [r2]
1291
movhps m1, [r4]
1292
movddup m5, [r0+FENC_STRIDE]
1293
movq m2, [r1+r5]
1294
movq m3, [r3+r5]
1295
movhps m2, [r2+r5]
1296
movhps m3, [r4+r5]
1297
psadbw m0, m4
1298
psadbw m1, m4
1299
psadbw m2, m5
1300
psadbw m3, m5
1301
paddw m0, m2
1302
paddw m1, m3
1303
%endmacro
1304
1305
%macro SAD_X4_2x8P_SSSE3 4
1306
movddup m6, [r0+%1]
1307
movq m2, [r1+%2]
1308
movq m3, [r3+%2]
1309
movhps m2, [r2+%2]
1310
movhps m3, [r4+%2]
1311
movddup m7, [r0+%3]
1312
movq m4, [r1+%4]
1313
movq m5, [r3+%4]
1314
movhps m4, [r2+%4]
1315
movhps m5, [r4+%4]
1316
psadbw m2, m6
1317
psadbw m3, m6
1318
psadbw m4, m7
1319
psadbw m5, m7
1320
paddw m0, m2
1321
paddw m1, m3
1322
paddw m0, m4
1323
paddw m1, m5
1324
%endmacro
1325
1326
%macro SAD_X4_4x8P_SSSE3 2
1327
%if %1==0
1328
lea r6, [r5*3]
1329
SAD_X4_START_2x8P_SSSE3
1330
%else
1331
SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
1332
%endif
1333
SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
1334
%if %1 != %2-1
1335
%if (%1&1) != 0
1336
add r0, 8*FENC_STRIDE
1337
%endif
1338
lea r1, [r1+4*r5]
1339
lea r2, [r2+4*r5]
1340
lea r3, [r3+4*r5]
1341
lea r4, [r4+4*r5]
1342
%endif
1343
%endmacro
1344
1345
%macro SAD_X4_END_SSSE3 0
1346
mov r0, r6mp
1347
packssdw m0, m1
1348
mova [r0], m0
1349
RET
1350
%endmacro
1351
1352
%macro SAD_X3_START_2x16P_AVX2 0
1353
movu m3, [r0] ; assumes FENC_STRIDE == 16
1354
movu xm0, [r1]
1355
movu xm1, [r2]
1356
movu xm2, [r3]
1357
vinserti128 m0, m0, [r1+r4], 1
1358
vinserti128 m1, m1, [r2+r4], 1
1359
vinserti128 m2, m2, [r3+r4], 1
1360
psadbw m0, m3
1361
psadbw m1, m3
1362
psadbw m2, m3
1363
%endmacro
1364
1365
%macro SAD_X3_2x16P_AVX2 3
1366
movu m3, [r0+%1] ; assumes FENC_STRIDE == 16
1367
movu xm4, [r1+%2]
1368
movu xm5, [r2+%2]
1369
movu xm6, [r3+%2]
1370
vinserti128 m4, m4, [r1+%3], 1
1371
vinserti128 m5, m5, [r2+%3], 1
1372
vinserti128 m6, m6, [r3+%3], 1
1373
psadbw m4, m3
1374
psadbw m5, m3
1375
psadbw m6, m3
1376
paddw m0, m4
1377
paddw m1, m5
1378
paddw m2, m6
1379
%endmacro
1380
1381
%macro SAD_X3_4x16P_AVX2 2
1382
%if %1==0
1383
lea t0, [r4*3]
1384
SAD_X3_START_2x16P_AVX2
1385
%else
1386
SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
1387
%endif
1388
SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
1389
%if %1 != %2-1
1390
%if (%1&1) != 0
1391
add r0, 8*FENC_STRIDE
1392
%endif
1393
lea r1, [r1+4*r4]
1394
lea r2, [r2+4*r4]
1395
lea r3, [r3+4*r4]
1396
%endif
1397
%endmacro
1398
1399
%macro SAD_X4_START_2x16P_AVX2 0
1400
vbroadcasti128 m4, [r0]
1401
vbroadcasti128 m5, [r0+FENC_STRIDE]
1402
movu xm0, [r1]
1403
movu xm1, [r2]
1404
movu xm2, [r1+r5]
1405
movu xm3, [r2+r5]
1406
vinserti128 m0, m0, [r3], 1
1407
vinserti128 m1, m1, [r4], 1
1408
vinserti128 m2, m2, [r3+r5], 1
1409
vinserti128 m3, m3, [r4+r5], 1
1410
psadbw m0, m4
1411
psadbw m1, m4
1412
psadbw m2, m5
1413
psadbw m3, m5
1414
paddw m0, m2
1415
paddw m1, m3
1416
%endmacro
1417
1418
%macro SAD_X4_2x16P_AVX2 4
1419
vbroadcasti128 m6, [r0+%1]
1420
vbroadcasti128 m7, [r0+%3]
1421
movu xm2, [r1+%2]
1422
movu xm3, [r2+%2]
1423
movu xm4, [r1+%4]
1424
movu xm5, [r2+%4]
1425
vinserti128 m2, m2, [r3+%2], 1
1426
vinserti128 m3, m3, [r4+%2], 1
1427
vinserti128 m4, m4, [r3+%4], 1
1428
vinserti128 m5, m5, [r4+%4], 1
1429
psadbw m2, m6
1430
psadbw m3, m6
1431
psadbw m4, m7
1432
psadbw m5, m7
1433
paddw m0, m2
1434
paddw m1, m3
1435
paddw m0, m4
1436
paddw m1, m5
1437
%endmacro
1438
1439
%macro SAD_X4_4x16P_AVX2 2
1440
%if %1==0
1441
lea r6, [r5*3]
1442
SAD_X4_START_2x16P_AVX2
1443
%else
1444
SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
1445
%endif
1446
SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
1447
%if %1 != %2-1
1448
%if (%1&1) != 0
1449
add r0, 8*FENC_STRIDE
1450
%endif
1451
lea r1, [r1+4*r5]
1452
lea r2, [r2+4*r5]
1453
lea r3, [r3+4*r5]
1454
lea r4, [r4+4*r5]
1455
%endif
1456
%endmacro
1457
1458
%macro SAD_X3_END_AVX2 0
1459
movifnidn r5, r5mp
1460
packssdw m0, m1 ; 0 0 1 1 0 0 1 1
1461
packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
1462
phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
1463
vextracti128 xm1, m0, 1
1464
paddd xm0, xm1 ; 0 1 2 _
1465
mova [r5], xm0
1466
RET
1467
%endmacro
1468
1469
%macro SAD_X4_END_AVX2 0
1470
mov r0, r6mp
1471
packssdw m0, m1 ; 0 0 1 1 2 2 3 3
1472
vextracti128 xm1, m0, 1
1473
phaddd xm0, xm1 ; 0 1 2 3
1474
mova [r0], xm0
1475
RET
1476
%endmacro
1477
1478
;-----------------------------------------------------------------------------
1479
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
1480
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
1481
;-----------------------------------------------------------------------------
1482
%macro SAD_X_SSE2 4
1483
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
1484
%assign x 0
1485
%rep %3/4
1486
SAD_X%1_4x%2P_SSE2 x, %3/4
1487
%assign x x+1
1488
%endrep
1489
SAD_X%1_END_SSE2
1490
%endmacro
1491
1492
INIT_XMM sse2
1493
SAD_X_SSE2 3, 16, 16, 7
1494
SAD_X_SSE2 3, 16, 8, 7
1495
SAD_X_SSE2 3, 8, 16, 7
1496
SAD_X_SSE2 3, 8, 8, 7
1497
SAD_X_SSE2 3, 8, 4, 7
1498
SAD_X_SSE2 4, 16, 16, 7
1499
SAD_X_SSE2 4, 16, 8, 7
1500
SAD_X_SSE2 4, 8, 16, 7
1501
SAD_X_SSE2 4, 8, 8, 7
1502
SAD_X_SSE2 4, 8, 4, 7
1503
1504
INIT_XMM sse3
1505
SAD_X_SSE2 3, 16, 16, 7
1506
SAD_X_SSE2 3, 16, 8, 7
1507
SAD_X_SSE2 4, 16, 16, 7
1508
SAD_X_SSE2 4, 16, 8, 7
1509
1510
%macro SAD_X_SSSE3 3
1511
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
1512
%assign x 0
1513
%rep %3/4
1514
SAD_X%1_4x%2P_SSSE3 x, %3/4
1515
%assign x x+1
1516
%endrep
1517
SAD_X%1_END_SSSE3
1518
%endmacro
1519
1520
INIT_XMM ssse3
1521
SAD_X_SSE2 3, 16, 16, 7
1522
SAD_X_SSE2 3, 16, 8, 7
1523
SAD_X_SSE2 4, 16, 16, 7
1524
SAD_X_SSE2 4, 16, 8, 7
1525
SAD_X_SSSE3 4, 8, 16
1526
SAD_X_SSSE3 4, 8, 8
1527
SAD_X_SSSE3 4, 8, 4
1528
1529
INIT_XMM avx
1530
SAD_X_SSE2 3, 16, 16, 6
1531
SAD_X_SSE2 3, 16, 8, 6
1532
SAD_X_SSE2 4, 16, 16, 7
1533
SAD_X_SSE2 4, 16, 8, 7
1534
1535
%macro SAD_X_AVX2 4
1536
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
1537
%assign x 0
1538
%rep %3/4
1539
SAD_X%1_4x%2P_AVX2 x, %3/4
1540
%assign x x+1
1541
%endrep
1542
SAD_X%1_END_AVX2
1543
%endmacro
1544
1545
INIT_YMM avx2
1546
SAD_X_AVX2 3, 16, 16, 7
1547
SAD_X_AVX2 3, 16, 8, 7
1548
SAD_X_AVX2 4, 16, 16, 8
1549
SAD_X_AVX2 4, 16, 8, 8
1550
;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel CPUs
; prior to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is OK, though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers, as on architectures that have only aligned
; loads, except complicated by the fact that PALIGNR takes only an immediate,
; not a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method often makes it slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs, amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs, amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
1574
1575
; computed jump assumes this loop is exactly 80 bytes
1576
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
1577
ALIGN 16
1578
sad_w16_align%1_sse2:
1579
movdqa xmm1, [r2+16]
1580
movdqa xmm2, [r2+r3+16]
1581
movdqa xmm3, [r2]
1582
movdqa xmm4, [r2+r3]
1583
pslldq xmm1, 16-%1
1584
pslldq xmm2, 16-%1
1585
psrldq xmm3, %1
1586
psrldq xmm4, %1
1587
por xmm1, xmm3
1588
por xmm2, xmm4
1589
psadbw xmm1, [r0]
1590
psadbw xmm2, [r0+r1]
1591
paddw xmm0, xmm1
1592
paddw xmm0, xmm2
1593
lea r0, [r0+2*r1]
1594
lea r2, [r2+2*r3]
1595
dec r4
1596
jg sad_w16_align%1_sse2
1597
ret
1598
%endmacro
1599
1600
; computed jump assumes this loop is exactly 64 bytes
1601
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
1602
ALIGN 16
1603
sad_w16_align%1_ssse3:
1604
movdqa xmm1, [r2+16]
1605
movdqa xmm2, [r2+r3+16]
1606
palignr xmm1, [r2], %1
1607
palignr xmm2, [r2+r3], %1
1608
psadbw xmm1, [r0]
1609
psadbw xmm2, [r0+r1]
1610
paddw xmm0, xmm1
1611
paddw xmm0, xmm2
1612
lea r0, [r0+2*r1]
1613
lea r2, [r2+2*r3]
1614
dec r4
1615
jg sad_w16_align%1_ssse3
1616
ret
1617
%endmacro
1618
1619
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
1620
cglobal pixel_sad_16x%2_cache64_%1
1621
mov eax, r2m
1622
and eax, 0x37
1623
cmp eax, 0x30
1624
jle pixel_sad_16x%2_sse2
1625
PROLOGUE 4,6
1626
mov r4d, r2d
1627
and r4d, 15
1628
%ifidn %1, ssse3
1629
shl r4d, 6 ; code size = 64
1630
%else
1631
lea r4, [r4*5]
1632
shl r4d, 4 ; code size = 80
1633
%endif
1634
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
1635
%ifdef PIC
1636
lea r5, [sad_w16_addr]
1637
add r5, r4
1638
%else
1639
lea r5, [sad_w16_addr + r4]
1640
%endif
1641
and r2, ~15
1642
mov r4d, %2/2
1643
pxor xmm0, xmm0
1644
call r5
1645
MOVHL xmm1, xmm0
1646
paddw xmm0, xmm1
1647
movd eax, xmm0
1648
RET
1649
%endmacro
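
; Note on the computed jump above: on this path r4d holds the reference
; pointer's misalignment (1..15; the well-aligned cases have already branched
; to the plain SSE2 version). Each sad_w16_alignN loop is emitted with a fixed
; code size (80 bytes for SSE2, 64 for SSSE3), so the desired entry point is
; sad_w16_align1 + (N-1)*size; sad_w16_addr precomputes the hypothetical
; "align0" address so that simply adding r4 (already scaled by the code size)
; lands on the right variant.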
1650
1651
%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
1652
mov eax, r2m
1653
and eax, 0x17|%1|(%4>>1)
1654
cmp eax, 0x10|%1|(%4>>1)
1655
jle pixel_sad_%1x%2_mmx2
1656
and eax, 7
1657
shl eax, 3
1658
movd mm6, [sw_64]
1659
movd mm7, eax
1660
psubw mm6, mm7
1661
PROLOGUE 4,5
1662
and r2, ~7
1663
mov r4d, %3
1664
pxor mm0, mm0
1665
%endmacro
1666
1667
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
1668
cglobal pixel_sad_16x%1_cache%2_mmx2
1669
SAD_CACHELINE_START_MMX2 16, %1, %1, %2
1670
.loop:
1671
movq mm1, [r2]
1672
movq mm2, [r2+8]
1673
movq mm3, [r2+16]
1674
movq mm4, mm2
1675
psrlq mm1, mm7
1676
psllq mm2, mm6
1677
psllq mm3, mm6
1678
psrlq mm4, mm7
1679
por mm1, mm2
1680
por mm3, mm4
1681
psadbw mm1, [r0]
1682
psadbw mm3, [r0+8]
1683
paddw mm0, mm1
1684
paddw mm0, mm3
1685
add r2, r3
1686
add r0, r1
1687
dec r4
1688
jg .loop
1689
movd eax, mm0
1690
RET
1691
%endmacro
1692
1693
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
1694
cglobal pixel_sad_8x%1_cache%2_mmx2
1695
SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
1696
.loop:
1697
movq mm1, [r2+8]
1698
movq mm2, [r2+r3+8]
1699
movq mm3, [r2]
1700
movq mm4, [r2+r3]
1701
psllq mm1, mm6
1702
psllq mm2, mm6
1703
psrlq mm3, mm7
1704
psrlq mm4, mm7
1705
por mm1, mm3
1706
por mm2, mm4
1707
psadbw mm1, [r0]
1708
psadbw mm2, [r0+r1]
1709
paddw mm0, mm1
1710
paddw mm0, mm2
1711
lea r2, [r2+2*r3]
1712
lea r0, [r0+2*r1]
1713
dec r4
1714
jg .loop
1715
movd eax, mm0
1716
RET
1717
%endmacro
1718
1719
; sad_x3/x4_cache64: check each mv.
1720
; if they're all within a cacheline, use normal sad_x3/x4.
1721
; otherwise, send them individually to sad_cache64.
1722
%macro CHECK_SPLIT 3 ; pix, width, cacheline
1723
mov eax, %1
1724
and eax, 0x17|%2|(%3>>1)
1725
cmp eax, 0x10|%2|(%3>>1)
1726
jg .split
1727
%endmacro
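
; The AND/CMP pair above is a branch-free encoding of (roughly) the following
; test on a candidate reference pointer: take its offset within a cacheline
; and stay on the fast path unless a width-byte load starting there would
; cross into the next cacheline, with 8-byte-aligned offsets exempted (such
; splits are cheap enough). An equivalent C sketch, illustrative only:
;
;   static int needs_split_path( uintptr_t ref, int width, int cacheline )
;   {
;       int off = ref & (cacheline - 1);
;       if( !(off & 7) )
;           return 0;                      /* 8-byte aligned: stay on fast path */
;       return off + width > cacheline;    /* load would straddle two lines */
;   }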
1728
1729
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
1730
cglobal pixel_sad_x3_%1x%2_cache%3_%6
1731
CHECK_SPLIT r1m, %1, %3
1732
CHECK_SPLIT r2m, %1, %3
1733
CHECK_SPLIT r3m, %1, %3
1734
jmp pixel_sad_x3_%1x%2_%4
1735
.split:
1736
%if ARCH_X86_64
1737
PROLOGUE 6,9
1738
push r3
1739
push r2
1740
%if WIN64
1741
movsxd r4, r4d
1742
sub rsp, 40 ; shadow space and alignment
1743
%endif
1744
mov r2, r1
1745
mov r1, FENC_STRIDE
1746
mov r3, r4
1747
mov r7, r0
1748
mov r8, r5
1749
call pixel_sad_%1x%2_cache%3_%5
1750
mov [r8], eax
1751
%if WIN64
1752
mov r2, [rsp+40+0*8]
1753
%else
1754
pop r2
1755
%endif
1756
mov r0, r7
1757
call pixel_sad_%1x%2_cache%3_%5
1758
mov [r8+4], eax
1759
%if WIN64
1760
mov r2, [rsp+40+1*8]
1761
%else
1762
pop r2
1763
%endif
1764
mov r0, r7
1765
call pixel_sad_%1x%2_cache%3_%5
1766
mov [r8+8], eax
1767
%if WIN64
1768
add rsp, 40+2*8
1769
%endif
1770
RET
1771
%else
1772
push edi
1773
mov edi, [esp+28]
1774
push dword [esp+24]
1775
push dword [esp+16]
1776
push dword 16
1777
push dword [esp+20]
1778
call pixel_sad_%1x%2_cache%3_%5
1779
mov ecx, [esp+32]
1780
mov [edi], eax
1781
mov [esp+8], ecx
1782
call pixel_sad_%1x%2_cache%3_%5
1783
mov ecx, [esp+36]
1784
mov [edi+4], eax
1785
mov [esp+8], ecx
1786
call pixel_sad_%1x%2_cache%3_%5
1787
mov [edi+8], eax
1788
add esp, 16
1789
pop edi
1790
ret
1791
%endif
1792
%endmacro
1793
1794
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
1795
cglobal pixel_sad_x4_%1x%2_cache%3_%6
1796
CHECK_SPLIT r1m, %1, %3
1797
CHECK_SPLIT r2m, %1, %3
1798
CHECK_SPLIT r3m, %1, %3
1799
CHECK_SPLIT r4m, %1, %3
1800
jmp pixel_sad_x4_%1x%2_%4
1801
.split:
1802
%if ARCH_X86_64
1803
PROLOGUE 6,9
1804
mov r8, r6mp
1805
push r4
1806
push r3
1807
push r2
1808
%if WIN64
1809
sub rsp, 32 ; shadow space
1810
%endif
1811
mov r2, r1
1812
mov r1, FENC_STRIDE
1813
mov r3, r5
1814
mov r7, r0
1815
call pixel_sad_%1x%2_cache%3_%5
1816
mov [r8], eax
1817
%if WIN64
1818
mov r2, [rsp+32+0*8]
1819
%else
1820
pop r2
1821
%endif
1822
mov r0, r7
1823
call pixel_sad_%1x%2_cache%3_%5
1824
mov [r8+4], eax
1825
%if WIN64
1826
mov r2, [rsp+32+1*8]
1827
%else
1828
pop r2
1829
%endif
1830
mov r0, r7
1831
call pixel_sad_%1x%2_cache%3_%5
1832
mov [r8+8], eax
1833
%if WIN64
1834
mov r2, [rsp+32+2*8]
1835
%else
1836
pop r2
1837
%endif
1838
mov r0, r7
1839
call pixel_sad_%1x%2_cache%3_%5
1840
mov [r8+12], eax
1841
%if WIN64
1842
add rsp, 32+3*8
1843
%endif
1844
RET
1845
%else
1846
push edi
1847
mov edi, [esp+32]
1848
push dword [esp+28]
1849
push dword [esp+16]
1850
push dword 16
1851
push dword [esp+20]
1852
call pixel_sad_%1x%2_cache%3_%5
1853
mov ecx, [esp+32]
1854
mov [edi], eax
1855
mov [esp+8], ecx
1856
call pixel_sad_%1x%2_cache%3_%5
1857
mov ecx, [esp+36]
1858
mov [edi+4], eax
1859
mov [esp+8], ecx
1860
call pixel_sad_%1x%2_cache%3_%5
1861
mov ecx, [esp+40]
1862
mov [edi+8], eax
1863
mov [esp+8], ecx
1864
call pixel_sad_%1x%2_cache%3_%5
1865
mov [edi+12], eax
1866
add esp, 16
1867
pop edi
1868
ret
1869
%endif
1870
%endmacro
1871
1872
%macro SADX34_CACHELINE_FUNC 1+
1873
SADX3_CACHELINE_FUNC %1
1874
SADX4_CACHELINE_FUNC %1
1875
%endmacro
1876
1877
1878
; instantiate the aligned sads
1879
1880
INIT_MMX
1881
%if ARCH_X86_64 == 0
1882
SAD16_CACHELINE_FUNC_MMX2 8, 32
1883
SAD16_CACHELINE_FUNC_MMX2 16, 32
1884
SAD8_CACHELINE_FUNC_MMX2 4, 32
1885
SAD8_CACHELINE_FUNC_MMX2 8, 32
1886
SAD8_CACHELINE_FUNC_MMX2 16, 32
1887
SAD16_CACHELINE_FUNC_MMX2 8, 64
1888
SAD16_CACHELINE_FUNC_MMX2 16, 64
1889
%endif ; !ARCH_X86_64
1890
SAD8_CACHELINE_FUNC_MMX2 4, 64
1891
SAD8_CACHELINE_FUNC_MMX2 8, 64
1892
SAD8_CACHELINE_FUNC_MMX2 16, 64
1893
1894
%if ARCH_X86_64 == 0
1895
SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
1896
SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2
1897
SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2
1898
SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2
1899
SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
1900
SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2
1901
%endif ; !ARCH_X86_64
1902
SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2
1903
SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2
1904
1905
%if ARCH_X86_64 == 0
1906
SAD16_CACHELINE_FUNC sse2, 8
1907
SAD16_CACHELINE_FUNC sse2, 16
1908
%assign i 1
1909
%rep 15
1910
SAD16_CACHELINE_LOOP_SSE2 i
1911
%assign i i+1
1912
%endrep
1913
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
1914
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
1915
%endif ; !ARCH_X86_64
1916
SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2
1917
1918
SAD16_CACHELINE_FUNC ssse3, 8
1919
SAD16_CACHELINE_FUNC ssse3, 16
1920
%assign i 1
1921
%rep 15
1922
SAD16_CACHELINE_LOOP_SSSE3 i
1923
%assign i i+1
1924
%endrep
1925
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
1926
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3
1927
1928
1929