;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Alex Izvorski <aizvorksi@gmail.com>
;* Fiona Glaser <fiona@x264.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32
35
hmul_16p: times 16 db 1
36
times 8 db 1, -1
37
hmul_8p: times 8 db 1
38
times 4 db 1, -1
39
times 8 db 1
40
times 4 db 1, -1
41
mask_ff: times 16 db 0xff
42
times 16 db 0
43
mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
44
mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
45
mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
46
%if BIT_DEPTH == 10
47
ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
48
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
49
pf_64: times 4 dd 64.0
50
pf_128: times 4 dd 128.0
51
%elif BIT_DEPTH == 9
52
ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
53
ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
54
%else ; 8-bit
55
ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
56
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
57
%endif
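; The ssim_c1/ssim_c2 values follow the usual SSIM stabilizing constants
; C1 = (K1*L)^2 and C2 = (K2*L)^2 with K1 = 0.01, K2 = 0.03 and L = 2^BIT_DEPTH-1,
; pre-scaled by the *64 and *64*63 factors shown in the comments to match the
; fixed-point accumulation of the ssim kernels. For example, at 10-bit:
; (0.01*1023)^2 * 64 = 104.6529 * 64 = 6697.7856.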
58
hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
59
mask_10: times 4 dw 0, -1
60
mask_1100: times 2 dd 0, -1
61
pb_pppm: times 4 db 1,1,1,-1
62
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
63
intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
64
65
intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
66
intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
67
intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
68
intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
69
intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
70
intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
71
intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
72
intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
73
intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
74
intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
75
pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
76
pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
77
intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15
78
79
intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
80
intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
81
intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
82
intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
83
intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
84
intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
85
intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
86
intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
87
intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
88
intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
89
intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
90
intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
91
92
ALIGN 32
93
intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
94
intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
95
intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
96
intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
97
intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
98
intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
99
intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
100
intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
101
intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
102
intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
103
intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
104
intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
105
intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
106
intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
107
intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
108
intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
109
intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
110
intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
111
intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
112
intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
113
intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
114
intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
115
intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
116
intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
117
intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
118
intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
119
intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
120
intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
121
pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
122
pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001
123
124
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
125
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
126
127
sw_f0: dq 0xfff0, 0
128
pd_f0: times 4 dd 0xffff0000
129
130
pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
131
132
ads_mvs_shuffle:
133
%macro ADS_MVS_SHUFFLE 8
134
%assign y x
135
%rep 8
136
%rep 7
137
%rotate (~y)&1
138
%assign y y>>((~y)&1)
139
%endrep
140
db %1*2, %1*2+1
141
%rotate 1
142
%assign y y>>1
143
%endrep
144
%endmacro
145
%assign x 0
146
%rep 256
147
ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
148
%assign x x+1
149
%endrep
150
151
SECTION .text
152
153
cextern pb_0
154
cextern pb_1
155
cextern pw_1
156
cextern pw_8
157
cextern pw_16
158
cextern pw_32
159
cextern pw_00ff
160
cextern pw_ppppmmmm
161
cextern pw_ppmmppmm
162
cextern pw_pmpmpmpm
163
cextern pw_pmmpzzzz
164
cextern pd_1
165
cextern hsub_mul
166
cextern popcnt_table
167
168
;=============================================================================
169
; SSD
170
;=============================================================================
171
172
%if HIGH_BIT_DEPTH
173
;-----------------------------------------------------------------------------
174
; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
175
;-----------------------------------------------------------------------------
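;
; As a rough scalar model of what the SSD kernels below compute (an illustrative
; sketch, not the reference code from pixel.c; the name and the pixel-unit
; stride convention are assumptions; needs <stdint.h>):
;
;   static int ssd_wxh( const uint16_t *pix1, intptr_t i_pix1,
;                       const uint16_t *pix2, intptr_t i_pix2, int w, int h )
;   {
;       int ssd = 0;
;       for( int y = 0; y < h; y++, pix1 += i_pix1, pix2 += i_pix2 )
;           for( int x = 0; x < w; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               ssd += d * d;
;           }
;       return ssd;
;   }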
176
%macro SSD_ONE 2
177
cglobal pixel_ssd_%1x%2, 4,7,6
178
FIX_STRIDES r1, r3
179
%if mmsize == %1*2
180
%define offset0_1 r1
181
%define offset0_2 r1*2
182
%define offset0_3 r5
183
%define offset1_1 r3
184
%define offset1_2 r3*2
185
%define offset1_3 r6
186
lea r5, [3*r1]
187
lea r6, [3*r3]
188
%elif mmsize == %1
189
%define offset0_1 mmsize
190
%define offset0_2 r1
191
%define offset0_3 r1+mmsize
192
%define offset1_1 mmsize
193
%define offset1_2 r3
194
%define offset1_3 r3+mmsize
195
%elif mmsize == %1/2
196
%define offset0_1 mmsize
197
%define offset0_2 mmsize*2
198
%define offset0_3 mmsize*3
199
%define offset1_1 mmsize
200
%define offset1_2 mmsize*2
201
%define offset1_3 mmsize*3
202
%endif
203
%assign %%n %2/(2*mmsize/%1)
204
%if %%n > 1
205
mov r4d, %%n
206
%endif
207
pxor m0, m0
208
.loop:
209
mova m1, [r0]
210
mova m2, [r0+offset0_1]
211
mova m3, [r0+offset0_2]
212
mova m4, [r0+offset0_3]
213
psubw m1, [r2]
214
psubw m2, [r2+offset1_1]
215
psubw m3, [r2+offset1_2]
216
psubw m4, [r2+offset1_3]
217
%if %%n > 1
218
lea r0, [r0+r1*(%2/%%n)]
219
lea r2, [r2+r3*(%2/%%n)]
220
%endif
221
pmaddwd m1, m1
222
pmaddwd m2, m2
223
pmaddwd m3, m3
224
pmaddwd m4, m4
225
paddd m1, m2
226
paddd m3, m4
227
paddd m0, m1
228
paddd m0, m3
229
%if %%n > 1
230
dec r4d
231
jg .loop
232
%endif
233
HADDD m0, m5
234
movd eax, xm0
235
RET
236
%endmacro
237
238
INIT_MMX mmx2
239
SSD_ONE 4, 4
240
SSD_ONE 4, 8
241
SSD_ONE 4, 16
242
SSD_ONE 8, 4
243
SSD_ONE 8, 8
244
SSD_ONE 8, 16
245
SSD_ONE 16, 8
246
SSD_ONE 16, 16
247
INIT_XMM sse2
248
SSD_ONE 8, 4
249
SSD_ONE 8, 8
250
SSD_ONE 8, 16
251
SSD_ONE 16, 8
252
SSD_ONE 16, 16
253
INIT_YMM avx2
254
SSD_ONE 16, 8
255
SSD_ONE 16, 16
256
%endif ; HIGH_BIT_DEPTH
257
258
%if HIGH_BIT_DEPTH == 0
259
%macro SSD_LOAD_FULL 5
260
mova m1, [t0+%1]
261
mova m2, [t2+%2]
262
mova m3, [t0+%3]
263
mova m4, [t2+%4]
264
%if %5==1
265
add t0, t1
266
add t2, t3
267
%elif %5==2
268
lea t0, [t0+2*t1]
269
lea t2, [t2+2*t3]
270
%endif
271
%endmacro
272
273
%macro LOAD 5
274
movh m%1, %3
275
movh m%2, %4
276
%if %5
277
lea t0, [t0+2*t1]
278
%endif
279
%endmacro
280
281
%macro JOIN 7
282
movh m%3, %5
283
movh m%4, %6
284
%if %7
285
lea t2, [t2+2*t3]
286
%endif
287
punpcklbw m%1, m7
288
punpcklbw m%3, m7
289
psubw m%1, m%3
290
punpcklbw m%2, m7
291
punpcklbw m%4, m7
292
psubw m%2, m%4
293
%endmacro
294
295
%macro JOIN_SSE2 7
296
movh m%3, %5
297
movh m%4, %6
298
%if %7
299
lea t2, [t2+2*t3]
300
%endif
301
punpcklqdq m%1, m%2
302
punpcklqdq m%3, m%4
303
DEINTB %2, %1, %4, %3, 7
304
psubw m%2, m%4
305
psubw m%1, m%3
306
%endmacro
307
308
%macro JOIN_SSSE3 7
309
movh m%3, %5
310
movh m%4, %6
311
%if %7
312
lea t2, [t2+2*t3]
313
%endif
314
punpcklbw m%1, m%3
315
punpcklbw m%2, m%4
316
%endmacro
317
318
%macro LOAD_AVX2 5
319
mova xm%1, %3
320
vinserti128 m%1, m%1, %4, 1
321
%if %5
322
lea t0, [t0+2*t1]
323
%endif
324
%endmacro
325
326
%macro JOIN_AVX2 7
327
mova xm%2, %5
328
vinserti128 m%2, m%2, %6, 1
329
%if %7
330
lea t2, [t2+2*t3]
331
%endif
332
SBUTTERFLY bw, %1, %2, %3
333
%endmacro
334
335
%macro SSD_LOAD_HALF 5
336
LOAD 1, 2, [t0+%1], [t0+%3], 1
337
JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
338
LOAD 3, 4, [t0+%1], [t0+%3], %5
339
JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
340
%endmacro
341
342
%macro SSD_CORE 7-8
343
%ifidn %8, FULL
344
mova m%6, m%2
345
mova m%7, m%4
346
psubusb m%2, m%1
347
psubusb m%4, m%3
348
psubusb m%1, m%6
349
psubusb m%3, m%7
350
por m%1, m%2
351
por m%3, m%4
352
punpcklbw m%2, m%1, m%5
353
punpckhbw m%1, m%5
354
punpcklbw m%4, m%3, m%5
355
punpckhbw m%3, m%5
356
%endif
357
pmaddwd m%1, m%1
358
pmaddwd m%2, m%2
359
pmaddwd m%3, m%3
360
pmaddwd m%4, m%4
361
%endmacro
362
363
%macro SSD_CORE_SSE2 7-8
364
%ifidn %8, FULL
365
DEINTB %6, %1, %7, %2, %5
366
psubw m%6, m%7
367
psubw m%1, m%2
368
SWAP %6, %2, %1
369
DEINTB %6, %3, %7, %4, %5
370
psubw m%6, m%7
371
psubw m%3, m%4
372
SWAP %6, %4, %3
373
%endif
374
pmaddwd m%1, m%1
375
pmaddwd m%2, m%2
376
pmaddwd m%3, m%3
377
pmaddwd m%4, m%4
378
%endmacro
379
380
%macro SSD_CORE_SSSE3 7-8
381
%ifidn %8, FULL
382
punpckhbw m%6, m%1, m%2
383
punpckhbw m%7, m%3, m%4
384
punpcklbw m%1, m%2
385
punpcklbw m%3, m%4
386
SWAP %6, %2, %3
387
SWAP %7, %4
388
%endif
389
pmaddubsw m%1, m%5
390
pmaddubsw m%2, m%5
391
pmaddubsw m%3, m%5
392
pmaddubsw m%4, m%5
393
pmaddwd m%1, m%1
394
pmaddwd m%2, m%2
395
pmaddwd m%3, m%3
396
pmaddwd m%4, m%4
397
%endmacro
398
399
%macro SSD_ITER 6
400
SSD_LOAD_%1 %2,%3,%4,%5,%6
401
SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
402
paddd m1, m2
403
paddd m3, m4
404
paddd m0, m1
405
paddd m0, m3
406
%endmacro
407
408
;-----------------------------------------------------------------------------
409
; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
410
;-----------------------------------------------------------------------------
411
%macro SSD 2
412
%if %1 != %2
413
%assign function_align 8
414
%else
415
%assign function_align 16
416
%endif
417
cglobal pixel_ssd_%1x%2, 0,0,0
418
mov al, %1*%2/mmsize/2
419
420
%if %1 != %2
421
jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
422
%else
423
424
.startloop:
425
%if ARCH_X86_64
426
DECLARE_REG_TMP 0,1,2,3
427
PROLOGUE 0,0,8
428
%else
429
PROLOGUE 0,5
430
DECLARE_REG_TMP 1,2,3,4
431
mov t0, r0m
432
mov t1, r1m
433
mov t2, r2m
434
mov t3, r3m
435
%endif
436
437
%if cpuflag(ssse3)
438
mova m7, [hsub_mul]
439
%elifidn cpuname, sse2
440
mova m7, [pw_00ff]
441
%elif %1 >= mmsize
442
pxor m7, m7
443
%endif
444
pxor m0, m0
445
446
ALIGN 16
447
.loop:
448
%if %1 > mmsize
449
SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
450
%elif %1 == mmsize
451
SSD_ITER FULL, 0, 0, t1, t3, 2
452
%else
453
SSD_ITER HALF, 0, 0, t1, t3, 2
454
%endif
455
dec al
456
jg .loop
457
%if mmsize==32
458
vextracti128 xm1, m0, 1
459
paddd xm0, xm1
460
HADDD xm0, xm1
461
movd eax, xm0
462
%else
463
HADDD m0, m1
464
movd eax, m0
465
%endif
466
RET
467
%endif
468
%endmacro
469
470
INIT_MMX mmx
471
SSD 16, 16
472
SSD 16, 8
473
SSD 8, 8
474
SSD 8, 16
475
SSD 4, 4
476
SSD 8, 4
477
SSD 4, 8
478
SSD 4, 16
479
INIT_XMM sse2slow
480
SSD 16, 16
481
SSD 8, 8
482
SSD 16, 8
483
SSD 8, 16
484
SSD 8, 4
485
INIT_XMM sse2
486
%define SSD_CORE SSD_CORE_SSE2
487
%define JOIN JOIN_SSE2
488
SSD 16, 16
489
SSD 8, 8
490
SSD 16, 8
491
SSD 8, 16
492
SSD 8, 4
493
INIT_XMM ssse3
494
%define SSD_CORE SSD_CORE_SSSE3
495
%define JOIN JOIN_SSSE3
496
SSD 16, 16
497
SSD 8, 8
498
SSD 16, 8
499
SSD 8, 16
500
SSD 8, 4
501
INIT_XMM avx
502
SSD 16, 16
503
SSD 8, 8
504
SSD 16, 8
505
SSD 8, 16
506
SSD 8, 4
507
INIT_MMX ssse3
508
SSD 4, 4
509
SSD 4, 8
510
SSD 4, 16
511
INIT_XMM xop
512
SSD 16, 16
513
SSD 8, 8
514
SSD 16, 8
515
SSD 8, 16
516
SSD 8, 4
517
%define LOAD LOAD_AVX2
518
%define JOIN JOIN_AVX2
519
INIT_YMM avx2
520
SSD 16, 16
521
SSD 16, 8
522
%assign function_align 16
523
%endif ; !HIGH_BIT_DEPTH
524
525
;-----------------------------------------------------------------------------
526
; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
527
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
528
;
529
; The maximum width this function can handle without risk of overflow is given
530
; in the following equation: (mmsize in bits)
531
;
532
; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
533
;
534
; For 10-bit pixels this means overflow becomes possible at widths >= 16416
; with MMX and >= 32832 with XMM. At sane distortion levels it will take much
; more than that though.
;-----------------------------------------------------------------------------
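; Plugging in the 10-bit numbers: MMX (mmsize = 64 bits) gives
; 2 * 64/32 * (2^32 - 1) / 1023^2 ~= 16416 and XMM (128 bits) gives ~= 32832,
; which is where the limits quoted above come from.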
537
%if HIGH_BIT_DEPTH
538
%macro SSD_NV12 0
539
cglobal pixel_ssd_nv12_core, 6,7,7
540
shl r4d, 2
541
FIX_STRIDES r1, r3
542
add r0, r4
543
add r2, r4
544
xor r6, r6
545
pxor m4, m4
546
pxor m5, m5
547
pxor m6, m6
548
.loopy:
549
mov r6, r4
550
neg r6
551
pxor m2, m2
552
pxor m3, m3
553
.loopx:
554
mova m0, [r0+r6]
555
mova m1, [r0+r6+mmsize]
556
psubw m0, [r2+r6]
557
psubw m1, [r2+r6+mmsize]
558
PSHUFLW m0, m0, q3120
559
PSHUFLW m1, m1, q3120
560
%if mmsize >= 16
561
pshufhw m0, m0, q3120
562
pshufhw m1, m1, q3120
563
%endif
564
%if cpuflag(xop)
565
pmadcswd m2, m0, m0, m2
566
pmadcswd m3, m1, m1, m3
567
%else
568
pmaddwd m0, m0
569
pmaddwd m1, m1
570
paddd m2, m0
571
paddd m3, m1
572
%endif
573
add r6, 2*mmsize
574
jl .loopx
575
%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
576
jz .no_overread
577
psubd m3, m1
578
.no_overread:
579
%endif
580
%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
581
; equation above, putting the width limit at 8208
582
punpckhdq m0, m2, m6
583
punpckhdq m1, m3, m6
584
punpckldq m2, m6
585
punpckldq m3, m6
586
paddq m3, m2
587
paddq m1, m0
588
paddq m4, m3
589
paddq m4, m1
590
%else ; unfortunately paddq is sse2
591
; emulate 48 bit precision for mmx2 instead
592
mova m0, m2
593
mova m1, m3
594
punpcklwd m2, m6
595
punpcklwd m3, m6
596
punpckhwd m0, m6
597
punpckhwd m1, m6
598
paddd m3, m2
599
paddd m1, m0
600
paddd m4, m3
601
paddd m5, m1
602
%endif
603
add r0, r1
604
add r2, r3
605
dec r5d
606
jg .loopy
607
mov r3, r6m
608
mov r4, r7m
609
%if mmsize == 32
610
vextracti128 xm0, m4, 1
611
paddq xm4, xm0
612
%endif
613
%if mmsize >= 16
614
movq [r3], xm4
615
movhps [r4], xm4
616
%else ; fixup for mmx2
617
SBUTTERFLY dq, 4, 5, 0
618
mova m0, m4
619
psrld m4, 16
620
paddd m5, m4
621
pslld m0, 16
622
SBUTTERFLY dq, 0, 5, 4
623
psrlq m0, 16
624
psrlq m5, 16
625
movq [r3], m0
626
movq [r4], m5
627
%endif
628
RET
629
%endmacro ; SSD_NV12
630
%endif ; HIGH_BIT_DEPTH
631
632
%if HIGH_BIT_DEPTH == 0
633
;-----------------------------------------------------------------------------
634
; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
635
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
636
;
637
; This implementation can potentially overflow on image widths >= 11008 (or
638
; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
639
; 20). At sane distortion levels it will take much more than that though.
640
;-----------------------------------------------------------------------------
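;
; Both nv12 variants compute separate U and V SSDs over an interleaved UV
; plane. A minimal scalar sketch (illustrative only; the name and exact
; argument conventions are simplified, this is not the pixel.c reference):
;
;   static void ssd_nv12( const uint8_t *pix1, intptr_t i_pix1,
;                         const uint8_t *pix2, intptr_t i_pix2,
;                         int width, int height,
;                         uint64_t *ssd_u, uint64_t *ssd_v )
;   {
;       uint64_t su = 0, sv = 0;
;       for( int y = 0; y < height; y++, pix1 += i_pix1, pix2 += i_pix2 )
;           for( int x = 0; x < width; x++ )
;           {
;               int du = pix1[2*x]   - pix2[2*x];   /* U samples (even bytes) */
;               int dv = pix1[2*x+1] - pix2[2*x+1]; /* V samples (odd bytes)  */
;               su += du * du;
;               sv += dv * dv;
;           }
;       *ssd_u = su;
;       *ssd_v = sv;
;   }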
641
%macro SSD_NV12 0
642
cglobal pixel_ssd_nv12_core, 6,7
643
add r4d, r4d
644
add r0, r4
645
add r2, r4
646
pxor m3, m3
647
pxor m4, m4
648
mova m5, [pw_00ff]
649
.loopy:
650
mov r6, r4
651
neg r6
652
.loopx:
653
%if mmsize == 32 ; only 16-byte alignment is guaranteed
654
movu m2, [r0+r6]
655
movu m1, [r2+r6]
656
%else
657
mova m2, [r0+r6]
658
mova m1, [r2+r6]
659
%endif
660
psubusb m0, m2, m1
661
psubusb m1, m2
662
por m0, m1
663
psrlw m2, m0, 8
664
pand m0, m5
665
%if cpuflag(xop)
666
pmadcswd m4, m2, m2, m4
667
pmadcswd m3, m0, m0, m3
668
%else
669
pmaddwd m2, m2
670
pmaddwd m0, m0
671
paddd m4, m2
672
paddd m3, m0
673
%endif
674
add r6, mmsize
675
jl .loopx
676
%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
677
jz .no_overread
678
pcmpeqb xm1, xm1
679
pandn m0, m1, m0 ; zero the lower half
680
pandn m2, m1, m2
681
psubd m3, m0
682
psubd m4, m2
683
.no_overread:
684
%endif
685
add r0, r1
686
add r2, r3
687
dec r5d
688
jg .loopy
689
mov r3, r6m
690
mov r4, r7m
691
HADDD m3, m0
692
HADDD m4, m0
693
pxor xm0, xm0
694
punpckldq xm3, xm0
695
punpckldq xm4, xm0
696
movq [r3], xm3
697
movq [r4], xm4
698
RET
699
%endmacro ; SSD_NV12
700
%endif ; !HIGH_BIT_DEPTH
701
702
INIT_MMX mmx2
703
SSD_NV12
704
INIT_XMM sse2
705
SSD_NV12
706
INIT_XMM avx
707
SSD_NV12
708
INIT_XMM xop
709
SSD_NV12
710
INIT_YMM avx2
711
SSD_NV12
712
713
;=============================================================================
714
; variance
715
;=============================================================================
716
717
%macro VAR_START 1
718
pxor m5, m5 ; sum
719
pxor m6, m6 ; sum squared
720
%if HIGH_BIT_DEPTH == 0
721
%if %1
722
mova m7, [pw_00ff]
723
%elif mmsize < 32
724
pxor m7, m7 ; zero
725
%endif
726
%endif ; !HIGH_BIT_DEPTH
727
%endmacro
728
729
%macro VAR_END 2
730
%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
731
HADDUW m5, m2
732
%else
733
HADDW m5, m2
734
%endif
735
HADDD m6, m1
736
%if ARCH_X86_64
737
punpckldq m5, m6
738
movq rax, m5
739
%else
740
movd eax, m5
741
movd edx, m6
742
%endif
743
RET
744
%endmacro
745
746
%macro VAR_CORE 0
747
paddw m5, m0
748
paddw m5, m3
749
paddw m5, m1
750
paddw m5, m4
751
pmaddwd m0, m0
752
pmaddwd m3, m3
753
pmaddwd m1, m1
754
pmaddwd m4, m4
755
paddd m6, m0
756
paddd m6, m3
757
paddd m6, m1
758
paddd m6, m4
759
%endmacro
760
761
%macro VAR_2ROW 2
762
mov r2d, %2
763
.loop:
764
%if HIGH_BIT_DEPTH
765
mova m0, [r0]
766
mova m1, [r0+mmsize]
767
mova m3, [r0+%1]
768
mova m4, [r0+%1+mmsize]
769
%else ; !HIGH_BIT_DEPTH
770
mova m0, [r0]
771
mova m3, [r0+%1]
772
punpckhbw m1, m0, m7
773
punpcklbw m0, m7
774
punpckhbw m4, m3, m7
775
punpcklbw m3, m7
776
%endif ; HIGH_BIT_DEPTH
777
%ifidn %1, r1
778
lea r0, [r0+%1*2]
779
%else
780
add r0, r1
781
%endif
782
VAR_CORE
783
dec r2d
784
jg .loop
785
%endmacro
786
787
;-----------------------------------------------------------------------------
788
; int pixel_var_wxh( uint8_t *, intptr_t )
789
;-----------------------------------------------------------------------------
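;
; Despite the "int" in the prototype comment, VAR_END packs two 32-bit results
; into the return value: the pixel sum in the low half and the sum of squares
; in the high half (eax/edx on x86_32). A minimal scalar sketch, with an
; assumed name and 8-bit pixels for brevity (needs <stdint.h>):
;
;   static uint64_t var_wxh( const uint8_t *pix, intptr_t i_stride, int w, int h )
;   {
;       uint32_t sum = 0, sqr = 0;
;       for( int y = 0; y < h; y++, pix += i_stride )
;           for( int x = 0; x < w; x++ )
;           {
;               sum += pix[x];
;               sqr += pix[x] * pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }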
790
INIT_MMX mmx2
791
cglobal pixel_var_16x16, 2,3
792
FIX_STRIDES r1
793
VAR_START 0
794
VAR_2ROW 8*SIZEOF_PIXEL, 16
795
VAR_END 16, 16
796
797
cglobal pixel_var_8x16, 2,3
798
FIX_STRIDES r1
799
VAR_START 0
800
VAR_2ROW r1, 8
801
VAR_END 8, 16
802
803
cglobal pixel_var_8x8, 2,3
804
FIX_STRIDES r1
805
VAR_START 0
806
VAR_2ROW r1, 4
807
VAR_END 8, 8
808
809
%if HIGH_BIT_DEPTH
810
%macro VAR 0
811
cglobal pixel_var_16x16, 2,3,8
812
FIX_STRIDES r1
813
VAR_START 0
814
VAR_2ROW r1, 8
815
VAR_END 16, 16
816
817
cglobal pixel_var_8x8, 2,3,8
818
lea r2, [r1*3]
819
VAR_START 0
820
mova m0, [r0]
821
mova m1, [r0+r1*2]
822
mova m3, [r0+r1*4]
823
mova m4, [r0+r2*2]
824
lea r0, [r0+r1*8]
825
VAR_CORE
826
mova m0, [r0]
827
mova m1, [r0+r1*2]
828
mova m3, [r0+r1*4]
829
mova m4, [r0+r2*2]
830
VAR_CORE
831
VAR_END 8, 8
832
%endmacro ; VAR
833
834
INIT_XMM sse2
835
VAR
836
INIT_XMM avx
837
VAR
838
INIT_XMM xop
839
VAR
840
%endif ; HIGH_BIT_DEPTH
841
842
%if HIGH_BIT_DEPTH == 0
843
%macro VAR 0
844
cglobal pixel_var_16x16, 2,3,8
845
VAR_START 1
846
mov r2d, 8
847
.loop:
848
mova m0, [r0]
849
mova m3, [r0+r1]
850
DEINTB 1, 0, 4, 3, 7
851
lea r0, [r0+r1*2]
852
VAR_CORE
853
dec r2d
854
jg .loop
855
VAR_END 16, 16
856
857
cglobal pixel_var_8x8, 2,4,8
858
VAR_START 1
859
mov r2d, 2
860
lea r3, [r1*3]
861
.loop:
862
movh m0, [r0]
863
movh m3, [r0+r1]
864
movhps m0, [r0+r1*2]
865
movhps m3, [r0+r3]
866
DEINTB 1, 0, 4, 3, 7
867
lea r0, [r0+r1*4]
868
VAR_CORE
869
dec r2d
870
jg .loop
871
VAR_END 8, 8
872
873
cglobal pixel_var_8x16, 2,4,8
874
VAR_START 1
875
mov r2d, 4
876
lea r3, [r1*3]
877
.loop:
878
movh m0, [r0]
879
movh m3, [r0+r1]
880
movhps m0, [r0+r1*2]
881
movhps m3, [r0+r3]
882
DEINTB 1, 0, 4, 3, 7
883
lea r0, [r0+r1*4]
884
VAR_CORE
885
dec r2d
886
jg .loop
887
VAR_END 8, 16
888
%endmacro ; VAR
889
890
INIT_XMM sse2
891
VAR
892
INIT_XMM avx
893
VAR
894
INIT_XMM xop
895
VAR
896
%endif ; !HIGH_BIT_DEPTH
897
898
INIT_YMM avx2
899
cglobal pixel_var_16x16, 2,4,7
900
FIX_STRIDES r1
901
VAR_START 0
902
mov r2d, 4
903
lea r3, [r1*3]
904
.loop:
905
%if HIGH_BIT_DEPTH
906
mova m0, [r0]
907
mova m3, [r0+r1]
908
mova m1, [r0+r1*2]
909
mova m4, [r0+r3]
910
%else
911
pmovzxbw m0, [r0]
912
pmovzxbw m3, [r0+r1]
913
pmovzxbw m1, [r0+r1*2]
914
pmovzxbw m4, [r0+r3]
915
%endif
916
lea r0, [r0+r1*4]
917
VAR_CORE
918
dec r2d
919
jg .loop
920
vextracti128 xm0, m5, 1
921
vextracti128 xm1, m6, 1
922
paddw xm5, xm0
923
paddd xm6, xm1
924
HADDW xm5, xm2
925
HADDD xm6, xm1
926
%if ARCH_X86_64
927
punpckldq xm5, xm6
928
movq rax, xm5
929
%else
930
movd eax, xm5
931
movd edx, xm6
932
%endif
933
RET
934
935
%macro VAR2_END 3
936
HADDW %2, xm1
937
movd r1d, %2
938
imul r1d, r1d
939
HADDD %3, xm1
940
shr r1d, %1
941
movd eax, %3
942
movd [r4], %3
943
sub eax, r1d ; sqr - (sum * sum >> shift)
944
RET
945
%endmacro
946
947
;-----------------------------------------------------------------------------
948
; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
949
;-----------------------------------------------------------------------------
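;
; A scalar sketch of the semantics (assumed name, 8-bit pixels, the fixed-point
; details of the asm ignored): the SSD of the difference block is stored through
; the int* argument and a variance-style value is returned.
;
;   static int var2_8x8( const uint8_t *fenc, intptr_t i_fenc,
;                        const uint8_t *fdec, intptr_t i_fdec, int *ssd )
;   {
;       int sum = 0, sqr = 0;
;       for( int y = 0; y < 8; y++, fenc += i_fenc, fdec += i_fdec )
;           for( int x = 0; x < 8; x++ )
;           {
;               int d = fenc[x] - fdec[x];
;               sum += d;
;               sqr += d * d;
;           }
;       *ssd = sqr;
;       return sqr - ( (sum * sum) >> 6 ); /* 6 == log2(8*8); the 8x16 version uses 7 */
;   }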
950
%macro VAR2_8x8_MMX 2
951
cglobal pixel_var2_8x%1, 5,6
952
FIX_STRIDES r1, r3
953
VAR_START 0
954
mov r5d, %1
955
.loop:
956
%if HIGH_BIT_DEPTH
957
mova m0, [r0]
958
mova m1, [r0+mmsize]
959
psubw m0, [r2]
960
psubw m1, [r2+mmsize]
961
%else ; !HIGH_BIT_DEPTH
962
movq m0, [r0]
963
movq m1, m0
964
movq m2, [r2]
965
movq m3, m2
966
punpcklbw m0, m7
967
punpckhbw m1, m7
968
punpcklbw m2, m7
969
punpckhbw m3, m7
970
psubw m0, m2
971
psubw m1, m3
972
%endif ; HIGH_BIT_DEPTH
973
paddw m5, m0
974
paddw m5, m1
975
pmaddwd m0, m0
976
pmaddwd m1, m1
977
paddd m6, m0
978
paddd m6, m1
979
add r0, r1
980
add r2, r3
981
dec r5d
982
jg .loop
983
VAR2_END %2, m5, m6
984
%endmacro
985
986
%if ARCH_X86_64 == 0
987
INIT_MMX mmx2
988
VAR2_8x8_MMX 8, 6
989
VAR2_8x8_MMX 16, 7
990
%endif
991
992
%macro VAR2_8x8_SSE2 2
993
cglobal pixel_var2_8x%1, 5,6,8
994
VAR_START 1
995
mov r5d, %1/2
996
.loop:
997
%if HIGH_BIT_DEPTH
998
mova m0, [r0]
999
mova m1, [r0+r1*2]
1000
mova m2, [r2]
1001
mova m3, [r2+r3*2]
1002
%else ; !HIGH_BIT_DEPTH
1003
movq m1, [r0]
1004
movhps m1, [r0+r1]
1005
movq m3, [r2]
1006
movhps m3, [r2+r3]
1007
DEINTB 0, 1, 2, 3, 7
1008
%endif ; HIGH_BIT_DEPTH
1009
psubw m0, m2
1010
psubw m1, m3
1011
paddw m5, m0
1012
paddw m5, m1
1013
pmaddwd m0, m0
1014
pmaddwd m1, m1
1015
paddd m6, m0
1016
paddd m6, m1
1017
lea r0, [r0+r1*2*SIZEOF_PIXEL]
1018
lea r2, [r2+r3*2*SIZEOF_PIXEL]
1019
dec r5d
1020
jg .loop
1021
VAR2_END %2, m5, m6
1022
%endmacro
1023
1024
INIT_XMM sse2
1025
VAR2_8x8_SSE2 8, 6
1026
VAR2_8x8_SSE2 16, 7
1027
1028
%if HIGH_BIT_DEPTH == 0
1029
%macro VAR2_8x8_SSSE3 2
1030
cglobal pixel_var2_8x%1, 5,6,8
1031
pxor m5, m5 ; sum
1032
pxor m6, m6 ; sum squared
1033
mova m7, [hsub_mul]
1034
mov r5d, %1/4
1035
.loop:
1036
movq m0, [r0]
1037
movq m2, [r2]
1038
movq m1, [r0+r1]
1039
movq m3, [r2+r3]
1040
lea r0, [r0+r1*2]
1041
lea r2, [r2+r3*2]
1042
punpcklbw m0, m2
1043
punpcklbw m1, m3
1044
movq m2, [r0]
1045
movq m3, [r2]
1046
punpcklbw m2, m3
1047
movq m3, [r0+r1]
1048
movq m4, [r2+r3]
1049
punpcklbw m3, m4
1050
pmaddubsw m0, m7
1051
pmaddubsw m1, m7
1052
pmaddubsw m2, m7
1053
pmaddubsw m3, m7
1054
paddw m5, m0
1055
paddw m5, m1
1056
paddw m5, m2
1057
paddw m5, m3
1058
pmaddwd m0, m0
1059
pmaddwd m1, m1
1060
pmaddwd m2, m2
1061
pmaddwd m3, m3
1062
paddd m6, m0
1063
paddd m6, m1
1064
paddd m6, m2
1065
paddd m6, m3
1066
lea r0, [r0+r1*2]
1067
lea r2, [r2+r3*2]
1068
dec r5d
1069
jg .loop
1070
VAR2_END %2, m5, m6
1071
%endmacro
1072
1073
INIT_XMM ssse3
1074
VAR2_8x8_SSSE3 8, 6
1075
VAR2_8x8_SSSE3 16, 7
1076
INIT_XMM xop
1077
VAR2_8x8_SSSE3 8, 6
1078
VAR2_8x8_SSSE3 16, 7
1079
1080
%macro VAR2_8x8_AVX2 2
1081
cglobal pixel_var2_8x%1, 5,6,6
1082
pxor m3, m3 ; sum
1083
pxor m4, m4 ; sum squared
1084
mova m5, [hsub_mul]
1085
mov r5d, %1/4
1086
.loop:
1087
movq xm0, [r0]
1088
movq xm1, [r2]
1089
vinserti128 m0, m0, [r0+r1], 1
1090
vinserti128 m1, m1, [r2+r3], 1
1091
lea r0, [r0+r1*2]
1092
lea r2, [r2+r3*2]
1093
punpcklbw m0, m1
1094
movq xm1, [r0]
1095
movq xm2, [r2]
1096
vinserti128 m1, m1, [r0+r1], 1
1097
vinserti128 m2, m2, [r2+r3], 1
1098
lea r0, [r0+r1*2]
1099
lea r2, [r2+r3*2]
1100
punpcklbw m1, m2
1101
pmaddubsw m0, m5
1102
pmaddubsw m1, m5
1103
paddw m3, m0
1104
paddw m3, m1
1105
pmaddwd m0, m0
1106
pmaddwd m1, m1
1107
paddd m4, m0
1108
paddd m4, m1
1109
dec r5d
1110
jg .loop
1111
vextracti128 xm0, m3, 1
1112
vextracti128 xm1, m4, 1
1113
paddw xm3, xm0
1114
paddd xm4, xm1
1115
VAR2_END %2, xm3, xm4
1116
%endmacro
1117
1118
INIT_YMM avx2
1119
VAR2_8x8_AVX2 8, 6
1120
VAR2_8x8_AVX2 16, 7
1121
1122
%endif ; !HIGH_BIT_DEPTH
1123
1124
;=============================================================================
1125
; SATD
1126
;=============================================================================
1127
1128
%macro JDUP 2
1129
%if cpuflag(sse4)
1130
; just use shufps on anything post conroe
1131
shufps %1, %2, 0
1132
%elif cpuflag(ssse3) && notcpuflag(atom)
1133
; join 2x 32 bit and duplicate them
1134
; emulating shufps is faster on conroe
1135
punpcklqdq %1, %2
1136
movsldup %1, %1
1137
%else
1138
; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
1139
punpckldq %1, %2
1140
%endif
1141
%endmacro
1142
1143
%macro HSUMSUB 5
1144
pmaddubsw m%2, m%5
1145
pmaddubsw m%1, m%5
1146
pmaddubsw m%4, m%5
1147
pmaddubsw m%3, m%5
1148
%endmacro
1149
1150
%macro DIFF_UNPACK_SSE2 5
1151
punpcklbw m%1, m%5
1152
punpcklbw m%2, m%5
1153
punpcklbw m%3, m%5
1154
punpcklbw m%4, m%5
1155
psubw m%1, m%2
1156
psubw m%3, m%4
1157
%endmacro
1158
1159
%macro DIFF_SUMSUB_SSSE3 5
1160
HSUMSUB %1, %2, %3, %4, %5
1161
psubw m%1, m%2
1162
psubw m%3, m%4
1163
%endmacro
1164
1165
%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
1166
movd %1, %3
1167
movd %2, %4
1168
JDUP %1, %2
1169
%endmacro
1170
1171
%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
1172
movddup m%3, %6
1173
movddup m%4, %8
1174
movddup m%1, %5
1175
movddup m%2, %7
1176
%endmacro
1177
1178
%macro LOAD_DUP_4x8P_PENRYN 8
1179
; penryn and nehalem run punpcklqdq and movddup in different units
1180
movh m%3, %6
1181
movh m%4, %8
1182
punpcklqdq m%3, m%3
1183
movddup m%1, %5
1184
punpcklqdq m%4, m%4
1185
movddup m%2, %7
1186
%endmacro
1187
1188
%macro LOAD_SUMSUB_8x2P 9
1189
LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
1190
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1191
%endmacro
1192
1193
%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
1194
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1195
LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
1196
LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
1197
%if %10
1198
lea %8, [%8+4*r1]
1199
lea %9, [%9+4*r3]
1200
%endif
1201
%endmacro
1202
1203
%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
1204
movddup m%1, [%7]
1205
movddup m%2, [%7+8]
1206
mova m%4, [%6]
1207
movddup m%3, m%4
1208
punpckhqdq m%4, m%4
1209
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1210
%endmacro
1211
1212
%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
1213
movu m%4, [%7]
1214
mova m%2, [%6]
1215
DEINTB %1, %2, %3, %4, %5
1216
psubw m%1, m%3
1217
psubw m%2, m%4
1218
SUMSUB_BA w, %1, %2, %3
1219
%endmacro
1220
1221
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
1222
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
1223
LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
1224
LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
1225
LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
1226
LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
1227
%endmacro
1228
1229
%macro LOAD_SUMSUB_16x2P_AVX2 9
1230
; 2*dst, 2*tmp, mul, 4*ptr
1231
vbroadcasti128 m%1, [%6]
1232
vbroadcasti128 m%3, [%7]
1233
vbroadcasti128 m%2, [%8]
1234
vbroadcasti128 m%4, [%9]
1235
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1236
%endmacro
1237
1238
%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
1239
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1240
LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
1241
LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
1242
%if %10
1243
lea %8, [%8+4*r1]
1244
lea %9, [%9+4*r3]
1245
%endif
1246
%endmacro
1247
1248
%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
1249
mova xm%3, %6
1250
mova xm%4, %8
1251
mova xm%1, %5
1252
mova xm%2, %7
1253
vpermq m%3, m%3, q0011
1254
vpermq m%4, m%4, q0011
1255
vpermq m%1, m%1, q0011
1256
vpermq m%2, m%2, q0011
1257
%endmacro
1258
1259
%macro LOAD_SUMSUB8_16x2P_AVX2 9
1260
; 2*dst, 2*tmp, mul, 4*ptr
1261
LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
1262
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
1263
%endmacro
1264
1265
%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
1266
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
1267
LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
1268
LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
1269
%if %10
1270
lea %8, [%8+4*r1]
1271
lea %9, [%9+4*r3]
1272
%endif
1273
%endmacro
1274
1275
; in: r4=3*stride1, r5=3*stride2
1276
; in: %2 = horizontal offset
1277
; in: %3 = whether we need to increment pix1 and pix2
1278
; clobber: m3..m7
1279
; out: %1 = satd
1280
%macro SATD_4x4_MMX 3
1281
%xdefine %%n nn%1
1282
%assign offset %2*SIZEOF_PIXEL
1283
LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
1284
LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
1285
LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
1286
LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
1287
%if %3
1288
lea r0, [r0+4*r1]
1289
lea r2, [r2+4*r3]
1290
%endif
1291
HADAMARD4_2D 4, 5, 6, 7, 3, %%n
1292
paddw m4, m6
1293
SWAP %%n, 4
1294
%endmacro
1295
1296
; in: %1 = horizontal if 0, vertical if 1
1297
%macro SATD_8x4_SSE 8-9
1298
%if %1
1299
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
1300
%else
1301
HADAMARD4_V %2, %3, %4, %5, %6
1302
; doing the abs first is a slight advantage
1303
ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
1304
ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
1305
HADAMARD 1, max, %2, %4, %6, %7
1306
%endif
1307
%ifnidn %9, swap
1308
paddw m%8, m%2
1309
%else
1310
SWAP %8, %2
1311
%endif
1312
%if %1
1313
paddw m%8, m%4
1314
%else
1315
HADAMARD 1, max, %3, %5, %6, %7
1316
paddw m%8, m%3
1317
%endif
1318
%endmacro
1319
1320
%macro SATD_START_MMX 0
1321
FIX_STRIDES r1, r3
1322
lea r4, [3*r1] ; 3*stride1
1323
lea r5, [3*r3] ; 3*stride2
1324
%endmacro
1325
1326
%macro SATD_END_MMX 0
1327
%if HIGH_BIT_DEPTH
1328
HADDUW m0, m1
1329
movd eax, m0
1330
%else ; !HIGH_BIT_DEPTH
1331
pshufw m1, m0, q1032
1332
paddw m0, m1
1333
pshufw m1, m0, q2301
1334
paddw m0, m1
1335
movd eax, m0
1336
and eax, 0xffff
1337
%endif ; HIGH_BIT_DEPTH
1338
RET
1339
%endmacro
1340
1341
; FIXME avoid the spilling of regs to hold 3*stride.
1342
; for small blocks on x86_32, modify pixel pointer instead.
1343
1344
;-----------------------------------------------------------------------------
1345
; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1346
;-----------------------------------------------------------------------------
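;
; SATD is the sum of absolute values of the 2D Hadamard transform of the 4x4
; difference block, halved. A scalar sketch (assumed names, not the packed
; sum2_t implementation from pixel.c; needs <stdint.h> and <stdlib.h>):
;
;   #define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
;       int t0 = s0 + s1, t1 = s0 - s1, t2 = s2 + s3, t3 = s2 - s3;\
;       d0 = t0 + t2; d2 = t0 - t2; d1 = t1 + t3; d3 = t1 - t3; }
;
;   static int satd_4x4( const uint8_t *a, intptr_t ia, const uint8_t *b, intptr_t ib )
;   {
;       int d[4][4], sum = 0;
;       for( int y = 0; y < 4; y++, a += ia, b += ib )
;           for( int x = 0; x < 4; x++ )
;               d[y][x] = a[x] - b[x];
;       for( int y = 0; y < 4; y++ ) /* transform rows */
;           HADAMARD4( d[y][0], d[y][1], d[y][2], d[y][3],
;                      d[y][0], d[y][1], d[y][2], d[y][3] );
;       for( int x = 0; x < 4; x++ ) /* transform columns */
;           HADAMARD4( d[0][x], d[1][x], d[2][x], d[3][x],
;                      d[0][x], d[1][x], d[2][x], d[3][x] );
;       for( int y = 0; y < 4; y++ )
;           for( int x = 0; x < 4; x++ )
;               sum += abs( d[y][x] );
;       return sum >> 1;
;   }
;
; Larger sizes are sums of 4x4 (or 8x4) transforms; the asm folds the final
; halving into the abs/max step, using |a+b| + |a-b| == 2*max(|a|,|b|).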
1347
INIT_MMX mmx2
1348
cglobal pixel_satd_16x4_internal
1349
SATD_4x4_MMX m2, 0, 0
1350
SATD_4x4_MMX m1, 4, 0
1351
paddw m0, m2
1352
SATD_4x4_MMX m2, 8, 0
1353
paddw m0, m1
1354
SATD_4x4_MMX m1, 12, 0
1355
paddw m0, m2
1356
paddw m0, m1
1357
ret
1358
1359
cglobal pixel_satd_8x8_internal
1360
SATD_4x4_MMX m2, 0, 0
1361
SATD_4x4_MMX m1, 4, 1
1362
paddw m0, m2
1363
paddw m0, m1
1364
pixel_satd_8x4_internal_mmx2:
1365
SATD_4x4_MMX m2, 0, 0
1366
SATD_4x4_MMX m1, 4, 0
1367
paddw m0, m2
1368
paddw m0, m1
1369
ret
1370
1371
%if HIGH_BIT_DEPTH
1372
%macro SATD_MxN_MMX 3
1373
cglobal pixel_satd_%1x%2, 4,7
1374
SATD_START_MMX
1375
pxor m0, m0
1376
call pixel_satd_%1x%3_internal_mmx2
1377
HADDUW m0, m1
1378
movd r6d, m0
1379
%rep %2/%3-1
1380
pxor m0, m0
1381
lea r0, [r0+4*r1]
1382
lea r2, [r2+4*r3]
1383
call pixel_satd_%1x%3_internal_mmx2
1384
movd m2, r4
1385
HADDUW m0, m1
1386
movd r4, m0
1387
add r6, r4
1388
movd r4, m2
1389
%endrep
1390
movifnidn eax, r6d
1391
RET
1392
%endmacro
1393
1394
SATD_MxN_MMX 16, 16, 4
1395
SATD_MxN_MMX 16, 8, 4
1396
SATD_MxN_MMX 8, 16, 8
1397
%endif ; HIGH_BIT_DEPTH
1398
1399
%if HIGH_BIT_DEPTH == 0
1400
cglobal pixel_satd_16x16, 4,6
1401
SATD_START_MMX
1402
pxor m0, m0
1403
%rep 3
1404
call pixel_satd_16x4_internal_mmx2
1405
lea r0, [r0+4*r1]
1406
lea r2, [r2+4*r3]
1407
%endrep
1408
call pixel_satd_16x4_internal_mmx2
1409
HADDUW m0, m1
1410
movd eax, m0
1411
RET
1412
1413
cglobal pixel_satd_16x8, 4,6
1414
SATD_START_MMX
1415
pxor m0, m0
1416
call pixel_satd_16x4_internal_mmx2
1417
lea r0, [r0+4*r1]
1418
lea r2, [r2+4*r3]
1419
call pixel_satd_16x4_internal_mmx2
1420
SATD_END_MMX
1421
1422
cglobal pixel_satd_8x16, 4,6
1423
SATD_START_MMX
1424
pxor m0, m0
1425
call pixel_satd_8x8_internal_mmx2
1426
lea r0, [r0+4*r1]
1427
lea r2, [r2+4*r3]
1428
call pixel_satd_8x8_internal_mmx2
1429
SATD_END_MMX
1430
%endif ; !HIGH_BIT_DEPTH
1431
1432
cglobal pixel_satd_8x8, 4,6
1433
SATD_START_MMX
1434
pxor m0, m0
1435
call pixel_satd_8x8_internal_mmx2
1436
SATD_END_MMX
1437
1438
cglobal pixel_satd_8x4, 4,6
1439
SATD_START_MMX
1440
pxor m0, m0
1441
call pixel_satd_8x4_internal_mmx2
1442
SATD_END_MMX
1443
1444
cglobal pixel_satd_4x16, 4,6
1445
SATD_START_MMX
1446
SATD_4x4_MMX m0, 0, 1
1447
SATD_4x4_MMX m1, 0, 1
1448
paddw m0, m1
1449
SATD_4x4_MMX m1, 0, 1
1450
paddw m0, m1
1451
SATD_4x4_MMX m1, 0, 0
1452
paddw m0, m1
1453
SATD_END_MMX
1454
1455
cglobal pixel_satd_4x8, 4,6
1456
SATD_START_MMX
1457
SATD_4x4_MMX m0, 0, 1
1458
SATD_4x4_MMX m1, 0, 0
1459
paddw m0, m1
1460
SATD_END_MMX
1461
1462
cglobal pixel_satd_4x4, 4,6
1463
SATD_START_MMX
1464
SATD_4x4_MMX m0, 0, 0
1465
SATD_END_MMX
1466
1467
%macro SATD_START_SSE2 2-3 0
1468
FIX_STRIDES r1, r3
1469
%if HIGH_BIT_DEPTH && %3
1470
pxor %2, %2
1471
%elif cpuflag(ssse3) && notcpuflag(atom)
1472
%if mmsize==32
1473
mova %2, [hmul_16p]
1474
%else
1475
mova %2, [hmul_8p]
1476
%endif
1477
%endif
1478
lea r4, [3*r1]
1479
lea r5, [3*r3]
1480
pxor %1, %1
1481
%endmacro
1482
1483
%macro SATD_END_SSE2 1-2
1484
%if HIGH_BIT_DEPTH
1485
HADDUW %1, xm0
1486
%if %0 == 2
1487
paddd %1, %2
1488
%endif
1489
%else
1490
HADDW %1, xm7
1491
%endif
1492
movd eax, %1
1493
RET
1494
%endmacro
1495
1496
%macro SATD_ACCUM 3
1497
%if HIGH_BIT_DEPTH
1498
HADDUW %1, %2
1499
paddd %3, %1
1500
pxor %1, %1
1501
%endif
1502
%endmacro
1503
1504
%macro BACKUP_POINTERS 0
1505
%if ARCH_X86_64
1506
%if WIN64
1507
PUSH r7
1508
%endif
1509
mov r6, r0
1510
mov r7, r2
1511
%endif
1512
%endmacro
1513
1514
%macro RESTORE_AND_INC_POINTERS 0
1515
%if ARCH_X86_64
1516
lea r0, [r6+8*SIZEOF_PIXEL]
1517
lea r2, [r7+8*SIZEOF_PIXEL]
1518
%if WIN64
1519
POP r7
1520
%endif
1521
%else
1522
mov r0, r0mp
1523
mov r2, r2mp
1524
add r0, 8*SIZEOF_PIXEL
1525
add r2, 8*SIZEOF_PIXEL
1526
%endif
1527
%endmacro
1528
1529
%macro SATD_4x8_SSE 3
1530
%if HIGH_BIT_DEPTH
1531
movh m0, [r0+0*r1]
1532
movh m4, [r2+0*r3]
1533
movh m1, [r0+1*r1]
1534
movh m5, [r2+1*r3]
1535
movhps m0, [r0+4*r1]
1536
movhps m4, [r2+4*r3]
1537
movh m2, [r0+2*r1]
1538
movh m6, [r2+2*r3]
1539
psubw m0, m4
1540
movh m3, [r0+r4]
1541
movh m4, [r2+r5]
1542
lea r0, [r0+4*r1]
1543
lea r2, [r2+4*r3]
1544
movhps m1, [r0+1*r1]
1545
movhps m5, [r2+1*r3]
1546
movhps m2, [r0+2*r1]
1547
movhps m6, [r2+2*r3]
1548
psubw m1, m5
1549
movhps m3, [r0+r4]
1550
movhps m4, [r2+r5]
1551
psubw m2, m6
1552
psubw m3, m4
1553
%else ; !HIGH_BIT_DEPTH
1554
movd m4, [r2]
1555
movd m5, [r2+r3]
1556
movd m6, [r2+2*r3]
1557
add r2, r5
1558
movd m0, [r0]
1559
movd m1, [r0+r1]
1560
movd m2, [r0+2*r1]
1561
add r0, r4
1562
movd m3, [r2+r3]
1563
JDUP m4, m3
1564
movd m3, [r0+r1]
1565
JDUP m0, m3
1566
movd m3, [r2+2*r3]
1567
JDUP m5, m3
1568
movd m3, [r0+2*r1]
1569
JDUP m1, m3
1570
%if %1==0 && %2==1
1571
mova m3, [hmul_4p]
1572
DIFFOP 0, 4, 1, 5, 3
1573
%else
1574
DIFFOP 0, 4, 1, 5, 7
1575
%endif
1576
movd m5, [r2]
1577
add r2, r5
1578
movd m3, [r0]
1579
add r0, r4
1580
movd m4, [r2]
1581
JDUP m6, m4
1582
movd m4, [r0]
1583
JDUP m2, m4
1584
movd m4, [r2+r3]
1585
JDUP m5, m4
1586
movd m4, [r0+r1]
1587
JDUP m3, m4
1588
%if %1==0 && %2==1
1589
mova m4, [hmul_4p]
1590
DIFFOP 2, 6, 3, 5, 4
1591
%else
1592
DIFFOP 2, 6, 3, 5, 7
1593
%endif
1594
%endif ; HIGH_BIT_DEPTH
1595
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
1596
%endmacro
1597
1598
;-----------------------------------------------------------------------------
1599
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
1600
;-----------------------------------------------------------------------------
1601
%macro SATDS_SSE2 0
1602
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
1603
1604
%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
1605
cglobal pixel_satd_4x4, 4, 6, 6
1606
SATD_START_MMX
1607
mova m4, [hmul_4p]
1608
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
1609
LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
1610
LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
1611
LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
1612
DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
1613
HADAMARD 0, sumsub, 0, 1, 2, 3
1614
HADAMARD 4, sumsub, 0, 1, 2, 3
1615
HADAMARD 1, amax, 0, 1, 2, 3
1616
HADDW m0, m1
1617
movd eax, m0
1618
RET
1619
%endif
1620
1621
cglobal pixel_satd_4x8, 4, 6, 8
1622
SATD_START_MMX
1623
%if vertical==0
1624
mova m7, [hmul_4p]
1625
%endif
1626
SATD_4x8_SSE vertical, 0, swap
1627
HADDW m7, m1
1628
movd eax, m7
1629
RET
1630
1631
cglobal pixel_satd_4x16, 4, 6, 8
1632
SATD_START_MMX
1633
%if vertical==0
1634
mova m7, [hmul_4p]
1635
%endif
1636
SATD_4x8_SSE vertical, 0, swap
1637
lea r0, [r0+r1*2*SIZEOF_PIXEL]
1638
lea r2, [r2+r3*2*SIZEOF_PIXEL]
1639
SATD_4x8_SSE vertical, 1, add
1640
HADDW m7, m1
1641
movd eax, m7
1642
RET
1643
1644
cglobal pixel_satd_8x8_internal
1645
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
1646
SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
1647
%%pixel_satd_8x4_internal:
1648
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
1649
SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
1650
ret
1651
1652
; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
1653
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
1654
%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
1655
cglobal pixel_satd_16x4_internal
1656
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
1657
lea r2, [r2+4*r3]
1658
lea r0, [r0+4*r1]
1659
; always use horizontal mode here
1660
SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
1661
SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
1662
ret
1663
1664
cglobal pixel_satd_16x8, 4,6,12
1665
SATD_START_SSE2 m10, m7
1666
%if vertical
1667
mova m7, [pw_00ff]
1668
%endif
1669
jmp %%pixel_satd_16x8_internal
1670
1671
cglobal pixel_satd_16x16, 4,6,12
1672
SATD_START_SSE2 m10, m7
1673
%if vertical
1674
mova m7, [pw_00ff]
1675
%endif
1676
call pixel_satd_16x4_internal
1677
call pixel_satd_16x4_internal
1678
%%pixel_satd_16x8_internal:
1679
call pixel_satd_16x4_internal
1680
call pixel_satd_16x4_internal
1681
SATD_END_SSE2 m10
1682
%else
1683
cglobal pixel_satd_16x8, 4,6,8
1684
SATD_START_SSE2 m6, m7
1685
BACKUP_POINTERS
1686
call pixel_satd_8x8_internal
1687
RESTORE_AND_INC_POINTERS
1688
call pixel_satd_8x8_internal
1689
SATD_END_SSE2 m6
1690
1691
cglobal pixel_satd_16x16, 4,6,8
1692
SATD_START_SSE2 m6, m7, 1
1693
BACKUP_POINTERS
1694
call pixel_satd_8x8_internal
1695
call pixel_satd_8x8_internal
1696
SATD_ACCUM m6, m0, m7
1697
RESTORE_AND_INC_POINTERS
1698
call pixel_satd_8x8_internal
1699
call pixel_satd_8x8_internal
1700
SATD_END_SSE2 m6, m7
1701
%endif
1702
1703
cglobal pixel_satd_8x16, 4,6,8
1704
SATD_START_SSE2 m6, m7
1705
call pixel_satd_8x8_internal
1706
call pixel_satd_8x8_internal
1707
SATD_END_SSE2 m6
1708
1709
cglobal pixel_satd_8x8, 4,6,8
1710
SATD_START_SSE2 m6, m7
1711
call pixel_satd_8x8_internal
1712
SATD_END_SSE2 m6
1713
1714
cglobal pixel_satd_8x4, 4,6,8
1715
SATD_START_SSE2 m6, m7
1716
call %%pixel_satd_8x4_internal
1717
SATD_END_SSE2 m6
1718
%endmacro ; SATDS_SSE2
1719
1720
%macro SA8D_INTER 0
1721
%if ARCH_X86_64
1722
%define lh m10
1723
%define rh m0
1724
%else
1725
%define lh m0
1726
%define rh [esp+48]
1727
%endif
1728
%if HIGH_BIT_DEPTH
1729
HADDUW m0, m1
1730
paddd lh, rh
1731
%else
1732
paddusw lh, rh
1733
%endif ; HIGH_BIT_DEPTH
1734
%endmacro
1735
1736
%macro SA8D 0
1737
; sse2 doesn't seem to like the horizontal way of doing things
1738
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
1739
1740
%if ARCH_X86_64
1741
;-----------------------------------------------------------------------------
1742
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
1743
;-----------------------------------------------------------------------------
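;
; sa8d is the 8x8 analogue of satd: the sum of absolute values of an 8x8 2D
; Hadamard transform of the difference block, normalized as (sum + 2) >> 2
; (the abs/max trick in the asm supplies one halving, the add/shr at the end
; of each wrapper the other). A scalar sketch with assumed names:
;
;   static void hadamard8( int v[8] ) /* unnormalized 8-point WHT, in place */
;   {
;       for( int s = 1; s < 8; s <<= 1 )
;           for( int i = 0; i < 8; i += 2*s )
;               for( int k = i; k < i+s; k++ )
;               {
;                   int u = v[k], w = v[k+s];
;                   v[k]   = u + w;
;                   v[k+s] = u - w;
;               }
;   }
;
;   static int sa8d_8x8( const uint8_t *a, intptr_t ia, const uint8_t *b, intptr_t ib )
;   {
;       int d[8][8], col[8], sum = 0;
;       for( int y = 0; y < 8; y++, a += ia, b += ib )
;           for( int x = 0; x < 8; x++ )
;               d[y][x] = a[x] - b[x];
;       for( int y = 0; y < 8; y++ )
;           hadamard8( d[y] );                 /* transform rows */
;       for( int x = 0; x < 8; x++ )
;       {
;           for( int y = 0; y < 8; y++ )
;               col[y] = d[y][x];
;           hadamard8( col );                  /* transform columns */
;           for( int y = 0; y < 8; y++ )
;               sum += abs( col[y] );
;       }
;       return ( sum + 2 ) >> 2;
;   }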
1744
cglobal pixel_sa8d_8x8_internal
1745
lea r6, [r0+4*r1]
1746
lea r7, [r2+4*r3]
1747
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
1748
LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
1749
%if vertical
1750
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
1751
%else ; non-sse2
1752
HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
1753
%endif
1754
paddw m0, m1
1755
paddw m0, m2
1756
paddw m0, m8
1757
SAVE_MM_PERMUTATION
1758
ret
1759
1760
cglobal pixel_sa8d_8x8, 4,8,12
1761
FIX_STRIDES r1, r3
1762
lea r4, [3*r1]
1763
lea r5, [3*r3]
1764
%if vertical == 0
1765
mova m7, [hmul_8p]
1766
%endif
1767
call pixel_sa8d_8x8_internal
1768
%if HIGH_BIT_DEPTH
1769
HADDUW m0, m1
1770
%else
1771
HADDW m0, m1
1772
%endif ; HIGH_BIT_DEPTH
1773
movd eax, m0
1774
add eax, 1
1775
shr eax, 1
1776
RET
1777
1778
cglobal pixel_sa8d_16x16, 4,8,12
1779
FIX_STRIDES r1, r3
1780
lea r4, [3*r1]
1781
lea r5, [3*r3]
1782
%if vertical == 0
1783
mova m7, [hmul_8p]
1784
%endif
1785
call pixel_sa8d_8x8_internal ; pix[0]
1786
add r2, 8*SIZEOF_PIXEL
1787
add r0, 8*SIZEOF_PIXEL
1788
%if HIGH_BIT_DEPTH
1789
HADDUW m0, m1
1790
%endif
1791
mova m10, m0
1792
call pixel_sa8d_8x8_internal ; pix[8]
1793
lea r2, [r2+8*r3]
1794
lea r0, [r0+8*r1]
1795
SA8D_INTER
1796
call pixel_sa8d_8x8_internal ; pix[8*stride+8]
1797
sub r2, 8*SIZEOF_PIXEL
1798
sub r0, 8*SIZEOF_PIXEL
1799
SA8D_INTER
1800
call pixel_sa8d_8x8_internal ; pix[8*stride]
1801
SA8D_INTER
1802
SWAP 0, 10
1803
%if HIGH_BIT_DEPTH == 0
1804
HADDUW m0, m1
1805
%endif
1806
movd eax, m0
1807
add eax, 1
1808
shr eax, 1
1809
RET
1810
1811
%else ; ARCH_X86_32
1812
%if mmsize == 16
1813
cglobal pixel_sa8d_8x8_internal
1814
%define spill0 [esp+4]
1815
%define spill1 [esp+20]
1816
%define spill2 [esp+36]
1817
%if vertical
1818
LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
1819
HADAMARD4_2D 0, 1, 2, 3, 4
1820
movdqa spill0, m3
1821
LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
1822
HADAMARD4_2D 4, 5, 6, 7, 3
1823
HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
1824
movdqa m3, spill0
1825
paddw m0, m1
1826
HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
1827
%else ; mmsize == 8
1828
mova m7, [hmul_8p]
1829
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
1830
; could do first HADAMARD4_V here to save spilling later
1831
; surprisingly, not a win on conroe or even p4
1832
mova spill0, m2
1833
mova spill1, m3
1834
mova spill2, m1
1835
SWAP 1, 7
1836
LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
1837
HADAMARD4_V 4, 5, 6, 7, 3
1838
mova m1, spill2
1839
mova m2, spill0
1840
mova m3, spill1
1841
mova spill0, m6
1842
mova spill1, m7
1843
HADAMARD4_V 0, 1, 2, 3, 7
1844
SUMSUB_BADC w, 0, 4, 1, 5, 7
1845
HADAMARD 2, sumsub, 0, 4, 7, 6
1846
HADAMARD 2, sumsub, 1, 5, 7, 6
1847
HADAMARD 1, amax, 0, 4, 7, 6
1848
HADAMARD 1, amax, 1, 5, 7, 6
1849
mova m6, spill0
1850
mova m7, spill1
1851
paddw m0, m1
1852
SUMSUB_BADC w, 2, 6, 3, 7, 4
1853
HADAMARD 2, sumsub, 2, 6, 4, 5
1854
HADAMARD 2, sumsub, 3, 7, 4, 5
1855
HADAMARD 1, amax, 2, 6, 4, 5
1856
HADAMARD 1, amax, 3, 7, 4, 5
1857
%endif ; sse2/non-sse2
1858
paddw m0, m2
1859
paddw m0, m3
1860
SAVE_MM_PERMUTATION
1861
ret
1862
%endif ; ifndef mmx2
1863
1864
cglobal pixel_sa8d_8x8, 4,7
1865
FIX_STRIDES r1, r3
1866
mov r6, esp
1867
and esp, ~15
1868
sub esp, 48
1869
lea r4, [3*r1]
1870
lea r5, [3*r3]
1871
call pixel_sa8d_8x8_internal
1872
%if HIGH_BIT_DEPTH
1873
HADDUW m0, m1
1874
%else
1875
HADDW m0, m1
1876
%endif ; HIGH_BIT_DEPTH
1877
movd eax, m0
1878
add eax, 1
1879
shr eax, 1
1880
mov esp, r6
1881
RET
1882
1883
cglobal pixel_sa8d_16x16, 4,7
1884
FIX_STRIDES r1, r3
1885
mov r6, esp
1886
and esp, ~15
1887
sub esp, 64
1888
lea r4, [3*r1]
1889
lea r5, [3*r3]
1890
call pixel_sa8d_8x8_internal
1891
%if mmsize == 8
1892
lea r0, [r0+4*r1]
1893
lea r2, [r2+4*r3]
1894
%endif
1895
%if HIGH_BIT_DEPTH
1896
HADDUW m0, m1
1897
%endif
1898
mova [esp+48], m0
1899
call pixel_sa8d_8x8_internal
1900
mov r0, [r6+20]
1901
mov r2, [r6+28]
1902
add r0, 8*SIZEOF_PIXEL
1903
add r2, 8*SIZEOF_PIXEL
1904
SA8D_INTER
1905
mova [esp+48], m0
1906
call pixel_sa8d_8x8_internal
1907
%if mmsize == 8
1908
lea r0, [r0+4*r1]
1909
lea r2, [r2+4*r3]
1910
%else
1911
SA8D_INTER
1912
%endif
1913
mova [esp+64-mmsize], m0
1914
call pixel_sa8d_8x8_internal
1915
%if HIGH_BIT_DEPTH
1916
SA8D_INTER
1917
%else ; !HIGH_BIT_DEPTH
1918
paddusw m0, [esp+64-mmsize]
1919
%if mmsize == 16
1920
HADDUW m0, m1
1921
%else
1922
mova m2, [esp+48]
1923
pxor m7, m7
1924
mova m1, m0
1925
mova m3, m2
1926
punpcklwd m0, m7
1927
punpckhwd m1, m7
1928
punpcklwd m2, m7
1929
punpckhwd m3, m7
1930
paddd m0, m1
1931
paddd m2, m3
1932
paddd m0, m2
1933
HADDD m0, m1
1934
%endif
1935
%endif ; HIGH_BIT_DEPTH
1936
movd eax, m0
1937
add eax, 1
1938
shr eax, 1
1939
mov esp, r6
1940
RET
1941
%endif ; !ARCH_X86_64
1942
%endmacro ; SA8D
1943
1944
;=============================================================================
1945
; SA8D_SATD
1946
;=============================================================================
1947
1948
; %1: vertical/horizontal mode
1949
; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
1950
; m10: satd result
1951
; m6, m11-15: tmp regs
1952
%macro SA8D_SATD_8x4 5
1953
%if %1
1954
LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
1955
HADAMARD 0, sumsub, %2, %3, 6
1956
HADAMARD 0, sumsub, %4, %5, 6
1957
SBUTTERFLY wd, %2, %3, 6
1958
SBUTTERFLY wd, %4, %5, 6
1959
HADAMARD2_2D %2, %4, %3, %5, 6, dq
1960
1961
mova m12, m%2
1962
mova m13, m%3
1963
mova m14, m%4
1964
mova m15, m%5
1965
HADAMARD 0, sumsub, %2, %3, 6
1966
HADAMARD 0, sumsub, %4, %5, 6
1967
SBUTTERFLY qdq, 12, 13, 6
1968
HADAMARD 0, amax, 12, 13, 6
1969
SBUTTERFLY qdq, 14, 15, 6
1970
paddw m10, m12
1971
HADAMARD 0, amax, 14, 15, 6
1972
paddw m10, m14
1973
%else
1974
LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
1975
HADAMARD4_V %2, %3, %4, %5, 6
1976
1977
pabsw m12, m%2 ; doing the abs first is a slight advantage
1978
pabsw m14, m%4
1979
pabsw m13, m%3
1980
pabsw m15, m%5
1981
HADAMARD 1, max, 12, 14, 6, 11
1982
paddw m10, m12
1983
HADAMARD 1, max, 13, 15, 6, 11
1984
paddw m10, m13
1985
%endif
1986
%endmacro ; SA8D_SATD_8x4
1987
1988
; %1: add spilled regs?
1989
; %2: spill regs?
1990
%macro SA8D_SATD_ACCUM 2
1991
%if HIGH_BIT_DEPTH
1992
pmaddwd m10, [pw_1]
1993
HADDUWD m0, m1
1994
%if %1
1995
paddd m10, temp1
1996
paddd m0, temp0
1997
%endif
1998
%if %2
1999
mova temp1, m10
2000
pxor m10, m10
2001
%endif
2002
%elif %1
2003
paddw m0, temp0
2004
%endif
2005
%if %2
2006
mova temp0, m0
2007
%endif
2008
%endmacro
2009
2010
%macro SA8D_SATD 0
2011
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
2012
cglobal pixel_sa8d_satd_8x8_internal
2013
SA8D_SATD_8x4 vertical, 0, 1, 2, 3
2014
SA8D_SATD_8x4 vertical, 4, 5, 8, 9
2015
2016
%if vertical ; sse2-style
2017
HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
2018
HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
2019
%else ; complete sa8d
2020
SUMSUB_BADC w, 0, 4, 1, 5, 12
2021
HADAMARD 2, sumsub, 0, 4, 12, 11
2022
HADAMARD 2, sumsub, 1, 5, 12, 11
2023
SUMSUB_BADC w, 2, 8, 3, 9, 12
2024
HADAMARD 2, sumsub, 2, 8, 12, 11
2025
HADAMARD 2, sumsub, 3, 9, 12, 11
2026
HADAMARD 1, amax, 0, 4, 12, 11
2027
HADAMARD 1, amax, 1, 5, 12, 4
2028
HADAMARD 1, amax, 2, 8, 12, 4
2029
HADAMARD 1, amax, 3, 9, 12, 4
2030
%endif
2031
2032
; create sa8d sub results
2033
paddw m1, m2
2034
paddw m0, m3
2035
paddw m0, m1
2036
2037
SAVE_MM_PERMUTATION
2038
ret
2039
2040
;-------------------------------------------------------------------------------
2041
; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
2042
;-------------------------------------------------------------------------------
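; The return value packs both metrics into one uint64_t: the normalized sa8d
; cost in the low 32 bits and the satd cost in the high 32 bits (see the
; shl/or sequence just before RET).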
2043
cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
2044
%define temp0 [rsp+0*mmsize]
2045
%define temp1 [rsp+1*mmsize]
2046
FIX_STRIDES r1, r3
2047
%if vertical==0
2048
mova m7, [hmul_8p]
2049
%endif
2050
lea r4, [3*r1]
2051
lea r5, [3*r3]
2052
pxor m10, m10
2053
2054
%if mmsize==32
2055
call pixel_sa8d_satd_8x8_internal
2056
SA8D_SATD_ACCUM 0, 1
2057
call pixel_sa8d_satd_8x8_internal
2058
SA8D_SATD_ACCUM 1, 0
2059
vextracti128 xm1, m0, 1
2060
vextracti128 xm2, m10, 1
2061
paddw xm0, xm1
2062
paddw xm10, xm2
2063
%else
2064
lea r6, [r2+8*SIZEOF_PIXEL]
2065
lea r7, [r0+8*SIZEOF_PIXEL]
2066
2067
call pixel_sa8d_satd_8x8_internal
2068
SA8D_SATD_ACCUM 0, 1
2069
call pixel_sa8d_satd_8x8_internal
2070
SA8D_SATD_ACCUM 1, 1
2071
2072
mov r0, r7
2073
mov r2, r6
2074
2075
call pixel_sa8d_satd_8x8_internal
2076
SA8D_SATD_ACCUM 1, 1
2077
call pixel_sa8d_satd_8x8_internal
2078
SA8D_SATD_ACCUM 1, 0
2079
%endif
2080
2081
; xop already has fast horizontal sums
2082
%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
2083
pmaddwd xm10, [pw_1]
2084
HADDUWD xm0, xm1
2085
phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2
2086
pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1
2087
paddd xm0, xm1 ; sa8d sa8d satd satd
2088
movd r0d, xm0
2089
pextrd eax, xm0, 2
2090
%else
2091
%if HIGH_BIT_DEPTH
2092
HADDD xm0, xm1
2093
HADDD xm10, xm2
2094
%else
2095
HADDUW xm0, xm1
2096
HADDW xm10, xm2
2097
%endif
2098
movd r0d, xm0
2099
movd eax, xm10
2100
%endif
2101
add r0d, 1
2102
shl rax, 32
2103
shr r0d, 1
2104
or rax, r0
2105
RET
2106
%endmacro ; SA8D_SATD
2107
2108
;=============================================================================
2109
; INTRA SATD
2110
;=============================================================================
2111
2112
%macro HSUMSUB2 8
2113
pshufd %4, %2, %7
2114
pshufd %5, %3, %7
2115
%1 %2, %8
2116
%1 %6, %8
2117
paddw %2, %4
2118
paddw %3, %5
2119
%endmacro
2120
2121
; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
2122
; and are only retained for old cpus.
2123
%macro INTRA_SA8D_SSE2 0
2124
%if ARCH_X86_64
2125
;-----------------------------------------------------------------------------
2126
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
2127
;-----------------------------------------------------------------------------
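; The three costs are written in the order res[0] = i8x8 vertical,
; res[1] = i8x8 horizontal, res[2] = i8x8 dc (see the i8x8_v/i8x8_h/i8x8_dc
; stores at the end of the routine), matching intra_satd_x3_4x4 below.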
2128
cglobal intra_sa8d_x3_8x8, 3,3,14
2129
; 8x8 hadamard
2130
pxor m8, m8
2131
movq m0, [r0+0*FENC_STRIDE]
2132
movq m1, [r0+1*FENC_STRIDE]
2133
movq m2, [r0+2*FENC_STRIDE]
2134
movq m3, [r0+3*FENC_STRIDE]
2135
movq m4, [r0+4*FENC_STRIDE]
2136
movq m5, [r0+5*FENC_STRIDE]
2137
movq m6, [r0+6*FENC_STRIDE]
2138
movq m7, [r0+7*FENC_STRIDE]
2139
punpcklbw m0, m8
2140
punpcklbw m1, m8
2141
punpcklbw m2, m8
2142
punpcklbw m3, m8
2143
punpcklbw m4, m8
2144
punpcklbw m5, m8
2145
punpcklbw m6, m8
2146
punpcklbw m7, m8
2147
2148
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
2149
2150
ABSW2 m8, m9, m2, m3, m2, m3
2151
ABSW2 m10, m11, m4, m5, m4, m5
2152
paddusw m8, m10
2153
paddusw m9, m11
2154
ABSW2 m10, m11, m6, m7, m6, m7
2155
ABSW m13, m1, m1
2156
paddusw m10, m11
2157
paddusw m8, m9
2158
paddusw m13, m10
2159
paddusw m13, m8
2160
2161
; 1D hadamard of edges
2162
movq m8, [r1+7]
2163
movq m9, [r1+16]
2164
pxor m10, m10
2165
punpcklbw m8, m10
2166
punpcklbw m9, m10
2167
HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
2168
HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
2169
pshuflw m10, m8, q2301
2170
pshuflw m11, m9, q2301
2171
pshufhw m10, m10, q2301
2172
pshufhw m11, m11, q2301
2173
pmullw m8, [pw_pmpmpmpm]
2174
pmullw m11, [pw_pmpmpmpm]
2175
paddw m8, m10
2176
paddw m9, m11
2177
2178
; differences
2179
paddw m10, m8, m9
2180
paddw m10, [pw_8]
2181
pand m10, [sw_f0]
2182
psllw m10, 2 ; dc
2183
2184
psllw m8, 3 ; left edge
2185
psubw m8, m0
2186
psubw m10, m0
2187
ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
2188
paddusw m8, m13
2189
paddusw m13, m10
2190
punpcklwd m0, m1
2191
punpcklwd m2, m3
2192
punpcklwd m4, m5
2193
punpcklwd m6, m7
2194
punpckldq m0, m2
2195
punpckldq m4, m6
2196
punpcklqdq m0, m4 ; transpose
2197
psllw m9, 3 ; top edge
2198
psrldq m2, m13, 2 ; 8x7 sum
2199
psubw m0, m9 ; 8x1 sum
2200
ABSW m0, m0, m9
2201
paddusw m2, m0
2202
2203
; 3x HADDW
2204
movdqa m7, [pw_1]
2205
pmaddwd m2, m7
2206
pmaddwd m8, m7
2207
pmaddwd m13, m7
2208
punpckhdq m3, m2, m8
2209
punpckldq m2, m8
2210
pshufd m5, m13, q3311
2211
paddd m2, m3
2212
paddd m5, m13
2213
punpckhqdq m0, m2, m5
2214
punpcklqdq m2, m5
2215
pavgw m0, m2
2216
pxor m1, m1
2217
pavgw m0, m1
2218
movq [r2], m0 ; i8x8_v, i8x8_h
2219
psrldq m0, 8
2220
movd [r2+8], m0 ; i8x8_dc
2221
RET
2222
%endif ; ARCH_X86_64
2223
%endmacro ; INTRA_SA8D_SSE2
2224
2225
; in: r0 = fenc
2226
; out: m0..m3 = hadamard coefs
2227
INIT_MMX
2228
cglobal hadamard_load
2229
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
2230
%if HIGH_BIT_DEPTH
2231
mova m0, [r0+0*FENC_STRIDEB]
2232
mova m1, [r0+1*FENC_STRIDEB]
2233
mova m2, [r0+2*FENC_STRIDEB]
2234
mova m3, [r0+3*FENC_STRIDEB]
2235
%else
2236
pxor m7, m7
2237
movd m0, [r0+0*FENC_STRIDE]
2238
movd m1, [r0+1*FENC_STRIDE]
2239
movd m2, [r0+2*FENC_STRIDE]
2240
movd m3, [r0+3*FENC_STRIDE]
2241
punpcklbw m0, m7
2242
punpcklbw m1, m7
2243
punpcklbw m2, m7
2244
punpcklbw m3, m7
2245
%endif
2246
HADAMARD4_2D 0, 1, 2, 3, 4
2247
SAVE_MM_PERMUTATION
2248
ret
2249
2250
%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
2251
%ifidn %1, top
2252
%if HIGH_BIT_DEPTH
2253
mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
2254
%else
2255
movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
2256
pxor %5, %5
2257
punpcklbw %3, %5
2258
%endif
2259
%else ; left
2260
%ifnidn %2, 0
2261
shl %2d, 5 ; log(FDEC_STRIDEB)
2262
%endif
2263
movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
2264
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
2265
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
2266
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
2267
%if HIGH_BIT_DEPTH == 0
2268
psrlw %3, 8
2269
%endif
2270
%ifnidn %2, 0
2271
shr %2d, 5
2272
%endif
2273
%endif ; direction
2274
%if cpuflag(ssse3)
2275
%define %%sign psignw
2276
%else
2277
%define %%sign pmullw
2278
%endif
2279
pshufw %4, %3, q1032
2280
%%sign %4, [pw_ppmmppmm]
2281
paddw %3, %4
2282
pshufw %4, %3, q2301
2283
%%sign %4, [pw_pmpmpmpm]
2284
paddw %3, %4
2285
psllw %3, 2
2286
mova [%1_1d+2*%2], %3
2287
%endmacro
2288
2289
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
2290
pxor %7, %7
2291
pshufw %4, %1, q1032
2292
pshufw %5, %2, q1032
2293
pshufw %6, %3, q1032
2294
paddw %1, %4
2295
paddw %2, %5
2296
paddw %3, %6
2297
punpcklwd %1, %7
2298
punpcklwd %2, %7
2299
punpcklwd %3, %7
2300
pshufw %4, %1, q1032
2301
pshufw %5, %2, q1032
2302
pshufw %6, %3, q1032
2303
%8 %1, %4
2304
%8 %2, %5
2305
%8 %3, %6
2306
%endmacro
2307
2308
; in: m1..m3
2309
; out: m7
2310
; clobber: m4..m6
2311
%macro SUM3x4 0
2312
ABSW2 m4, m5, m1, m2, m1, m2
2313
ABSW m7, m3, m3
2314
paddw m4, m5
2315
paddw m7, m4
2316
%endmacro
2317
2318
; in: m0..m3 (4x4)
2319
; out: m0 v, m4 h, m5 dc
2320
; clobber: m1..m3
2321
%macro SUM4x3 3 ; dc, left, top
2322
movq m4, %2
2323
%ifid %1
2324
movq m5, %1
2325
%else
2326
movd m5, %1
2327
%endif
2328
psubw m4, m0
2329
psubw m5, m0
2330
punpcklwd m0, m1
2331
punpcklwd m2, m3
2332
punpckldq m0, m2 ; transpose
2333
psubw m0, %3
2334
ABSW2 m4, m5, m4, m5, m2, m3 ; 1x4 sum
2335
ABSW m0, m0, m1 ; 4x1 sum
2336
%endmacro
2337
2338
%macro INTRA_X3_MMX 0
2339
;-----------------------------------------------------------------------------
2340
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
2341
;-----------------------------------------------------------------------------
2342
cglobal intra_satd_x3_4x4, 3,3
2343
%if UNIX64
2344
; stack is 16 byte aligned because abi says so
2345
%define top_1d rsp-8 ; size 8
2346
%define left_1d rsp-16 ; size 8
2347
%else
2348
; WIN64: stack is 16 byte aligned because abi says so
2349
; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
2350
SUB rsp, 16
2351
%define top_1d rsp+8
2352
%define left_1d rsp
2353
%endif
2354
2355
call hadamard_load
2356
SCALAR_HADAMARD left, 0, m4, m5
2357
SCALAR_HADAMARD top, 0, m6, m5, m7
2358
paddw m6, m4
2359
pavgw m6, [pw_16]
2360
pand m6, [sw_f0] ; dc
2361
2362
SUM3x4
2363
SUM4x3 m6, [left_1d], [top_1d]
2364
paddw m4, m7
2365
paddw m5, m7
2366
movq m1, m5
2367
psrlq m1, 16 ; 4x3 sum
2368
paddw m0, m1
2369
2370
SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
2371
movd [r2+0], m0 ; i4x4_v satd
2372
movd [r2+4], m4 ; i4x4_h satd
2373
movd [r2+8], m5 ; i4x4_dc satd
2374
%if UNIX64 == 0
2375
ADD rsp, 16
2376
%endif
2377
RET
2378
2379
;-----------------------------------------------------------------------------
2380
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
2381
;-----------------------------------------------------------------------------
2382
cglobal intra_satd_x3_16x16, 0,5
2383
%assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
2384
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
2385
SUB rsp, stack_pad
2386
%define sums rsp+64 ; size 56
2387
%define top_1d rsp+32 ; size 32
2388
%define left_1d rsp ; size 32
2389
movifnidn r1, r1mp
2390
2391
pxor m7, m7
2392
mova [sums+ 0], m7
2393
mova [sums+ 8], m7
2394
mova [sums+16], m7
2395
%if HIGH_BIT_DEPTH
2396
mova [sums+24], m7
2397
mova [sums+32], m7
2398
mova [sums+40], m7
2399
mova [sums+48], m7
2400
%endif
2401
2402
; 1D hadamards
2403
mov r3d, 12
2404
movd m6, [pw_32]
2405
.loop_edge:
2406
SCALAR_HADAMARD left, r3, m0, m1
2407
SCALAR_HADAMARD top, r3, m1, m2, m3
2408
pavgw m0, m1
2409
paddw m6, m0
2410
sub r3d, 4
2411
jge .loop_edge
2412
psrlw m6, 2
2413
pand m6, [sw_f0] ; dc
2414
2415
; 2D hadamards
2416
movifnidn r0, r0mp
2417
mov r3, -4
2418
.loop_y:
2419
mov r4, -4
2420
.loop_x:
2421
call hadamard_load
2422
2423
SUM3x4
2424
SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
2425
pavgw m4, m7
2426
pavgw m5, m7
2427
paddw m0, [sums+ 0] ; i16x16_v satd
2428
paddw m4, [sums+ 8] ; i16x16_h satd
2429
paddw m5, [sums+16] ; i16x16_dc satd
2430
mova [sums+ 0], m0
2431
mova [sums+ 8], m4
2432
mova [sums+16], m5
2433
2434
add r0, 4*SIZEOF_PIXEL
2435
inc r4
2436
jl .loop_x
2437
%if HIGH_BIT_DEPTH
2438
psrld m7, m4, 16
2439
pslld m4, 16
2440
psrld m4, 16
2441
paddd m4, m7
2442
psrld m7, m0, 16
2443
pslld m0, 16
2444
psrld m0, 16
2445
paddd m0, m7
2446
paddd m4, [sums+32]
2447
paddd m0, [sums+24]
2448
mova [sums+32], m4
2449
mova [sums+24], m0
2450
pxor m7, m7
2451
punpckhwd m3, m5, m7
2452
punpcklwd m5, m7
2453
paddd m3, [sums+48]
2454
paddd m5, [sums+40]
2455
mova [sums+48], m3
2456
mova [sums+40], m5
2457
mova [sums+ 0], m7
2458
mova [sums+ 8], m7
2459
mova [sums+16], m7
2460
%endif
2461
add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
2462
inc r3
2463
jl .loop_y
2464
2465
; horizontal sum
2466
movifnidn r2, r2mp
2467
%if HIGH_BIT_DEPTH
2468
mova m1, m5
2469
paddd m5, m3
2470
HADDD m5, m7 ; DC satd
2471
HADDD m4, m7 ; H satd
2472
HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
2473
psrld m0, 1
2474
psrlq m1, 32 ; DC[1]
2475
paddd m0, m3 ; DC[2]
2476
psrlq m3, 32 ; DC[3]
2477
paddd m0, m1
2478
paddd m0, m3
2479
%else
2480
mova m7, m5
2481
SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
2482
psrld m0, 1
2483
pslld m7, 16
2484
psrld m7, 16
2485
paddd m0, m5
2486
psubd m0, m7
2487
%endif
2488
movd [r2+8], m5 ; i16x16_dc satd
2489
movd [r2+4], m4 ; i16x16_h satd
2490
movd [r2+0], m0 ; i16x16_v satd
2491
ADD rsp, stack_pad
2492
RET
2493
2494
%if ARCH_X86_64
2495
%define t0 r6
2496
%else
2497
%define t0 r2
2498
%endif
2499
2500
;-----------------------------------------------------------------------------
2501
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
2502
;-----------------------------------------------------------------------------
2503
cglobal intra_satd_x3_8x8c, 0,6
2504
; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
2505
SUB rsp, 72
2506
%define sums rsp+48 ; size 24
2507
%define dc_1d rsp+32 ; size 16
2508
%define top_1d rsp+16 ; size 16
2509
%define left_1d rsp ; size 16
2510
movifnidn r1, r1mp
2511
pxor m7, m7
2512
mova [sums+ 0], m7
2513
mova [sums+ 8], m7
2514
mova [sums+16], m7
2515
2516
; 1D hadamards
2517
mov r3d, 4
2518
.loop_edge:
2519
SCALAR_HADAMARD left, r3, m0, m1
2520
SCALAR_HADAMARD top, r3, m0, m1, m2
2521
sub r3d, 4
2522
jge .loop_edge
2523
2524
; dc
2525
movzx t0d, word [left_1d+0]
2526
movzx r3d, word [top_1d+0]
2527
movzx r4d, word [left_1d+8]
2528
movzx r5d, word [top_1d+8]
2529
lea t0d, [t0 + r3 + 16]
2530
lea r3d, [r4 + r5 + 16]
2531
shr t0d, 1
2532
shr r3d, 1
2533
add r4d, 8
2534
add r5d, 8
2535
and t0d, -16 ; tl
2536
and r3d, -16 ; br
2537
and r4d, -16 ; bl
2538
and r5d, -16 ; tr
2539
mov [dc_1d+ 0], t0d ; tl
2540
mov [dc_1d+ 4], r5d ; tr
2541
mov [dc_1d+ 8], r4d ; bl
2542
mov [dc_1d+12], r3d ; br
2543
lea r5, [dc_1d]
2544
2545
; 2D hadamards
2546
movifnidn r0, r0mp
2547
movifnidn r2, r2mp
2548
mov r3, -2
2549
.loop_y:
2550
mov r4, -2
2551
.loop_x:
2552
call hadamard_load
2553
2554
SUM3x4
2555
SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
2556
pavgw m4, m7
2557
pavgw m5, m7
2558
paddw m0, [sums+16] ; i4x4_v satd
2559
paddw m4, [sums+8] ; i4x4_h satd
2560
paddw m5, [sums+0] ; i4x4_dc satd
2561
movq [sums+16], m0
2562
movq [sums+8], m4
2563
movq [sums+0], m5
2564
2565
add r0, 4*SIZEOF_PIXEL
2566
inc r4
2567
jl .loop_x
2568
add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
2569
add r5, 8
2570
inc r3
2571
jl .loop_y
2572
2573
; horizontal sum
2574
movq m0, [sums+0]
2575
movq m1, [sums+8]
2576
movq m2, [sums+16]
2577
movq m7, m0
2578
%if HIGH_BIT_DEPTH
2579
psrlq m7, 16
2580
HADDW m7, m3
2581
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
2582
psrld m2, 1
2583
paddd m2, m7
2584
%else
2585
psrlq m7, 15
2586
paddw m2, m7
2587
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
2588
psrld m2, 1
2589
%endif
2590
movd [r2+0], m0 ; i8x8c_dc satd
2591
movd [r2+4], m1 ; i8x8c_h satd
2592
movd [r2+8], m2 ; i8x8c_v satd
2593
ADD rsp, 72
2594
RET
2595
%endmacro ; INTRA_X3_MMX
2596
2597
2598
2599
%macro PRED4x4_LOWPASS 5
2600
%ifid %5
2601
pavgb %5, %2, %3
2602
pxor %3, %2
2603
pand %3, [pb_1]
2604
psubusb %5, %3
2605
pavgb %1, %4, %5
2606
%else
2607
mova %5, %2
2608
pavgb %2, %3
2609
pxor %3, %5
2610
pand %3, [pb_1]
2611
psubusb %2, %3
2612
pavgb %1, %4, %2
2613
%endif
2614
%endmacro
2615
2616
%macro INTRA_X9_PRED 2
2617
%if cpuflag(sse4)
2618
movu m1, [r1-1*FDEC_STRIDE-8]
2619
pinsrb m1, [r1+3*FDEC_STRIDE-1], 0
2620
pinsrb m1, [r1+2*FDEC_STRIDE-1], 1
2621
pinsrb m1, [r1+1*FDEC_STRIDE-1], 2
2622
pinsrb m1, [r1+0*FDEC_STRIDE-1], 3
2623
%else
2624
movd mm0, [r1+3*FDEC_STRIDE-4]
2625
punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
2626
movd mm1, [r1+1*FDEC_STRIDE-4]
2627
punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
2628
punpckhwd mm0, mm1
2629
psrlq mm0, 32
2630
movq2dq m0, mm0
2631
movu m1, [r1-1*FDEC_STRIDE-8]
2632
movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
2633
%endif ; cpuflag
2634
pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
2635
psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
2636
psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
2637
pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
2638
mova %2, m1
2639
PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
2640
; ddl ddr
2641
; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
2642
; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
2643
; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
2644
; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
2645
pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
2646
pshufb m3, m0, [%1_ddlr2] ; rows 2,3
2647
; hd hu
2648
; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
2649
; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
2650
; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
2651
; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
2652
pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
2653
palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
2654
pshufb m6, m7, [%1_hdu1]
2655
pshufb m7, m7, [%1_hdu2]
2656
; vr vl
2657
; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
2658
; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
2659
; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
2660
; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
2661
psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
2662
palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
2663
pshufb m4, m5, [%1_vrl1]
2664
pshufb m5, m5, [%1_vrl2]
2665
%endmacro ; INTRA_X9_PRED
2666
2667
%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
2668
pshufb m2, m%1, [intrax9b_vh1]
2669
pshufb m3, m%1, [intrax9b_vh2]
2670
mova [pred_buf+0x60], m2
2671
mova [pred_buf+0x70], m3
2672
pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
2673
pmaddubsw m%1, [hmul_4p]
2674
pshufhw m0, m%1, q2301
2675
pshuflw m0, m0, q2301
2676
psignw m%1, [pw_pmpmpmpm]
2677
paddw m0, m%1
2678
psllw m0, 2 ; hadamard(top), hadamard(left)
2679
MOVHL m3, m0
2680
pshufb m1, m0, [intrax9b_v1]
2681
pshufb m2, m0, [intrax9b_v2]
2682
paddw m0, m3
2683
psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
2684
pavgw m0, [pw_16]
2685
pand m0, [sw_f0] ; dc
2686
; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
2687
; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
2688
; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
2689
HADAMARD 0, sumsub, %2, %3, %4, %5
2690
HADAMARD 1, sumsub, %2, %3, %4, %5
2691
movd r3d, m0
2692
shr r3d, 4
2693
imul r3d, 0x01010101
2694
mov [pred_buf+0x80], r3d
2695
mov [pred_buf+0x88], r3d
2696
mov [pred_buf+0x90], r3d
2697
mov [pred_buf+0x98], r3d
2698
psubw m3, m%2
2699
psubw m0, m%2
2700
psubw m1, m%2
2701
psubw m2, m%3
2702
pabsw m%3, m%3
2703
pabsw m3, m3
2704
pabsw m0, m0
2705
pabsw m1, m1
2706
pabsw m2, m2
2707
pavgw m3, m%3
2708
pavgw m0, m%3
2709
pavgw m1, m2
2710
%if cpuflag(sse4)
2711
phaddw m3, m0
2712
%else
2713
SBUTTERFLY qdq, 3, 0, 2
2714
paddw m3, m0
2715
%endif
2716
MOVHL m2, m1
2717
paddw m1, m2
2718
%if cpuflag(xop)
2719
vphaddwq m3, m3
2720
vphaddwq m1, m1
2721
packssdw m1, m3
2722
%else
2723
phaddw m1, m3
2724
pmaddwd m1, [pw_1] ; v, _, h, dc
2725
%endif
2726
%endmacro ; INTRA_X9_VHDC
2727
2728
%macro INTRA_X9_END 2
2729
%if cpuflag(sse4)
2730
phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
2731
movd eax, m0
2732
add eax, 1<<16
2733
cmp ax, r3w
2734
cmovge eax, r3d
2735
%else
2736
%if %1
2737
; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
2738
psllw m0, 3
2739
paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
2740
%else
2741
; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
2742
psllw m0, 2
2743
paddusw m0, m0
2744
paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
2745
%endif
2746
movhlps m1, m0
2747
pminsw m0, m1
2748
pshuflw m1, m0, q0032
2749
pminsw m0, m1
2750
pshuflw m1, m0, q0001
2751
pminsw m0, m1
2752
movd eax, m0
2753
movsx r2d, ax
2754
and eax, 7
2755
sar r2d, 3
2756
shl eax, 16
2757
; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
2758
; 1<<12: undo sign manipulation
2759
lea eax, [rax+r2+(1<<16)+(1<<12)]
2760
cmp ax, r3w
2761
cmovge eax, r3d
2762
%endif ; cpuflag
2763
2764
; output the predicted samples
2765
mov r3d, eax
2766
shr r3d, 16
2767
%ifdef PIC
2768
lea r2, [%2_lut]
2769
movzx r2d, byte [r2+r3]
2770
%else
2771
movzx r2d, byte [%2_lut+r3]
2772
%endif
2773
%if %1 ; sad
2774
movq mm0, [pred_buf+r2]
2775
movq mm1, [pred_buf+r2+16]
2776
movd [r1+0*FDEC_STRIDE], mm0
2777
movd [r1+2*FDEC_STRIDE], mm1
2778
psrlq mm0, 32
2779
psrlq mm1, 32
2780
movd [r1+1*FDEC_STRIDE], mm0
2781
movd [r1+3*FDEC_STRIDE], mm1
2782
%else ; satd
2783
%assign i 0
2784
%rep 4
2785
mov r3d, [pred_buf+r2+8*i]
2786
mov [r1+i*FDEC_STRIDE], r3d
2787
%assign i i+1
2788
%endrep
2789
%endif
2790
%endmacro ; INTRA_X9_END
2791
2792
%macro INTRA_X9 0
2793
;-----------------------------------------------------------------------------
2794
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
2795
;-----------------------------------------------------------------------------
2796
%if notcpuflag(xop)
2797
cglobal intra_sad_x9_4x4, 3,4,9
2798
%assign pad 0xc0-gprsize-(stack_offset&15)
2799
%define pred_buf rsp
2800
sub rsp, pad
2801
%if ARCH_X86_64
2802
INTRA_X9_PRED intrax9a, m8
2803
%else
2804
INTRA_X9_PRED intrax9a, [rsp+0xa0]
2805
%endif
2806
mova [rsp+0x00], m2
2807
mova [rsp+0x10], m3
2808
mova [rsp+0x20], m4
2809
mova [rsp+0x30], m5
2810
mova [rsp+0x40], m6
2811
mova [rsp+0x50], m7
2812
%if cpuflag(sse4)
2813
movd m0, [r0+0*FENC_STRIDE]
2814
pinsrd m0, [r0+1*FENC_STRIDE], 1
2815
movd m1, [r0+2*FENC_STRIDE]
2816
pinsrd m1, [r0+3*FENC_STRIDE], 1
2817
%else
2818
movd mm0, [r0+0*FENC_STRIDE]
2819
punpckldq mm0, [r0+1*FENC_STRIDE]
2820
movd mm1, [r0+2*FENC_STRIDE]
2821
punpckldq mm1, [r0+3*FENC_STRIDE]
2822
movq2dq m0, mm0
2823
movq2dq m1, mm1
2824
%endif
2825
punpcklqdq m0, m0
2826
punpcklqdq m1, m1
2827
psadbw m2, m0
2828
psadbw m3, m1
2829
psadbw m4, m0
2830
psadbw m5, m1
2831
psadbw m6, m0
2832
psadbw m7, m1
2833
paddd m2, m3
2834
paddd m4, m5
2835
paddd m6, m7
2836
%if ARCH_X86_64
2837
SWAP 7, 8
2838
pxor m8, m8
2839
%define %%zero m8
2840
%else
2841
mova m7, [rsp+0xa0]
2842
%define %%zero [pb_0]
2843
%endif
2844
pshufb m3, m7, [intrax9a_vh1]
2845
pshufb m5, m7, [intrax9a_vh2]
2846
pshufb m7, [intrax9a_dc]
2847
psadbw m7, %%zero
2848
psrlw m7, 2
2849
mova [rsp+0x60], m3
2850
mova [rsp+0x70], m5
2851
psadbw m3, m0
2852
pavgw m7, %%zero
2853
pshufb m7, %%zero
2854
psadbw m5, m1
2855
movq [rsp+0x80], m7
2856
movq [rsp+0x90], m7
2857
psadbw m0, m7
2858
paddd m3, m5
2859
psadbw m1, m7
2860
paddd m0, m1
2861
movzx r3d, word [r2]
2862
movd r0d, m3 ; v
2863
add r3d, r0d
2864
punpckhqdq m3, m0 ; h, dc
2865
shufps m3, m2, q2020
2866
psllq m6, 32
2867
por m4, m6
2868
movu m0, [r2+2]
2869
packssdw m3, m4
2870
paddw m0, m3
2871
INTRA_X9_END 1, intrax9a
2872
add rsp, pad
2873
RET
2874
%endif ; cpuflag
2875
2876
%if ARCH_X86_64
2877
;-----------------------------------------------------------------------------
2878
; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
2879
;-----------------------------------------------------------------------------
2880
cglobal intra_satd_x9_4x4, 3,4,16
2881
%assign pad 0xb0-gprsize-(stack_offset&15)
2882
%define pred_buf rsp
2883
sub rsp, pad
2884
INTRA_X9_PRED intrax9b, m15
2885
mova [rsp+0x00], m2
2886
mova [rsp+0x10], m3
2887
mova [rsp+0x20], m4
2888
mova [rsp+0x30], m5
2889
mova [rsp+0x40], m6
2890
mova [rsp+0x50], m7
2891
movd m8, [r0+0*FENC_STRIDE]
2892
movd m9, [r0+1*FENC_STRIDE]
2893
movd m10, [r0+2*FENC_STRIDE]
2894
movd m11, [r0+3*FENC_STRIDE]
2895
mova m12, [hmul_8p]
2896
pshufd m8, m8, 0
2897
pshufd m9, m9, 0
2898
pshufd m10, m10, 0
2899
pshufd m11, m11, 0
2900
pmaddubsw m8, m12
2901
pmaddubsw m9, m12
2902
pmaddubsw m10, m12
2903
pmaddubsw m11, m12
2904
movddup m0, m2
2905
pshufd m1, m2, q3232
2906
movddup m2, m3
2907
punpckhqdq m3, m3
2908
call .satd_8x4 ; ddr, ddl
2909
movddup m2, m5
2910
pshufd m3, m5, q3232
2911
mova m5, m0
2912
movddup m0, m4
2913
pshufd m1, m4, q3232
2914
call .satd_8x4 ; vr, vl
2915
movddup m2, m7
2916
pshufd m3, m7, q3232
2917
mova m4, m0
2918
movddup m0, m6
2919
pshufd m1, m6, q3232
2920
call .satd_8x4 ; hd, hu
2921
%if cpuflag(sse4)
2922
punpckldq m4, m0
2923
%else
2924
punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
2925
%endif
2926
mova m1, [pw_ppmmppmm]
2927
psignw m8, m1
2928
psignw m10, m1
2929
paddw m8, m9
2930
paddw m10, m11
2931
INTRA_X9_VHDC 15, 8, 10, 6, 7
2932
; find minimum
2933
movu m0, [r2+2]
2934
movd r3d, m1
2935
palignr m5, m1, 8
2936
%if notcpuflag(sse4)
2937
pshufhw m0, m0, q3120 ; compensate for different order in unpack
2938
%endif
2939
packssdw m5, m4
2940
paddw m0, m5
2941
movzx r0d, word [r2]
2942
add r3d, r0d
2943
INTRA_X9_END 0, intrax9b
2944
add rsp, pad
2945
RET
2946
RESET_MM_PERMUTATION
2947
ALIGN 16
2948
.satd_8x4:
2949
pmaddubsw m0, m12
2950
pmaddubsw m1, m12
2951
pmaddubsw m2, m12
2952
pmaddubsw m3, m12
2953
psubw m0, m8
2954
psubw m1, m9
2955
psubw m2, m10
2956
psubw m3, m11
2957
SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
2958
pmaddwd m0, [pw_1]
2959
MOVHL m1, m0
2960
paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
2961
ret
2962
2963
%else ; !ARCH_X86_64
2964
cglobal intra_satd_x9_4x4, 3,4,8
2965
%assign pad 0x120-gprsize-(stack_offset&15)
2966
%define fenc_buf rsp
2967
%define pred_buf rsp+0x40
2968
%define spill rsp+0xe0
2969
sub rsp, pad
2970
INTRA_X9_PRED intrax9b, [spill+0x20]
2971
mova [pred_buf+0x00], m2
2972
mova [pred_buf+0x10], m3
2973
mova [pred_buf+0x20], m4
2974
mova [pred_buf+0x30], m5
2975
mova [pred_buf+0x40], m6
2976
mova [pred_buf+0x50], m7
2977
movd m4, [r0+0*FENC_STRIDE]
2978
movd m5, [r0+1*FENC_STRIDE]
2979
movd m6, [r0+2*FENC_STRIDE]
2980
movd m0, [r0+3*FENC_STRIDE]
2981
mova m7, [hmul_8p]
2982
pshufd m4, m4, 0
2983
pshufd m5, m5, 0
2984
pshufd m6, m6, 0
2985
pshufd m0, m0, 0
2986
pmaddubsw m4, m7
2987
pmaddubsw m5, m7
2988
pmaddubsw m6, m7
2989
pmaddubsw m0, m7
2990
mova [fenc_buf+0x00], m4
2991
mova [fenc_buf+0x10], m5
2992
mova [fenc_buf+0x20], m6
2993
mova [fenc_buf+0x30], m0
2994
movddup m0, m2
2995
pshufd m1, m2, q3232
2996
movddup m2, m3
2997
punpckhqdq m3, m3
2998
pmaddubsw m0, m7
2999
pmaddubsw m1, m7
3000
pmaddubsw m2, m7
3001
pmaddubsw m3, m7
3002
psubw m0, m4
3003
psubw m1, m5
3004
psubw m2, m6
3005
call .satd_8x4b ; ddr, ddl
3006
mova m3, [pred_buf+0x30]
3007
mova m1, [pred_buf+0x20]
3008
movddup m2, m3
3009
punpckhqdq m3, m3
3010
movq [spill+0x08], m0
3011
movddup m0, m1
3012
punpckhqdq m1, m1
3013
call .satd_8x4 ; vr, vl
3014
mova m3, [pred_buf+0x50]
3015
mova m1, [pred_buf+0x40]
3016
movddup m2, m3
3017
punpckhqdq m3, m3
3018
movq [spill+0x10], m0
3019
movddup m0, m1
3020
punpckhqdq m1, m1
3021
call .satd_8x4 ; hd, hu
3022
movq [spill+0x18], m0
3023
mova m1, [spill+0x20]
3024
mova m4, [fenc_buf+0x00]
3025
mova m5, [fenc_buf+0x20]
3026
mova m2, [pw_ppmmppmm]
3027
psignw m4, m2
3028
psignw m5, m2
3029
paddw m4, [fenc_buf+0x10]
3030
paddw m5, [fenc_buf+0x30]
3031
INTRA_X9_VHDC 1, 4, 5, 6, 7
3032
; find minimum
3033
movu m0, [r2+2]
3034
movd r3d, m1
3035
punpckhqdq m1, [spill+0x00]
3036
packssdw m1, [spill+0x10]
3037
%if cpuflag(sse4)
3038
pshufhw m1, m1, q3120
3039
%else
3040
pshufhw m0, m0, q3120
3041
%endif
3042
paddw m0, m1
3043
movzx r0d, word [r2]
3044
add r3d, r0d
3045
INTRA_X9_END 0, intrax9b
3046
add rsp, pad
3047
RET
3048
RESET_MM_PERMUTATION
3049
ALIGN 16
3050
.satd_8x4:
3051
pmaddubsw m0, m7
3052
pmaddubsw m1, m7
3053
pmaddubsw m2, m7
3054
pmaddubsw m3, m7
3055
%xdefine fenc_buf fenc_buf+gprsize
3056
psubw m0, [fenc_buf+0x00]
3057
psubw m1, [fenc_buf+0x10]
3058
psubw m2, [fenc_buf+0x20]
3059
.satd_8x4b:
3060
psubw m3, [fenc_buf+0x30]
3061
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
3062
pmaddwd m0, [pw_1]
3063
MOVHL m1, m0
3064
paddd xmm0, m0, m1
3065
ret
3066
%endif ; ARCH
3067
%endmacro ; INTRA_X9
3068
3069
%macro INTRA8_X9 0
3070
;-----------------------------------------------------------------------------
3071
; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
3072
;-----------------------------------------------------------------------------
3073
cglobal intra_sad_x9_8x8, 5,6,9
3074
%define fenc02 m4
3075
%define fenc13 m5
3076
%define fenc46 m6
3077
%define fenc57 m7
3078
%if ARCH_X86_64
3079
%define tmp m8
3080
%assign padbase 0x0
3081
%else
3082
%define tmp [rsp]
3083
%assign padbase 0x10
3084
%endif
3085
%assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
3086
%define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
3087
3088
SUB rsp, pad
3089
movq fenc02, [r0+FENC_STRIDE* 0]
3090
movq fenc13, [r0+FENC_STRIDE* 1]
3091
movq fenc46, [r0+FENC_STRIDE* 4]
3092
movq fenc57, [r0+FENC_STRIDE* 5]
3093
movhps fenc02, [r0+FENC_STRIDE* 2]
3094
movhps fenc13, [r0+FENC_STRIDE* 3]
3095
movhps fenc46, [r0+FENC_STRIDE* 6]
3096
movhps fenc57, [r0+FENC_STRIDE* 7]
3097
3098
; save instruction size: avoid 4-byte memory offsets
3099
lea r0, [intra8x9_h1+128]
3100
%define off(m) (r0+m-(intra8x9_h1+128))
3101
3102
; v
3103
movddup m0, [r2+16]
3104
mova pred(0,0), m0
3105
psadbw m1, m0, fenc02
3106
mova pred(0,1), m0
3107
psadbw m2, m0, fenc13
3108
mova pred(0,2), m0
3109
psadbw m3, m0, fenc46
3110
mova pred(0,3), m0
3111
psadbw m0, m0, fenc57
3112
paddw m1, m2
3113
paddw m0, m3
3114
paddw m0, m1
3115
MOVHL m1, m0
3116
paddw m0, m1
3117
movd [r4+0], m0
3118
3119
; h
3120
movq m0, [r2+7]
3121
pshufb m1, m0, [off(intra8x9_h1)]
3122
pshufb m2, m0, [off(intra8x9_h2)]
3123
mova pred(1,0), m1
3124
psadbw m1, fenc02
3125
mova pred(1,1), m2
3126
psadbw m2, fenc13
3127
paddw m1, m2
3128
pshufb m3, m0, [off(intra8x9_h3)]
3129
pshufb m2, m0, [off(intra8x9_h4)]
3130
mova pred(1,2), m3
3131
psadbw m3, fenc46
3132
mova pred(1,3), m2
3133
psadbw m2, fenc57
3134
paddw m1, m3
3135
paddw m1, m2
3136
MOVHL m2, m1
3137
paddw m1, m2
3138
movd [r4+2], m1
3139
3140
lea r5, [rsp+padbase+0x100]
3141
%define pred(i,j) [r5+i*0x40+j*0x10-0x100]
3142
3143
; dc
3144
movhps m0, [r2+16]
3145
pxor m2, m2
3146
psadbw m0, m2
3147
MOVHL m1, m0
3148
paddw m0, m1
3149
psrlw m0, 3
3150
pavgw m0, m2
3151
pshufb m0, m2
3152
mova pred(2,0), m0
3153
psadbw m1, m0, fenc02
3154
mova pred(2,1), m0
3155
psadbw m2, m0, fenc13
3156
mova pred(2,2), m0
3157
psadbw m3, m0, fenc46
3158
mova pred(2,3), m0
3159
psadbw m0, m0, fenc57
3160
paddw m1, m2
3161
paddw m0, m3
3162
paddw m0, m1
3163
MOVHL m1, m0
3164
paddw m0, m1
3165
movd [r4+4], m0
3166
3167
; ddl
3168
; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
3169
; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
3170
; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
3171
; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
3172
; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
3173
; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
3174
; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
3175
; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
3176
mova m0, [r2+16]
3177
movu m2, [r2+17]
3178
pslldq m1, m0, 1
3179
pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
3180
PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
3181
pshufb m1, m0, [off(intra8x9_ddl1)]
3182
pshufb m2, m0, [off(intra8x9_ddl2)]
3183
mova pred(3,0), m1
3184
psadbw m1, fenc02
3185
mova pred(3,1), m2
3186
psadbw m2, fenc13
3187
paddw m1, m2
3188
pshufb m2, m0, [off(intra8x9_ddl3)]
3189
mova pred(3,2), m2
3190
psadbw m2, fenc46
3191
paddw m1, m2
3192
pshufb m2, m0, [off(intra8x9_ddl4)]
3193
mova pred(3,3), m2
3194
psadbw m2, fenc57
3195
paddw m1, m2
3196
MOVHL m2, m1
3197
paddw m1, m2
3198
movd [r4+6], m1
3199
3200
; vl
3201
; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
3202
; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
3203
; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
3204
; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
3205
; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
3206
; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
3207
; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
3208
; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
3209
pshufb m1, m3, [off(intra8x9_vl1)]
3210
pshufb m2, m0, [off(intra8x9_vl2)]
3211
pshufb m3, m3, [off(intra8x9_vl3)]
3212
pshufb m0, m0, [off(intra8x9_vl4)]
3213
mova pred(7,0), m1
3214
psadbw m1, fenc02
3215
mova pred(7,1), m2
3216
psadbw m2, fenc13
3217
mova pred(7,2), m3
3218
psadbw m3, fenc46
3219
mova pred(7,3), m0
3220
psadbw m0, fenc57
3221
paddw m1, m2
3222
paddw m0, m3
3223
paddw m0, m1
3224
MOVHL m1, m0
3225
paddw m0, m1
3226
%if cpuflag(sse4)
3227
pextrw [r4+14], m0, 0
3228
%else
3229
movd r5d, m0
3230
mov [r4+14], r5w
3231
lea r5, [rsp+padbase+0x100]
3232
%endif
3233
3234
; ddr
3235
; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3236
; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3237
; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
3238
; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
3239
; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
3240
; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
3241
; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
3242
; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
3243
movu m2, [r2+8]
3244
movu m0, [r2+7]
3245
movu m1, [r2+6]
3246
pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3247
PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3248
pshufb m1, m0, [off(intra8x9_ddr1)]
3249
pshufb m2, m0, [off(intra8x9_ddr2)]
3250
mova pred(4,0), m1
3251
psadbw m1, fenc02
3252
mova pred(4,1), m2
3253
psadbw m2, fenc13
3254
paddw m1, m2
3255
pshufb m2, m0, [off(intra8x9_ddr3)]
3256
mova pred(4,2), m2
3257
psadbw m2, fenc46
3258
paddw m1, m2
3259
pshufb m2, m0, [off(intra8x9_ddr4)]
3260
mova pred(4,3), m2
3261
psadbw m2, fenc57
3262
paddw m1, m2
3263
MOVHL m2, m1
3264
paddw m1, m2
3265
movd [r4+8], m1
3266
3267
add r0, 256
3268
add r5, 0xC0
3269
%define off(m) (r0+m-(intra8x9_h1+256+128))
3270
%define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
3271
3272
; vr
3273
; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3274
; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
3275
; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
3276
; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3277
; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
3278
; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
3279
; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
3280
; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
3281
movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
3282
pshufb m1, m2, [off(intra8x9_vr1)]
3283
pshufb m2, m2, [off(intra8x9_vr3)]
3284
mova pred(5,0), m1
3285
psadbw m1, fenc02
3286
mova pred(5,2), m2
3287
psadbw m2, fenc46
3288
paddw m1, m2
3289
pshufb m2, m0, [off(intra8x9_vr2)]
3290
mova pred(5,1), m2
3291
psadbw m2, fenc13
3292
paddw m1, m2
3293
pshufb m2, m0, [off(intra8x9_vr4)]
3294
mova pred(5,3), m2
3295
psadbw m2, fenc57
3296
paddw m1, m2
3297
MOVHL m2, m1
3298
paddw m1, m2
3299
movd [r4+10], m1
3300
3301
; hd
3302
; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
3303
; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
3304
; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
3305
; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
3306
; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
3307
; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
3308
; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
3309
; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
3310
pshufd m2, m3, q0001
3311
%if cpuflag(sse4)
3312
pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
3313
%else
3314
movss m1, m0, m2
3315
SWAP 1, 2
3316
%endif
3317
punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
3318
pshufb m1, m2, [off(intra8x9_hd1)]
3319
pshufb m2, m2, [off(intra8x9_hd2)]
3320
mova pred(6,0), m1
3321
psadbw m1, fenc02
3322
mova pred(6,1), m2
3323
psadbw m2, fenc13
3324
paddw m1, m2
3325
pshufb m2, m0, [off(intra8x9_hd3)]
3326
pshufb m3, m0, [off(intra8x9_hd4)]
3327
mova pred(6,2), m2
3328
psadbw m2, fenc46
3329
mova pred(6,3), m3
3330
psadbw m3, fenc57
3331
paddw m1, m2
3332
paddw m1, m3
3333
MOVHL m2, m1
3334
paddw m1, m2
3335
; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
3336
pslldq m1, 12
3337
SWAP 3, 1
3338
3339
; hu
3340
; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
3341
; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
3342
; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6
3343
; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
3344
; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
3345
; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
3346
; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
3347
; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
3348
%if cpuflag(sse4)
3349
pinsrb m0, [r2+7], 15 ; Gl7
3350
%else
3351
movd m1, [r2+7]
3352
pslldq m0, 1
3353
palignr m1, m0, 1
3354
SWAP 0, 1
3355
%endif
3356
pshufb m1, m0, [off(intra8x9_hu1)]
3357
pshufb m2, m0, [off(intra8x9_hu2)]
3358
mova pred(8,0), m1
3359
psadbw m1, fenc02
3360
mova pred(8,1), m2
3361
psadbw m2, fenc13
3362
paddw m1, m2
3363
pshufb m2, m0, [off(intra8x9_hu3)]
3364
pshufb m0, m0, [off(intra8x9_hu4)]
3365
mova pred(8,2), m2
3366
psadbw m2, fenc46
3367
mova pred(8,3), m0
3368
psadbw m0, fenc57
3369
paddw m1, m2
3370
paddw m1, m0
3371
MOVHL m2, m1
3372
paddw m1, m2
3373
movd r2d, m1
3374
3375
movu m0, [r3]
3376
por m3, [r4]
3377
paddw m0, m3
3378
mova [r4], m0
3379
movzx r5d, word [r3+16]
3380
add r2d, r5d
3381
mov [r4+16], r2w
3382
3383
%if cpuflag(sse4)
3384
phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
3385
movd eax, m0
3386
%else
3387
; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
3388
paddusw m0, m0
3389
paddusw m0, m0
3390
paddw m0, [off(pw_s00112233)]
3391
MOVHL m1, m0
3392
pminsw m0, m1
3393
pshuflw m1, m0, q0032
3394
pminsw m0, m1
3395
movd eax, m0
3396
; repack with 3 bit index
3397
xor eax, 0x80008000
3398
movzx r3d, ax
3399
shr eax, 15
3400
add r3d, r3d
3401
or eax, 1
3402
cmp eax, r3d
3403
cmovg eax, r3d
3404
; reverse to phminposuw order
3405
mov r3d, eax
3406
and eax, 7
3407
shr r3d, 3
3408
shl eax, 16
3409
or eax, r3d
3410
%endif
3411
add r2d, 8<<16
3412
cmp ax, r2w
3413
cmovg eax, r2d
3414
3415
mov r2d, eax
3416
shr r2d, 16
3417
shl r2d, 6
3418
add r1, 4*FDEC_STRIDE
3419
mova m0, [rsp+padbase+r2+0x00]
3420
mova m1, [rsp+padbase+r2+0x10]
3421
mova m2, [rsp+padbase+r2+0x20]
3422
mova m3, [rsp+padbase+r2+0x30]
3423
movq [r1+FDEC_STRIDE*-4], m0
3424
movhps [r1+FDEC_STRIDE*-2], m0
3425
movq [r1+FDEC_STRIDE*-3], m1
3426
movhps [r1+FDEC_STRIDE*-1], m1
3427
movq [r1+FDEC_STRIDE* 0], m2
3428
movhps [r1+FDEC_STRIDE* 2], m2
3429
movq [r1+FDEC_STRIDE* 1], m3
3430
movhps [r1+FDEC_STRIDE* 3], m3
3431
ADD rsp, pad
3432
RET
3433
3434
%if ARCH_X86_64
3435
;-----------------------------------------------------------------------------
3436
; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
3437
;-----------------------------------------------------------------------------
3438
cglobal intra_sa8d_x9_8x8, 5,6,16
3439
%assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
3440
%define fenc_buf rsp
3441
%define pred_buf rsp+0x80
3442
SUB rsp, pad
3443
mova m15, [hmul_8p]
3444
pxor m8, m8
3445
%assign %%i 0
3446
%rep 8
3447
movddup m %+ %%i, [r0+%%i*FENC_STRIDE]
3448
pmaddubsw m9, m %+ %%i, m15
3449
punpcklbw m %+ %%i, m8
3450
mova [fenc_buf+%%i*0x10], m9
3451
%assign %%i %%i+1
3452
%endrep
3453
3454
; save instruction size: avoid 4-byte memory offsets
3455
lea r0, [intra8x9_h1+0x80]
3456
%define off(m) (r0+m-(intra8x9_h1+0x80))
3457
lea r5, [pred_buf+0x80]
3458
3459
; v, h, dc
3460
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
3461
pabsw m11, m1
3462
%assign %%i 2
3463
%rep 6
3464
pabsw m8, m %+ %%i
3465
paddw m11, m8
3466
%assign %%i %%i+1
3467
%endrep
3468
3469
; 1D hadamard of edges
3470
movq m8, [r2+7]
3471
movddup m9, [r2+16]
3472
mova [r5-0x80], m9
3473
mova [r5-0x70], m9
3474
mova [r5-0x60], m9
3475
mova [r5-0x50], m9
3476
punpcklwd m8, m8
3477
pshufb m9, [intrax3_shuf]
3478
pmaddubsw m8, [pb_pppm]
3479
pmaddubsw m9, [pb_pppm]
3480
HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
3481
HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
3482
3483
; dc
3484
paddw m10, m8, m9
3485
paddw m10, [pw_8]
3486
pand m10, [sw_f0]
3487
psrlw m12, m10, 4
3488
psllw m10, 2
3489
pxor m13, m13
3490
pshufb m12, m13
3491
mova [r5+0x00], m12
3492
mova [r5+0x10], m12
3493
mova [r5+0x20], m12
3494
mova [r5+0x30], m12
3495
3496
; differences
3497
psllw m8, 3 ; left edge
3498
psubw m8, m0
3499
psubw m10, m0
3500
pabsw m8, m8 ; 1x8 sum
3501
pabsw m10, m10
3502
paddw m8, m11
3503
paddw m11, m10
3504
punpcklwd m0, m1
3505
punpcklwd m2, m3
3506
punpcklwd m4, m5
3507
punpcklwd m6, m7
3508
punpckldq m0, m2
3509
punpckldq m4, m6
3510
punpcklqdq m0, m4 ; transpose
3511
psllw m9, 3 ; top edge
3512
psrldq m10, m11, 2 ; 8x7 sum
3513
psubw m0, m9 ; 8x1 sum
3514
pabsw m0, m0
3515
paddw m10, m0
3516
3517
phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow
3518
psrlw m11, 1
3519
psrlw m10, 1
3520
3521
; store h
3522
movq m3, [r2+7]
3523
pshufb m0, m3, [off(intra8x9_h1)]
3524
pshufb m1, m3, [off(intra8x9_h2)]
3525
pshufb m2, m3, [off(intra8x9_h3)]
3526
pshufb m3, m3, [off(intra8x9_h4)]
3527
mova [r5-0x40], m0
3528
mova [r5-0x30], m1
3529
mova [r5-0x20], m2
3530
mova [r5-0x10], m3
3531
3532
; ddl
3533
mova m8, [r2+16]
3534
movu m2, [r2+17]
3535
pslldq m1, m8, 1
3536
pavgb m9, m8, m2
3537
PRED4x4_LOWPASS m8, m1, m2, m8, m3
3538
pshufb m0, m8, [off(intra8x9_ddl1)]
3539
pshufb m1, m8, [off(intra8x9_ddl2)]
3540
pshufb m2, m8, [off(intra8x9_ddl3)]
3541
pshufb m3, m8, [off(intra8x9_ddl4)]
3542
add r5, 0x40
3543
call .sa8d
3544
phaddd m11, m0
3545
3546
; vl
3547
pshufb m0, m9, [off(intra8x9_vl1)]
3548
pshufb m1, m8, [off(intra8x9_vl2)]
3549
pshufb m2, m9, [off(intra8x9_vl3)]
3550
pshufb m3, m8, [off(intra8x9_vl4)]
3551
add r5, 0x100
3552
call .sa8d
3553
phaddd m10, m11
3554
mova m12, m0
3555
3556
; ddr
3557
movu m2, [r2+8]
3558
movu m8, [r2+7]
3559
movu m1, [r2+6]
3560
pavgb m9, m2, m8
3561
PRED4x4_LOWPASS m8, m1, m2, m8, m3
3562
pshufb m0, m8, [off(intra8x9_ddr1)]
3563
pshufb m1, m8, [off(intra8x9_ddr2)]
3564
pshufb m2, m8, [off(intra8x9_ddr3)]
3565
pshufb m3, m8, [off(intra8x9_ddr4)]
3566
sub r5, 0xc0
3567
call .sa8d
3568
mova m11, m0
3569
3570
add r0, 0x100
3571
%define off(m) (r0+m-(intra8x9_h1+0x180))
3572
3573
; vr
3574
movsd m2, m9, m8
3575
pshufb m0, m2, [off(intra8x9_vr1)]
3576
pshufb m1, m8, [off(intra8x9_vr2)]
3577
pshufb m2, m2, [off(intra8x9_vr3)]
3578
pshufb m3, m8, [off(intra8x9_vr4)]
3579
add r5, 0x40
3580
call .sa8d
3581
phaddd m11, m0
3582
3583
; hd
3584
%if cpuflag(sse4)
3585
pshufd m1, m9, q0001
3586
pblendw m1, m8, q3330
3587
%else
3588
pshufd m2, m9, q0001
3589
movss m1, m8, m2
3590
%endif
3591
punpcklbw m8, m9
3592
pshufb m0, m1, [off(intra8x9_hd1)]
3593
pshufb m1, m1, [off(intra8x9_hd2)]
3594
pshufb m2, m8, [off(intra8x9_hd3)]
3595
pshufb m3, m8, [off(intra8x9_hd4)]
3596
add r5, 0x40
3597
call .sa8d
3598
phaddd m0, m12
3599
phaddd m11, m0
3600
3601
; hu
3602
%if cpuflag(sse4)
3603
pinsrb m8, [r2+7], 15
3604
%else
3605
movd m9, [r2+7]
3606
pslldq m8, 1
3607
palignr m9, m8, 1
3608
SWAP 8, 9
3609
%endif
3610
pshufb m0, m8, [off(intra8x9_hu1)]
3611
pshufb m1, m8, [off(intra8x9_hu2)]
3612
pshufb m2, m8, [off(intra8x9_hu3)]
3613
pshufb m3, m8, [off(intra8x9_hu4)]
3614
add r5, 0x80
3615
call .sa8d
3616
3617
pmaddwd m0, [pw_1]
3618
phaddw m10, m11
3619
MOVHL m1, m0
3620
paddw m0, m1
3621
pshuflw m1, m0, q0032
3622
pavgw m0, m1
3623
pxor m2, m2
3624
pavgw m10, m2
3625
movd r2d, m0
3626
3627
movu m0, [r3]
3628
paddw m0, m10
3629
mova [r4], m0
3630
movzx r5d, word [r3+16]
3631
add r2d, r5d
3632
mov [r4+16], r2w
3633
3634
%if cpuflag(sse4)
3635
phminposuw m0, m0
3636
movd eax, m0
3637
%else
3638
; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
3639
paddusw m0, m0
3640
paddw m0, [off(pw_s00001111)]
3641
MOVHL m1, m0
3642
pminsw m0, m1
3643
pshuflw m1, m0, q0032
3644
mova m2, m0
3645
pminsw m0, m1
3646
pcmpgtw m2, m1 ; 2nd index bit
3647
movd r3d, m0
3648
movd r4d, m2
3649
; repack with 3 bit index
3650
xor r3d, 0x80008000
3651
and r4d, 0x00020002
3652
movzx eax, r3w
3653
movzx r5d, r4w
3654
shr r3d, 16
3655
shr r4d, 16
3656
lea eax, [rax*4+r5]
3657
lea r3d, [ r3*4+r4+1]
3658
cmp eax, r3d
3659
cmovg eax, r3d
3660
; reverse to phminposuw order
3661
mov r3d, eax
3662
and eax, 7
3663
shr r3d, 3
3664
shl eax, 16
3665
or eax, r3d
3666
%endif
3667
add r2d, 8<<16
3668
cmp ax, r2w
3669
cmovg eax, r2d
3670
3671
mov r2d, eax
3672
shr r2d, 16
3673
shl r2d, 6
3674
add r1, 4*FDEC_STRIDE
3675
mova m0, [pred_buf+r2+0x00]
3676
mova m1, [pred_buf+r2+0x10]
3677
mova m2, [pred_buf+r2+0x20]
3678
mova m3, [pred_buf+r2+0x30]
3679
movq [r1+FDEC_STRIDE*-4], m0
3680
movhps [r1+FDEC_STRIDE*-2], m0
3681
movq [r1+FDEC_STRIDE*-3], m1
3682
movhps [r1+FDEC_STRIDE*-1], m1
3683
movq [r1+FDEC_STRIDE* 0], m2
3684
movhps [r1+FDEC_STRIDE* 2], m2
3685
movq [r1+FDEC_STRIDE* 1], m3
3686
movhps [r1+FDEC_STRIDE* 3], m3
3687
ADD rsp, pad
3688
RET
3689
3690
ALIGN 16
3691
.sa8d:
3692
%xdefine mret m0
3693
%xdefine fenc_buf fenc_buf+gprsize
3694
mova [r5+0x00], m0
3695
mova [r5+0x10], m1
3696
mova [r5+0x20], m2
3697
mova [r5+0x30], m3
3698
movddup m4, m0
3699
movddup m5, m1
3700
movddup m6, m2
3701
movddup m7, m3
3702
punpckhqdq m0, m0
3703
punpckhqdq m1, m1
3704
punpckhqdq m2, m2
3705
punpckhqdq m3, m3
3706
PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
3707
pmaddubsw m0, m15
3708
pmaddubsw m1, m15
3709
psubw m0, [fenc_buf+0x00]
3710
psubw m1, [fenc_buf+0x10]
3711
pmaddubsw m2, m15
3712
pmaddubsw m3, m15
3713
psubw m2, [fenc_buf+0x20]
3714
psubw m3, [fenc_buf+0x30]
3715
pmaddubsw m4, m15
3716
pmaddubsw m5, m15
3717
psubw m4, [fenc_buf+0x40]
3718
psubw m5, [fenc_buf+0x50]
3719
pmaddubsw m6, m15
3720
pmaddubsw m7, m15
3721
psubw m6, [fenc_buf+0x60]
3722
psubw m7, [fenc_buf+0x70]
3723
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
3724
paddw m0, m1
3725
paddw m0, m2
3726
paddw mret, m0, m3
3727
ret
3728
%endif ; ARCH_X86_64
3729
%endmacro ; INTRA8_X9
3730
3731
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
3732
; out: [tmp]=hadamard4, m0=satd
3733
INIT_MMX mmx2
3734
cglobal hadamard_ac_4x4
3735
%if HIGH_BIT_DEPTH
3736
mova m0, [r0]
3737
mova m1, [r0+r1]
3738
mova m2, [r0+r1*2]
3739
mova m3, [r0+r2]
3740
%else ; !HIGH_BIT_DEPTH
3741
movh m0, [r0]
3742
movh m1, [r0+r1]
3743
movh m2, [r0+r1*2]
3744
movh m3, [r0+r2]
3745
punpcklbw m0, m7
3746
punpcklbw m1, m7
3747
punpcklbw m2, m7
3748
punpcklbw m3, m7
3749
%endif ; HIGH_BIT_DEPTH
3750
HADAMARD4_2D 0, 1, 2, 3, 4
3751
mova [r3], m0
3752
mova [r3+8], m1
3753
mova [r3+16], m2
3754
mova [r3+24], m3
3755
ABSW m0, m0, m4
3756
ABSW m1, m1, m4
3757
pand m0, m6
3758
ABSW m2, m2, m4
3759
ABSW m3, m3, m4
3760
paddw m0, m1
3761
paddw m2, m3
3762
paddw m0, m2
3763
SAVE_MM_PERMUTATION
3764
ret
3765
3766
cglobal hadamard_ac_2x2max
3767
mova m0, [r3+0x00]
3768
mova m1, [r3+0x20]
3769
mova m2, [r3+0x40]
3770
mova m3, [r3+0x60]
3771
sub r3, 8
3772
SUMSUB_BADC w, 0, 1, 2, 3, 4
3773
ABSW2 m0, m2, m0, m2, m4, m5
3774
ABSW2 m1, m3, m1, m3, m4, m5
3775
HADAMARD 0, max, 0, 2, 4, 5
3776
HADAMARD 0, max, 1, 3, 4, 5
3777
%if HIGH_BIT_DEPTH
3778
pmaddwd m0, m7
3779
pmaddwd m1, m7
3780
paddd m6, m0
3781
paddd m6, m1
3782
%else ; !HIGH_BIT_DEPTH
3783
paddw m7, m0
3784
paddw m7, m1
3785
%endif ; HIGH_BIT_DEPTH
3786
SAVE_MM_PERMUTATION
3787
ret
3788
3789
%macro AC_PREP 2
3790
%if HIGH_BIT_DEPTH
3791
pmaddwd %1, %2
3792
%endif
3793
%endmacro
3794
3795
%macro AC_PADD 3
3796
%if HIGH_BIT_DEPTH
3797
AC_PREP %2, %3
3798
paddd %1, %2
3799
%else
3800
paddw %1, %2
3801
%endif ; HIGH_BIT_DEPTH
3802
%endmacro
3803
3804
cglobal hadamard_ac_8x8
3805
mova m6, [mask_ac4]
3806
%if HIGH_BIT_DEPTH
3807
mova m7, [pw_1]
3808
%else
3809
pxor m7, m7
3810
%endif ; HIGH_BIT_DEPTH
3811
call hadamard_ac_4x4_mmx2
3812
add r0, 4*SIZEOF_PIXEL
3813
add r3, 32
3814
mova m5, m0
3815
AC_PREP m5, m7
3816
call hadamard_ac_4x4_mmx2
3817
lea r0, [r0+4*r1]
3818
add r3, 64
3819
AC_PADD m5, m0, m7
3820
call hadamard_ac_4x4_mmx2
3821
sub r0, 4*SIZEOF_PIXEL
3822
sub r3, 32
3823
AC_PADD m5, m0, m7
3824
call hadamard_ac_4x4_mmx2
3825
AC_PADD m5, m0, m7
3826
sub r3, 40
3827
mova [rsp+gprsize+8], m5 ; save satd
3828
%if HIGH_BIT_DEPTH
3829
pxor m6, m6
3830
%endif
3831
%rep 3
3832
call hadamard_ac_2x2max_mmx2
3833
%endrep
3834
mova m0, [r3+0x00]
3835
mova m1, [r3+0x20]
3836
mova m2, [r3+0x40]
3837
mova m3, [r3+0x60]
3838
SUMSUB_BADC w, 0, 1, 2, 3, 4
3839
HADAMARD 0, sumsub, 0, 2, 4, 5
3840
ABSW2 m1, m3, m1, m3, m4, m5
3841
ABSW2 m0, m2, m0, m2, m4, m5
3842
HADAMARD 0, max, 1, 3, 4, 5
3843
%if HIGH_BIT_DEPTH
3844
pand m0, [mask_ac4]
3845
pmaddwd m1, m7
3846
pmaddwd m0, m7
3847
pmaddwd m2, m7
3848
paddd m6, m1
3849
paddd m0, m2
3850
paddd m6, m6
3851
paddd m0, m6
3852
SWAP 0, 6
3853
%else ; !HIGH_BIT_DEPTH
3854
pand m6, m0
3855
paddw m7, m1
3856
paddw m6, m2
3857
paddw m7, m7
3858
paddw m6, m7
3859
%endif ; HIGH_BIT_DEPTH
3860
mova [rsp+gprsize], m6 ; save sa8d
3861
SWAP 0, 6
3862
SAVE_MM_PERMUTATION
3863
ret
3864
3865
%macro HADAMARD_AC_WXH_SUM_MMX 2
3866
mova m1, [rsp+1*mmsize]
3867
%if HIGH_BIT_DEPTH
3868
%if %1*%2 >= 128
3869
paddd m0, [rsp+2*mmsize]
3870
paddd m1, [rsp+3*mmsize]
3871
%endif
3872
%if %1*%2 == 256
3873
mova m2, [rsp+4*mmsize]
3874
paddd m1, [rsp+5*mmsize]
3875
paddd m2, [rsp+6*mmsize]
3876
mova m3, m0
3877
paddd m1, [rsp+7*mmsize]
3878
paddd m0, m2
3879
%endif
3880
psrld m0, 1
3881
HADDD m0, m2
3882
psrld m1, 1
3883
HADDD m1, m3
3884
%else ; !HIGH_BIT_DEPTH
3885
%if %1*%2 >= 128
3886
paddusw m0, [rsp+2*mmsize]
3887
paddusw m1, [rsp+3*mmsize]
3888
%endif
3889
%if %1*%2 == 256
3890
mova m2, [rsp+4*mmsize]
3891
paddusw m1, [rsp+5*mmsize]
3892
paddusw m2, [rsp+6*mmsize]
3893
mova m3, m0
3894
paddusw m1, [rsp+7*mmsize]
3895
pxor m3, m2
3896
pand m3, [pw_1]
3897
pavgw m0, m2
3898
psubusw m0, m3
3899
HADDUW m0, m2
3900
%else
3901
psrlw m0, 1
3902
HADDW m0, m2
3903
%endif
3904
psrlw m1, 1
3905
HADDW m1, m3
3906
%endif ; HIGH_BIT_DEPTH
3907
%endmacro
3908
3909
%macro HADAMARD_AC_WXH_MMX 2
3910
cglobal pixel_hadamard_ac_%1x%2, 2,4
3911
%assign pad 16-gprsize-(stack_offset&15)
3912
%define ysub r1
3913
FIX_STRIDES r1
3914
sub rsp, 16+128+pad
3915
lea r2, [r1*3]
3916
lea r3, [rsp+16]
3917
call hadamard_ac_8x8_mmx2
3918
%if %2==16
3919
%define ysub r2
3920
lea r0, [r0+r1*4]
3921
sub rsp, 16
3922
call hadamard_ac_8x8_mmx2
3923
%endif
3924
%if %1==16
3925
neg ysub
3926
sub rsp, 16
3927
lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
3928
neg ysub
3929
call hadamard_ac_8x8_mmx2
3930
%if %2==16
3931
lea r0, [r0+r1*4]
3932
sub rsp, 16
3933
call hadamard_ac_8x8_mmx2
3934
%endif
3935
%endif
3936
HADAMARD_AC_WXH_SUM_MMX %1, %2
3937
movd edx, m0
3938
movd eax, m1
3939
shr edx, 1
3940
%if ARCH_X86_64
3941
shl rdx, 32
3942
add rax, rdx
3943
%endif
3944
add rsp, 128+%1*%2/4+pad
3945
RET
3946
%endmacro ; HADAMARD_AC_WXH_MMX
3947
3948
HADAMARD_AC_WXH_MMX 16, 16
3949
HADAMARD_AC_WXH_MMX 8, 16
3950
HADAMARD_AC_WXH_MMX 16, 8
3951
HADAMARD_AC_WXH_MMX 8, 8
3952
3953
%macro LOAD_INC_8x4W_SSE2 5
3954
%if HIGH_BIT_DEPTH
3955
movu m%1, [r0]
3956
movu m%2, [r0+r1]
3957
movu m%3, [r0+r1*2]
3958
movu m%4, [r0+r2]
3959
%ifidn %1, 0
3960
lea r0, [r0+r1*4]
3961
%endif
3962
%else ; !HIGH_BIT_DEPTH
3963
movh m%1, [r0]
3964
movh m%2, [r0+r1]
3965
movh m%3, [r0+r1*2]
3966
movh m%4, [r0+r2]
3967
%ifidn %1, 0
3968
lea r0, [r0+r1*4]
3969
%endif
3970
punpcklbw m%1, m%5
3971
punpcklbw m%2, m%5
3972
punpcklbw m%3, m%5
3973
punpcklbw m%4, m%5
3974
%endif ; HIGH_BIT_DEPTH
3975
%endmacro
3976
3977
%macro LOAD_INC_8x4W_SSSE3 5
3978
LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
3979
%ifidn %1, 0
3980
lea r0, [r0+r1*4]
3981
%endif
3982
HSUMSUB %1, %2, %3, %4, %5
3983
%endmacro
3984
3985
%macro HADAMARD_AC_SSE2 0
3986
; in: r0=pix, r1=stride, r2=stride*3
3987
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
3988
cglobal hadamard_ac_8x8
3989
%if ARCH_X86_64
3990
%define spill0 m8
3991
%define spill1 m9
3992
%define spill2 m10
3993
%else
3994
%define spill0 [rsp+gprsize]
3995
%define spill1 [rsp+gprsize+mmsize]
3996
%define spill2 [rsp+gprsize+mmsize*2]
3997
%endif
3998
%if HIGH_BIT_DEPTH
3999
%define vertical 1
4000
%elif cpuflag(ssse3) && notcpuflag(atom)
4001
%define vertical 0
4002
;LOAD_INC loads sumsubs
4003
mova m7, [hmul_8p]
4004
%else
4005
%define vertical 1
4006
;LOAD_INC only unpacks to words
4007
pxor m7, m7
4008
%endif
4009
LOAD_INC_8x4W 0, 1, 2, 3, 7
4010
%if vertical
4011
HADAMARD4_2D_SSE 0, 1, 2, 3, 4
4012
%else
4013
HADAMARD4_V 0, 1, 2, 3, 4
4014
%endif
4015
mova spill0, m1
4016
SWAP 1, 7
4017
LOAD_INC_8x4W 4, 5, 6, 7, 1
4018
%if vertical
4019
HADAMARD4_2D_SSE 4, 5, 6, 7, 1
4020
%else
4021
HADAMARD4_V 4, 5, 6, 7, 1
4022
; FIXME SWAP
4023
mova m1, spill0
4024
mova spill0, m6
4025
mova spill1, m7
4026
HADAMARD 1, sumsub, 0, 1, 6, 7
4027
HADAMARD 1, sumsub, 2, 3, 6, 7
4028
mova m6, spill0
4029
mova m7, spill1
4030
mova spill0, m1
4031
mova spill1, m0
4032
HADAMARD 1, sumsub, 4, 5, 1, 0
4033
HADAMARD 1, sumsub, 6, 7, 1, 0
4034
mova m0, spill1
4035
%endif
4036
mova spill1, m2
4037
mova spill2, m3
4038
ABSW m1, m0, m0
4039
ABSW m2, m4, m4
4040
ABSW m3, m5, m5
4041
paddw m1, m2
4042
SUMSUB_BA w, 0, 4
4043
%if vertical
4044
pand m1, [mask_ac4]
4045
%else
4046
pand m1, [mask_ac4b]
4047
%endif
4048
AC_PREP m1, [pw_1]
4049
ABSW m2, spill0
4050
AC_PADD m1, m3, [pw_1]
4051
ABSW m3, spill1
4052
AC_PADD m1, m2, [pw_1]
4053
ABSW m2, spill2
4054
AC_PADD m1, m3, [pw_1]
4055
ABSW m3, m6, m6
4056
AC_PADD m1, m2, [pw_1]
4057
ABSW m2, m7, m7
4058
AC_PADD m1, m3, [pw_1]
4059
AC_PADD m1, m2, [pw_1]
4060
paddw m3, m7, spill2
4061
psubw m7, spill2
4062
mova [rsp+gprsize+mmsize*2], m1 ; save satd
4063
paddw m2, m6, spill1
4064
psubw m6, spill1
4065
paddw m1, m5, spill0
4066
psubw m5, spill0
4067
%assign %%x 2
4068
%if vertical
4069
%assign %%x 4
4070
%endif
4071
mova spill1, m4
4072
HADAMARD %%x, amax, 3, 7, 4
4073
HADAMARD %%x, amax, 2, 6, 7, 4
4074
mova m4, spill1
4075
HADAMARD %%x, amax, 1, 5, 6, 7
4076
HADAMARD %%x, sumsub, 0, 4, 5, 6
4077
AC_PREP m2, [pw_1]
4078
AC_PADD m2, m3, [pw_1]
4079
AC_PADD m2, m1, [pw_1]
4080
%if HIGH_BIT_DEPTH
4081
paddd m2, m2
4082
%else
4083
paddw m2, m2
4084
%endif ; HIGH_BIT_DEPTH
4085
ABSW m4, m4, m7
4086
pand m0, [mask_ac8]
4087
ABSW m0, m0, m7
4088
AC_PADD m2, m4, [pw_1]
4089
AC_PADD m2, m0, [pw_1]
4090
mova [rsp+gprsize+mmsize], m2 ; save sa8d
4091
SWAP 0, 2
4092
SAVE_MM_PERMUTATION
4093
ret
4094
4095
HADAMARD_AC_WXH_SSE2 16, 16
4096
HADAMARD_AC_WXH_SSE2 16, 8
4097
%if mmsize <= 16
4098
HADAMARD_AC_WXH_SSE2 8, 16
4099
HADAMARD_AC_WXH_SSE2 8, 8
4100
%endif
4101
%endmacro ; HADAMARD_AC_SSE2
4102
4103
%macro HADAMARD_AC_WXH_SUM_SSE2 2
4104
mova m1, [rsp+2*mmsize]
4105
%if HIGH_BIT_DEPTH
4106
%if %1*%2 >= 128
4107
paddd m0, [rsp+3*mmsize]
4108
paddd m1, [rsp+4*mmsize]
4109
%endif
4110
%if %1*%2 == 256
4111
paddd m0, [rsp+5*mmsize]
4112
paddd m1, [rsp+6*mmsize]
4113
paddd m0, [rsp+7*mmsize]
4114
paddd m1, [rsp+8*mmsize]
4115
psrld m0, 1
4116
%endif
4117
HADDD xm0, xm2
4118
HADDD xm1, xm3
4119
%else ; !HIGH_BIT_DEPTH
4120
%if %1*%2*16/mmsize >= 128
4121
paddusw m0, [rsp+3*mmsize]
4122
paddusw m1, [rsp+4*mmsize]
4123
%endif
4124
%if %1*%2*16/mmsize == 256
4125
paddusw m0, [rsp+5*mmsize]
4126
paddusw m1, [rsp+6*mmsize]
4127
paddusw m0, [rsp+7*mmsize]
4128
paddusw m1, [rsp+8*mmsize]
4129
psrlw m0, 1
4130
%endif
4131
%if mmsize==32
4132
vextracti128 xm2, m0, 1
4133
vextracti128 xm3, m1, 1
4134
paddusw xm0, xm2
4135
paddusw xm1, xm3
4136
%endif
4137
HADDUW xm0, xm2
4138
HADDW xm1, xm3
4139
%endif ; HIGH_BIT_DEPTH
4140
%endmacro
4141
4142
; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
4143
%macro HADAMARD_AC_WXH_SSE2 2
4144
cglobal pixel_hadamard_ac_%1x%2, 2,4,11
4145
%define ysub r1
4146
FIX_STRIDES r1
4147
mov r3, rsp
4148
and rsp, ~(mmsize-1)
4149
sub rsp, mmsize*3
4150
lea r2, [r1*3]
4151
call hadamard_ac_8x8
4152
%if %2==16
4153
%define ysub r2
4154
lea r0, [r0+r1*4]
4155
sub rsp, mmsize*2
4156
call hadamard_ac_8x8
4157
%endif
4158
%if %1==16 && mmsize <= 16
4159
neg ysub
4160
sub rsp, mmsize*2
4161
lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
4162
neg ysub
4163
call hadamard_ac_8x8
4164
%if %2==16
4165
lea r0, [r0+r1*4]
4166
sub rsp, mmsize*2
4167
call hadamard_ac_8x8
4168
%endif
4169
%endif
4170
HADAMARD_AC_WXH_SUM_SSE2 %1, %2
4171
movd edx, xm0
4172
movd eax, xm1
4173
shr edx, 2 - (%1*%2*16/mmsize >> 8)
4174
shr eax, 1
4175
%if ARCH_X86_64
4176
shl rdx, 32
4177
add rax, rdx
4178
%endif
4179
mov rsp, r3
4180
RET
4181
%endmacro ; HADAMARD_AC_WXH_SSE2
4182
4183
; instantiate satds
4184
4185
%if ARCH_X86_64 == 0
4186
cextern pixel_sa8d_8x8_internal_mmx2
4187
INIT_MMX mmx2
4188
SA8D
4189
%endif
4190
4191
%define TRANS TRANS_SSE2
4192
%define DIFFOP DIFF_UNPACK_SSE2
4193
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
4194
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
4195
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
4196
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
4197
%define movdqu movups
4198
%define punpcklqdq movlhps
4199
INIT_XMM sse2
4200
SA8D
4201
SATDS_SSE2
4202
%if ARCH_X86_64
4203
SA8D_SATD
4204
%endif
4205
%if HIGH_BIT_DEPTH == 0
4206
INTRA_SA8D_SSE2
4207
%endif
4208
INIT_MMX mmx2
4209
INTRA_X3_MMX
4210
INIT_XMM sse2
4211
HADAMARD_AC_SSE2
4212
4213
%if HIGH_BIT_DEPTH == 0
4214
INIT_XMM ssse3,atom
4215
SATDS_SSE2
4216
SA8D
4217
HADAMARD_AC_SSE2
4218
%if ARCH_X86_64
4219
SA8D_SATD
4220
%endif
4221
%endif
4222
4223
%define DIFFOP DIFF_SUMSUB_SSSE3
4224
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
4225
%if HIGH_BIT_DEPTH == 0
4226
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
4227
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
4228
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
4229
%endif
4230
INIT_XMM ssse3
4231
SATDS_SSE2
4232
SA8D
4233
HADAMARD_AC_SSE2
4234
%if ARCH_X86_64
4235
SA8D_SATD
4236
%endif
4237
%if HIGH_BIT_DEPTH == 0
4238
INTRA_X9
4239
INTRA8_X9
4240
%endif
4241
%undef movdqa ; nehalem doesn't like movaps
4242
%undef movdqu ; movups
4243
%undef punpcklqdq ; or movlhps
4244
%if HIGH_BIT_DEPTH == 0
4245
INIT_MMX ssse3
4246
INTRA_X3_MMX
4247
%endif
4248
4249
%define TRANS TRANS_SSE4
4250
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
4251
INIT_XMM sse4
4252
SATDS_SSE2
4253
SA8D
4254
HADAMARD_AC_SSE2
4255
%if ARCH_X86_64
4256
SA8D_SATD
4257
%endif
4258
%if HIGH_BIT_DEPTH == 0
4259
INTRA_X9
4260
INTRA8_X9
4261
%endif
4262
4263
; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
4264
; it's effectively free.
4265
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
4266
INIT_XMM avx
4267
SATDS_SSE2
4268
SA8D
4269
%if ARCH_X86_64
4270
SA8D_SATD
4271
%endif
4272
%if HIGH_BIT_DEPTH == 0
4273
INTRA_X9
4274
INTRA8_X9
4275
%endif
4276
HADAMARD_AC_SSE2
4277
4278
%define TRANS TRANS_XOP
4279
INIT_XMM xop
4280
SATDS_SSE2
4281
SA8D
4282
%if ARCH_X86_64
4283
SA8D_SATD
4284
%endif
4285
%if HIGH_BIT_DEPTH == 0
4286
INTRA_X9
4287
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
4288
%endif
4289
HADAMARD_AC_SSE2
4290
4291
4292
%if HIGH_BIT_DEPTH == 0
4293
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
4294
%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
4295
%define TRANS TRANS_SSE4
4296
INIT_YMM avx2
4297
HADAMARD_AC_SSE2
4298
%if ARCH_X86_64
4299
SA8D_SATD
4300
%endif
4301
4302
%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul]
4303
movq xm%1, [r0]
4304
movq xm%3, [r2]
4305
movq xm%2, [r0+r1]
4306
movq xm%4, [r2+r3]
4307
vinserti128 m%1, m%1, [r0+4*r1], 1
4308
vinserti128 m%3, m%3, [r2+4*r3], 1
4309
vinserti128 m%2, m%2, [r0+r4], 1
4310
vinserti128 m%4, m%4, [r2+r5], 1
4311
punpcklqdq m%1, m%1
4312
punpcklqdq m%3, m%3
4313
punpcklqdq m%2, m%2
4314
punpcklqdq m%4, m%4
4315
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
4316
lea r0, [r0+2*r1]
4317
lea r2, [r2+2*r3]
4318
4319
movq xm%3, [r0]
4320
movq xm%5, [r2]
4321
movq xm%4, [r0+r1]
4322
movq xm%6, [r2+r3]
4323
vinserti128 m%3, m%3, [r0+4*r1], 1
4324
vinserti128 m%5, m%5, [r2+4*r3], 1
4325
vinserti128 m%4, m%4, [r0+r4], 1
4326
vinserti128 m%6, m%6, [r2+r5], 1
4327
punpcklqdq m%3, m%3
4328
punpcklqdq m%5, m%5
4329
punpcklqdq m%4, m%4
4330
punpcklqdq m%6, m%6
4331
DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
4332
%endmacro
4333
4334
%macro SATD_START_AVX2 2-3 0
4335
FIX_STRIDES r1, r3
4336
%if %3
4337
mova %2, [hmul_8p]
4338
lea r4, [5*r1]
4339
lea r5, [5*r3]
4340
%else
4341
mova %2, [hmul_16p]
4342
lea r4, [3*r1]
4343
lea r5, [3*r3]
4344
%endif
4345
pxor %1, %1
4346
%endmacro
4347
4348
%define TRANS TRANS_SSE4
4349
INIT_YMM avx2
4350
cglobal pixel_satd_16x8_internal
4351
LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
4352
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4353
LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
4354
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4355
ret
4356
4357
cglobal pixel_satd_16x16, 4,6,8
4358
SATD_START_AVX2 m6, m7
4359
call pixel_satd_16x8_internal
4360
lea r0, [r0+4*r1]
4361
lea r2, [r2+4*r3]
4362
pixel_satd_16x8_internal:
4363
call pixel_satd_16x8_internal
4364
vextracti128 xm0, m6, 1
4365
paddw xm0, xm6
4366
SATD_END_SSE2 xm0
4367
RET
4368
4369
cglobal pixel_satd_16x8, 4,6,8
4370
SATD_START_AVX2 m6, m7
4371
jmp pixel_satd_16x8_internal
4372
4373
cglobal pixel_satd_8x8_internal
4374
LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
4375
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
4376
ret
4377
4378
cglobal pixel_satd_8x16, 4,6,8
4379
SATD_START_AVX2 m6, m7, 1
4380
call pixel_satd_8x8_internal
4381
lea r0, [r0+2*r1]
4382
lea r2, [r2+2*r3]
4383
lea r0, [r0+4*r1]
4384
lea r2, [r2+4*r3]
4385
call pixel_satd_8x8_internal
4386
vextracti128 xm0, m6, 1
4387
paddw xm0, xm6
4388
SATD_END_SSE2 xm0
4389
RET
4390
4391
cglobal pixel_satd_8x8, 4,6,8
4392
SATD_START_AVX2 m6, m7, 1
4393
call pixel_satd_8x8_internal
4394
vextracti128 xm0, m6, 1
4395
paddw xm0, xm6
4396
SATD_END_SSE2 xm0
4397
RET
4398
4399
cglobal pixel_sa8d_8x8_internal
4400
LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
4401
HADAMARD4_V 0, 1, 2, 3, 4
4402
HADAMARD 8, sumsub, 0, 1, 4, 5
4403
HADAMARD 8, sumsub, 2, 3, 4, 5
4404
HADAMARD 2, sumsub, 0, 1, 4, 5
4405
HADAMARD 2, sumsub, 2, 3, 4, 5
4406
HADAMARD 1, amax, 0, 1, 4, 5
4407
HADAMARD 1, amax, 2, 3, 4, 5
4408
paddw m6, m0
4409
paddw m6, m2
4410
ret
4411
4412
cglobal pixel_sa8d_8x8, 4,6,8
4413
SATD_START_AVX2 m6, m7, 1
4414
call pixel_sa8d_8x8_internal
4415
vextracti128 xm1, m6, 1
4416
paddw xm6, xm1
4417
HADDW xm6, xm1
4418
movd eax, xm6
4419
add eax, 1
4420
shr eax, 1
4421
RET
4422
4423
cglobal intra_sad_x9_8x8, 5,7,8
%define pred(i,j) [rsp+i*0x40+j*0x20]

mov r6, rsp
and rsp, ~31
sub rsp, 0x240
movu m5, [r0+0*FENC_STRIDE]
movu m6, [r0+4*FENC_STRIDE]
punpcklqdq m5, [r0+2*FENC_STRIDE]
punpcklqdq m6, [r0+6*FENC_STRIDE]

; save instruction size: avoid 4-byte memory offsets
lea r0, [intra8x9_h1+128]
%define off(m) (r0+m-(intra8x9_h1+128))

vpbroadcastq m0, [r2+16]
psadbw m4, m0, m5
psadbw m2, m0, m6
mova pred(0,0), m0
mova pred(0,1), m0
paddw m4, m2

vpbroadcastq m1, [r2+7]
pshufb m3, m1, [off(intra8x9_h1)]
pshufb m2, m1, [off(intra8x9_h3)]
mova pred(1,0), m3
mova pred(1,1), m2
psadbw m3, m5
psadbw m2, m6
paddw m3, m2

lea r5, [rsp+0x100]
%define pred(i,j) [r5+i*0x40+j*0x20-0x100]

; combine the first two
pslldq m3, 2
por m4, m3

pxor m2, m2
psadbw m0, m2
psadbw m1, m2
paddw m0, m1
psrlw m0, 3
pavgw m0, m2
pshufb m0, m2
mova pred(2,0), m0
mova pred(2,1), m0
psadbw m3, m0, m5
psadbw m2, m0, m6
paddw m3, m2

pslldq m3, 4
por m4, m3

vbroadcasti128 m0, [r2+16]
vbroadcasti128 m2, [r2+17]
pslldq m1, m0, 1
pavgb m3, m0, m2
PRED4x4_LOWPASS m0, m1, m2, m0, m7
pshufb m1, m0, [off(intra8x9_ddl1)]
pshufb m2, m0, [off(intra8x9_ddl3)]
mova pred(3,0), m1
mova pred(3,1), m2
psadbw m1, m5
psadbw m2, m6
paddw m1, m2

pslldq m1, 6
por m4, m1
vextracti128 xm1, m4, 1
paddw xm4, xm1
mova [r4], xm4

; for later
vinserti128 m7, m3, xm0, 1

vbroadcasti128 m2, [r2+8]
vbroadcasti128 m0, [r2+7]
vbroadcasti128 m1, [r2+6]
pavgb m3, m2, m0
PRED4x4_LOWPASS m0, m1, m2, m0, m4
pshufb m1, m0, [off(intra8x9_ddr1)]
pshufb m2, m0, [off(intra8x9_ddr3)]
mova pred(4,0), m1
mova pred(4,1), m2
psadbw m4, m1, m5
psadbw m2, m6
paddw m4, m2

add r0, 256
add r5, 0xC0
%define off(m) (r0+m-(intra8x9_h1+256+128))
%define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]

vpblendd m2, m3, m0, 11110011b
pshufb m1, m2, [off(intra8x9_vr1)]
pshufb m2, m2, [off(intra8x9_vr3)]
mova pred(5,0), m1
mova pred(5,1), m2
psadbw m1, m5
psadbw m2, m6
paddw m1, m2

pslldq m1, 2
por m4, m1

psrldq m2, m3, 4
pblendw m2, m0, q3330
punpcklbw m0, m3
pshufb m1, m2, [off(intra8x9_hd1)]
pshufb m2, m0, [off(intra8x9_hd3)]
mova pred(6,0), m1
mova pred(6,1), m2
psadbw m1, m5
psadbw m2, m6
paddw m1, m2

pslldq m1, 4
por m4, m1

pshufb m1, m7, [off(intra8x9_vl1)]
pshufb m2, m7, [off(intra8x9_vl3)]
mova pred(7,0), m1
mova pred(7,1), m2
psadbw m1, m5
psadbw m2, m6
paddw m1, m2

pslldq m1, 6
por m4, m1
vextracti128 xm1, m4, 1
paddw xm4, xm1
mova xm3, [r4]
SBUTTERFLY qdq, 3, 4, 7
paddw xm3, xm4

pslldq m1, m0, 1
vpbroadcastd m0, [r2+7]
palignr m0, m1, 1
pshufb m1, m0, [off(intra8x9_hu1)]
pshufb m2, m0, [off(intra8x9_hu3)]
mova pred(8,0), m1
mova pred(8,1), m2
psadbw m1, m5
psadbw m2, m6
paddw m1, m2
vextracti128 xm2, m1, 1
paddw xm1, xm2
MOVHL xm2, xm1
paddw xm1, xm2
movd r2d, xm1

paddw xm3, [r3]
mova [r4], xm3
add r2w, word [r3+16]
mov [r4+16], r2w

phminposuw xm3, xm3
movd r3d, xm3
add r2d, 8<<16
cmp r3w, r2w
cmovg r3d, r2d

mov r2d, r3d
shr r3, 16
shl r3, 6
add r1, 4*FDEC_STRIDE
mova xm0, [rsp+r3+0x00]
mova xm1, [rsp+r3+0x10]
mova xm2, [rsp+r3+0x20]
mova xm3, [rsp+r3+0x30]
movq [r1+FDEC_STRIDE*-4], xm0
movhps [r1+FDEC_STRIDE*-2], xm0
movq [r1+FDEC_STRIDE*-3], xm1
movhps [r1+FDEC_STRIDE*-1], xm1
movq [r1+FDEC_STRIDE* 0], xm2
movhps [r1+FDEC_STRIDE* 2], xm2
movq [r1+FDEC_STRIDE* 1], xm3
movhps [r1+FDEC_STRIDE* 3], xm3
mov rsp, r6
mov eax, r2d
RET
%endif ; HIGH_BIT_DEPTH

;=============================================================================
; SSIM
;=============================================================================

;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
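; Roughly the C equivalent (reference sketch, cf. the plain-C version in
; pixel.c): for two horizontally adjacent 4x4 blocks, return per-block
; { s1, s2, ss, s12 } = { sum(a), sum(b), sum(a*a+b*b), sum(a*b) }.
;
; static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1,
;                              const pixel *pix2, intptr_t stride2, int sums[2][4] )
; {
;     for( int z = 0; z < 2; z++ )
;     {
;         uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x+y*stride1];
;                 int b = pix2[x+y*stride2];
;                 s1  += a;
;                 s2  += b;
;                 ss  += a*a + b*b;
;                 s12 += a*b;
;             }
;         sums[z][0] = s1;
;         sums[z][1] = s2;
;         sums[z][2] = ss;
;         sums[z][3] = s12;
;         pix1 += 4;
;         pix2 += 4;
;     }
; }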
%macro SSIM_ITER 1
%if HIGH_BIT_DEPTH
movdqu m5, [r0+(%1&1)*r1]
movdqu m6, [r2+(%1&1)*r3]
%else
movq m5, [r0+(%1&1)*r1]
movq m6, [r2+(%1&1)*r3]
punpcklbw m5, m0
punpcklbw m6, m0
%endif
%if %1==1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
%endif
%if %1==0
movdqa m1, m5
movdqa m2, m6
%else
paddw m1, m5
paddw m2, m6
%endif
pmaddwd m7, m5, m6
pmaddwd m5, m5
pmaddwd m6, m6
ACCUM paddd, 3, 5, %1
ACCUM paddd, 4, 7, %1
paddd m3, m6
%endmacro

%macro SSIM 0
cglobal pixel_ssim_4x4x2_core, 4,4,8
FIX_STRIDES r1, r3
pxor m0, m0
SSIM_ITER 0
SSIM_ITER 1
SSIM_ITER 2
SSIM_ITER 3
; PHADDW m1, m2
; PHADDD m3, m4
movdqa m7, [pw_1]
pshufd m5, m3, q2301
pmaddwd m1, m7
pmaddwd m2, m7
pshufd m6, m4, q2301
packssdw m1, m2
paddd m3, m5
pshufd m1, m1, q3120
paddd m4, m6
pmaddwd m1, m7
punpckhdq m5, m3, m4
punpckldq m3, m4

%if UNIX64
%define t0 r4
%else
%define t0 rax
mov t0, r4mp
%endif

movq [t0+ 0], m1
movq [t0+ 8], m3
movhps [t0+16], m1
movq [t0+24], m5
RET

;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
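; Per group of sums this computes, roughly as in the C ssim_end1:
;   vars  = 64*ss  - s1*s1 - s2*s2
;   covar = 64*s12 - s1*s2
;   ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;         / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; then horizontally adds up to 4 such results; when width < 4 the mask_ff
; load below zeroes the unused lanes before the final sum.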
cglobal pixel_ssim_end4, 2,3
mov r2d, r2m
mova m0, [r0+ 0]
mova m1, [r0+16]
mova m2, [r0+32]
mova m3, [r0+48]
mova m4, [r0+64]
paddd m0, [r1+ 0]
paddd m1, [r1+16]
paddd m2, [r1+32]
paddd m3, [r1+48]
paddd m4, [r1+64]
paddd m0, m1
paddd m1, m2
paddd m2, m3
paddd m3, m4
TRANSPOSE4x4D 0, 1, 2, 3, 4

; s1=m0, s2=m1, ss=m2, s12=m3
%if BIT_DEPTH == 10
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m4, m0, m1 ; s1*s2
mulps m0, m0 ; s1*s1
mulps m1, m1 ; s2*s2
mulps m2, [pf_64] ; ss*64
mulps m3, [pf_128] ; s12*128
addps m4, m4 ; s1*s2*2
addps m0, m1 ; s1*s1 + s2*s2
subps m2, m0 ; vars
subps m3, m4 ; covar*2
movaps m1, [ssim_c1]
addps m4, m1 ; s1*s2*2 + ssim_c1
addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
movaps m1, [ssim_c2]
addps m2, m1 ; vars + ssim_c2
addps m3, m1 ; covar*2 + ssim_c2
%else
pmaddwd m4, m1, m0 ; s1*s2
pslld m1, 16
por m0, m1
pmaddwd m0, m0 ; s1*s1 + s2*s2
pslld m4, 1
pslld m3, 7
pslld m2, 6
psubd m3, m4 ; covar*2
psubd m2, m0 ; vars
mova m1, [ssim_c1]
paddd m0, m1
paddd m4, m1
mova m1, [ssim_c2]
paddd m3, m1
paddd m2, m1
cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
%endif
mulps m4, m3
mulps m0, m2
divps m4, m0 ; ssim

cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2

%ifdef PIC
lea r3, [mask_ff + 16]
%xdefine %%mask r3
%else
%xdefine %%mask mask_ff + 16
%endif
%if cpuflag(avx)
andps m4, [%%mask + r2*4]
%else
movups m0, [%%mask + r2*4]
andps m4, m0
%endif

.skip:
movhlps m0, m4
addps m0, m4
%if cpuflag(ssse3)
movshdup m4, m0
%else
pshuflw m4, m0, q0032
%endif
addss m0, m4
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
RET
%endmacro ; SSIM

INIT_XMM sse2
SSIM
INIT_XMM avx
SSIM

;-----------------------------------------------------------------------------
; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
;-----------------------------------------------------------------------------
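; Roughly the C equivalent (reference sketch; used by the weightp analysis):
; the absolute difference between the summed pixel values of two 8-wide
; columns of 'height' rows.
;
; static int pixel_asd8( pixel *pix1, intptr_t stride1,
;                        pixel *pix2, intptr_t stride2, int height )
; {
;     int sum = 0;
;     for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
;         for( int x = 0; x < 8; x++ )
;             sum += pix1[x] - pix2[x];
;     return abs( sum );
; }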
%macro ASD8 0
cglobal pixel_asd8, 5,5
pxor m0, m0
pxor m1, m1
.loop:
%if HIGH_BIT_DEPTH
paddw m0, [r0]
paddw m1, [r2]
paddw m0, [r0+2*r1]
paddw m1, [r2+2*r3]
lea r0, [r0+4*r1]
paddw m0, [r0]
paddw m1, [r2+4*r3]
lea r2, [r2+4*r3]
paddw m0, [r0+2*r1]
paddw m1, [r2+2*r3]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%else
movq m2, [r0]
movq m3, [r2]
movhps m2, [r0+r1]
movhps m3, [r2+r3]
lea r0, [r0+2*r1]
psadbw m2, m1
psadbw m3, m1
movq m4, [r0]
movq m5, [r2+2*r3]
lea r2, [r2+2*r3]
movhps m4, [r0+r1]
movhps m5, [r2+r3]
lea r0, [r0+2*r1]
paddw m0, m2
psubw m0, m3
psadbw m4, m1
psadbw m5, m1
lea r2, [r2+2*r3]
paddw m0, m4
psubw m0, m5
%endif
sub r4d, 4
jg .loop
%if HIGH_BIT_DEPTH
psubw m0, m1
HADDW m0, m1
ABSD m1, m0
%else
MOVHL m1, m0
paddw m0, m1
ABSW m1, m0
%endif
movd eax, m1
RET
%endmacro

INIT_XMM sse2
ASD8
INIT_XMM ssse3
ASD8
%if HIGH_BIT_DEPTH
INIT_XMM xop
ASD8
%endif

;=============================================================================
; Successive Elimination ADS
;=============================================================================

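; Successive elimination: for every candidate mv position, the pixel_ads*
; kernels compare the precomputed reference sums (r1) against the DC sums of
; the source block (enc_dc in r0), add the mv cost from r3, and store a byte
; mask of the positions whose lower bound stays below thresh; ads_mvs then
; compacts that mask array into a list of candidate mv indices.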
%macro ADS_START 0
%if UNIX64
movsxd r5, r5d
%else
mov r5d, r5m
%endif
mov r0d, r5d
lea r6, [r4+r5+(mmsize-1)]
and r6, ~(mmsize-1)
shl r2d, 1
%endmacro

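; ADS_END advances the sums/cost pointers and the mask write pointer, loops
; while positions remain, then recomputes the mask base and tail-jumps into
; ads_mvs, so each pixel_ads* entry point returns the compacted mv list
; directly.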
%macro ADS_END 1 ; unroll_size
add r1, 8*%1
add r3, 8*%1
add r6, 4*%1
sub r0d, 4*%1
jg .loop
WIN64_RESTORE_XMM rsp
%if mmsize==32
vzeroupper
%endif
lea r6, [r4+r5+(mmsize-1)]
and r6, ~(mmsize-1)
%if cpuflag(ssse3)
jmp ads_mvs_ssse3
%else
jmp ads_mvs_mmx
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
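; Roughly the C equivalent of the ads4 kernel (reference sketch; the asm emits
; a byte mask per position into a scratch buffer and leaves building the mv
; list to ads_mvs rather than branching here):
;
; for( int i = 0; i < width; i++ )
; {
;     int ads = abs( enc_dc[0] - sums[i+0] )
;             + abs( enc_dc[1] - sums[i+8] )
;             + abs( enc_dc[2] - sums[i+delta] )
;             + abs( enc_dc[3] - sums[i+delta+8] )
;             + cost_mvx[i];
;     mask[i] = ads < thresh;  /* 'mask' is the hypothetical scratch buffer */
; }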
INIT_MMX mmx2
cglobal pixel_ads4, 5,7
mova m6, [r0]
mova m4, [r0+8]
pshufw m7, m6, 0
pshufw m6, m6, q2222
pshufw m5, m4, 0
pshufw m4, m4, q2222
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+16]
psubw m0, m7
psubw m1, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
movu m2, [r1+r2]
movu m3, [r1+r2+16]
psubw m2, m5
psubw m3, m4
paddw m0, m1
ABSW m2, m2, m1
ABSW m3, m3, m1
paddw m0, m2
paddw m0, m3
pshufw m1, r6m, 0
paddusw m0, [r3]
psubusw m1, m0
packsswb m1, m1
movd [r6], m1
ADS_END 1

cglobal pixel_ads2, 5,7
mova m6, [r0]
pshufw m5, r6m, 0
pshufw m7, m6, 0
pshufw m6, m6, q2222
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+r2]
psubw m0, m7
psubw m1, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
paddw m0, m1
paddusw m0, [r3]
mova m4, m5
psubusw m4, m0
packsswb m4, m4
movd [r6], m4
ADS_END 1

cglobal pixel_ads1, 5,7
pshufw m7, [r0], 0
pshufw m6, r6m, 0
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+8]
psubw m0, m7
psubw m1, m7
ABSW m0, m0, m2
ABSW m1, m1, m3
paddusw m0, [r3]
paddusw m1, [r3+8]
mova m4, m6
mova m5, m6
psubusw m4, m0
psubusw m5, m1
packsswb m4, m5
mova [r6], m4
ADS_END 2

%macro ADS_XMM 0
%if mmsize==32
cglobal pixel_ads4, 5,7,8
vpbroadcastw m7, [r0+ 0]
vpbroadcastw m6, [r0+ 4]
vpbroadcastw m5, [r0+ 8]
vpbroadcastw m4, [r0+12]
%else
cglobal pixel_ads4, 5,7,12
mova m4, [r0]
pshuflw m7, m4, q0000
pshuflw m6, m4, q2222
pshufhw m5, m4, q0000
pshufhw m4, m4, q2222
punpcklqdq m7, m7
punpcklqdq m6, m6
punpckhqdq m5, m5
punpckhqdq m4, m4
%endif
%if ARCH_X86_64 && mmsize == 16
movd m8, r6m
SPLATW m8, m8
ADS_START
movu m10, [r1]
movu m11, [r1+r2]
.loop:
psubw m0, m10, m7
movu m10, [r1+16]
psubw m1, m10, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
psubw m2, m11, m5
movu m11, [r1+r2+16]
paddw m0, m1
psubw m3, m11, m4
movu m9, [r3]
ABSW m2, m2, m1
ABSW m3, m3, m1
paddw m0, m2
paddw m0, m3
paddusw m0, m9
psubusw m1, m8, m0
%else
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+16]
psubw m0, m7
psubw m1, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
movu m2, [r1+r2]
movu m3, [r1+r2+16]
psubw m2, m5
psubw m3, m4
paddw m0, m1
ABSW m2, m2, m1
ABSW m3, m3, m1
paddw m0, m2
paddw m0, m3
movu m2, [r3]
%if mmsize==32
vpbroadcastw m1, r6m
%else
movd m1, r6m
pshuflw m1, m1, 0
punpcklqdq m1, m1
%endif
paddusw m0, m2
psubusw m1, m0
%endif ; ARCH
packsswb m1, m1
%if mmsize==32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
ADS_END mmsize/8

cglobal pixel_ads2, 5,7,8
%if mmsize==32
vpbroadcastw m7, [r0+0]
vpbroadcastw m6, [r0+4]
vpbroadcastw m5, r6m
%else
movq m6, [r0]
movd m5, r6m
pshuflw m7, m6, 0
pshuflw m6, m6, q2222
pshuflw m5, m5, 0
punpcklqdq m7, m7
punpcklqdq m6, m6
punpcklqdq m5, m5
%endif
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+r2]
psubw m0, m7
psubw m1, m6
movu m4, [r3]
ABSW m0, m0, m2
ABSW m1, m1, m3
paddw m0, m1
paddusw m0, m4
psubusw m1, m5, m0
packsswb m1, m1
%if mmsize==32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
ADS_END mmsize/8

cglobal pixel_ads1, 5,7,8
%if mmsize==32
vpbroadcastw m7, [r0]
vpbroadcastw m6, r6m
%else
movd m7, [r0]
movd m6, r6m
pshuflw m7, m7, 0
pshuflw m6, m6, 0
punpcklqdq m7, m7
punpcklqdq m6, m6
%endif
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+mmsize]
psubw m0, m7
psubw m1, m7
movu m2, [r3]
movu m3, [r3+mmsize]
ABSW m0, m0, m4
ABSW m1, m1, m5
paddusw m0, m2
paddusw m1, m3
psubusw m4, m6, m0
psubusw m5, m6, m1
packsswb m4, m5
%if mmsize==32
vpermq m4, m4, q3120
%endif
mova [r6], m4
ADS_END mmsize/4
%endmacro

INIT_XMM sse2
ADS_XMM
INIT_XMM ssse3
ADS_XMM
INIT_XMM avx
ADS_XMM
INIT_YMM avx2
ADS_XMM

; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv=0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<8; j++ )
;             if( mask & (255<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }

%macro TEST 1
mov [r4+r0*2], r1w
test r2d, 0xff<<(%1*8)
setne r3b
add r0d, r3d
inc r1d
%endmacro

INIT_MMX mmx
cglobal pixel_ads_mvs, 0,7,0
ads_mvs_mmx:
; mvs = r4
; masks = r6
; width = r5
; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
xor r0d, r0d
xor r1d, r1d
mov [r6+r5], r0d
jmp .loopi
ALIGN 16
.loopi0:
add r1d, 8
cmp r1d, r5d
jge .end
.loopi:
mov r2, [r6+r1]
%if ARCH_X86_64
test r2, r2
%else
mov r3, r2
add r3d, [r6+r1+4]
%endif
jz .loopi0
xor r3d, r3d
TEST 0
TEST 1
TEST 2
TEST 3
%if ARCH_X86_64
shr r2, 32
%else
mov r2d, [r6+r1+4] ; upper half of the mask qword on x86_32
%endif
TEST 0
TEST 1
TEST 2
TEST 3
cmp r1d, r5d
jl .loopi
.end:
movifnidn eax, r0d
RET

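; The SSSE3 variant replaces the per-byte TEST loop above with a table-driven
; approach: pcmpeqb/pmovmskb turn 8 mask bytes into a bit mask, popcnt_table
; gives the number of surviving candidates, and the matching lane indices are
; pulled out of m4 with a single pshufb through ads_mvs_shuffle.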
INIT_XMM ssse3
cglobal pixel_ads_mvs, 0,7,0
ads_mvs_ssse3:
mova m3, [pw_8]
mova m4, [pw_76543210]
pxor m5, m5
add r5, r6
xor r0d, r0d ; nmv
mov [r5], r0d
%ifdef PIC
lea r1, [$$]
%define GLOBAL +r1-$$
%else
%define GLOBAL
%endif
.loop:
movh m0, [r6]
pcmpeqb m0, m5
pmovmskb r2d, m0
xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
add r2d, r2d
; shuffle counters based on mv mask
pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
movu [r4+r0*2], m2
add r0d, r3d
paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
add r6, 8
cmp r6, r5
jl .loop
movifnidn eax, r0d
RET
