; (removed: web-page navigation chrome from the hosting site — not part of this source file)
;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
31
%include "x86util.asm"
32
33
SECTION_RODATA 32
34
35
pw_1024: times 16 dw 1024
36
filt_mul20: times 32 db 20
37
filt_mul15: times 16 db 1, -5
38
filt_mul51: times 16 db -5, 1
39
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
40
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
41
42
%if HIGH_BIT_DEPTH
43
copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
44
v210_mask: times 4 dq 0xc00ffc003ff003ff
45
v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
46
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
47
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
48
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
49
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
50
51
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
52
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
53
%else
54
copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14
55
deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
56
db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1
57
58
deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
59
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
60
%endif ; !HIGH_BIT_DEPTH
61
62
pd_16: times 4 dd 16
63
pd_0f: times 4 dd 0xffff
64
65
pad10: times 8 dw 10*PIXEL_MAX
66
pad20: times 8 dw 20*PIXEL_MAX
67
pad30: times 8 dw 30*PIXEL_MAX
68
depad: times 4 dd 32*20*PIXEL_MAX + 512
69
70
tap1: times 4 dw 1, -5
71
tap2: times 4 dw 20, 20
72
tap3: times 4 dw -5, 1
73
74
pw_0xc000: times 8 dw 0xc000
75
pw_31: times 8 dw 31
76
pd_4: times 4 dd 4
77
78
SECTION .text
79
80
cextern pb_0
81
cextern pw_1
82
cextern pw_8
83
cextern pw_16
84
cextern pw_32
85
cextern pw_512
86
cextern pw_00ff
87
cextern pw_3fff
88
cextern pw_pixel_max
89
cextern pw_0to15
90
cextern pd_ffff
91
92
%macro LOAD_ADD 4
93
movh %4, %3
94
movh %1, %2
95
punpcklbw %4, m0
96
punpcklbw %1, m0
97
paddw %1, %4
98
%endmacro
99
100
%macro LOAD_ADD_2 6
101
mova %5, %3
102
mova %1, %4
103
punpckhbw %6, %5, m0
104
punpcklbw %5, m0
105
punpckhbw %2, %1, m0
106
punpcklbw %1, m0
107
paddw %1, %5
108
paddw %2, %6
109
%endmacro
110
111
%macro FILT_V2 6
112
psubw %1, %2 ; a-b
113
psubw %4, %5
114
psubw %2, %3 ; b-c
115
psubw %5, %6
116
psllw %2, 2
117
psllw %5, 2
118
psubw %1, %2 ; a-5*b+4*c
119
psllw %3, 4
120
psubw %4, %5
121
psllw %6, 4
122
paddw %1, %3 ; a-5*b+20*c
123
paddw %4, %6
124
%endmacro
125
126
%macro FILT_H 3
127
psubw %1, %2 ; a-b
128
psraw %1, 2 ; (a-b)/4
129
psubw %1, %2 ; (a-b)/4-b
130
paddw %1, %3 ; (a-b)/4-b+c
131
psraw %1, 2 ; ((a-b)/4-b+c)/4
132
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
133
%endmacro
134
135
%macro FILT_H2 6
136
psubw %1, %2
137
psubw %4, %5
138
psraw %1, 2
139
psraw %4, 2
140
psubw %1, %2
141
psubw %4, %5
142
paddw %1, %3
143
paddw %4, %6
144
psraw %1, 2
145
psraw %4, 2
146
paddw %1, %3
147
paddw %4, %6
148
%endmacro
149
150
%macro FILT_PACK 3-5
151
%if cpuflag(ssse3)
152
pmulhrsw %1, %3
153
pmulhrsw %2, %3
154
%else
155
paddw %1, %3
156
paddw %2, %3
157
%if %0 == 5
158
psubusw %1, %5
159
psubusw %2, %5
160
psrlw %1, %4
161
psrlw %2, %4
162
%else
163
psraw %1, %4
164
psraw %2, %4
165
%endif
166
%endif
167
%if HIGH_BIT_DEPTH == 0
168
packuswb %1, %2
169
%endif
170
%endmacro
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing the hpel_filter temporal may be a win if the last level cache
;is big enough (preliminary benching suggests on the order of 4* framesize).

;%define movntq movq
;%define movntps movaps
;%define sfence
%if HIGH_BIT_DEPTH
182
;-----------------------------------------------------------------------------
183
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
184
;-----------------------------------------------------------------------------
185
%macro HPEL_FILTER 0
186
cglobal hpel_filter_v, 5,6,11
187
FIX_STRIDES r3, r4
188
lea r5, [r1+r3]
189
sub r1, r3
190
sub r1, r3
191
%if num_mmregs > 8
192
mova m8, [pad10]
193
mova m9, [pad20]
194
mova m10, [pad30]
195
%define s10 m8
196
%define s20 m9
197
%define s30 m10
198
%else
199
%define s10 [pad10]
200
%define s20 [pad20]
201
%define s30 [pad30]
202
%endif
203
add r0, r4
204
add r2, r4
205
neg r4
206
mova m7, [pw_pixel_max]
207
pxor m0, m0
208
.loop:
209
mova m1, [r1]
210
mova m2, [r1+r3]
211
mova m3, [r1+r3*2]
212
mova m4, [r1+mmsize]
213
mova m5, [r1+r3+mmsize]
214
mova m6, [r1+r3*2+mmsize]
215
paddw m1, [r5+r3*2]
216
paddw m2, [r5+r3]
217
paddw m3, [r5]
218
paddw m4, [r5+r3*2+mmsize]
219
paddw m5, [r5+r3+mmsize]
220
paddw m6, [r5+mmsize]
221
add r1, 2*mmsize
222
add r5, 2*mmsize
223
FILT_V2 m1, m2, m3, m4, m5, m6
224
mova m6, [pw_16]
225
psubw m1, s20
226
psubw m4, s20
227
mova [r2+r4], m1
228
mova [r2+r4+mmsize], m4
229
paddw m1, s30
230
paddw m4, s30
231
FILT_PACK m1, m4, m6, 5, s10
232
CLIPW m1, m0, m7
233
CLIPW m4, m0, m7
234
mova [r0+r4], m1
235
mova [r0+r4+mmsize], m4
236
add r4, 2*mmsize
237
jl .loop
238
RET
239
240
;-----------------------------------------------------------------------------
241
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
242
;-----------------------------------------------------------------------------
243
cglobal hpel_filter_c, 3,3,10
244
add r2, r2
245
add r0, r2
246
add r1, r2
247
neg r2
248
mova m0, [tap1]
249
mova m7, [tap3]
250
%if num_mmregs > 8
251
mova m8, [tap2]
252
mova m9, [depad]
253
%define s1 m8
254
%define s2 m9
255
%else
256
%define s1 [tap2]
257
%define s2 [depad]
258
%endif
259
.loop:
260
movu m1, [r1+r2-4]
261
movu m2, [r1+r2-2]
262
mova m3, [r1+r2+0]
263
movu m4, [r1+r2+2]
264
movu m5, [r1+r2+4]
265
movu m6, [r1+r2+6]
266
pmaddwd m1, m0
267
pmaddwd m2, m0
268
pmaddwd m3, s1
269
pmaddwd m4, s1
270
pmaddwd m5, m7
271
pmaddwd m6, m7
272
paddd m1, s2
273
paddd m2, s2
274
paddd m3, m5
275
paddd m4, m6
276
paddd m1, m3
277
paddd m2, m4
278
psrad m1, 10
279
psrad m2, 10
280
pslld m2, 16
281
pand m1, [pd_0f]
282
por m1, m2
283
CLIPW m1, [pb_0], [pw_pixel_max]
284
mova [r0+r2], m1
285
add r2, mmsize
286
jl .loop
287
RET
288
289
;-----------------------------------------------------------------------------
290
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
291
;-----------------------------------------------------------------------------
292
cglobal hpel_filter_h, 3,4,8
293
%define src r1+r2
294
add r2, r2
295
add r0, r2
296
add r1, r2
297
neg r2
298
mova m0, [pw_pixel_max]
299
.loop:
300
movu m1, [src-4]
301
movu m2, [src-2]
302
mova m3, [src+0]
303
movu m6, [src+2]
304
movu m4, [src+4]
305
movu m5, [src+6]
306
paddw m3, m6 ; c0
307
paddw m2, m4 ; b0
308
paddw m1, m5 ; a0
309
%if mmsize == 16
310
movu m4, [src-4+mmsize]
311
movu m5, [src-2+mmsize]
312
%endif
313
movu m7, [src+4+mmsize]
314
movu m6, [src+6+mmsize]
315
paddw m5, m7 ; b1
316
paddw m4, m6 ; a1
317
movu m7, [src+2+mmsize]
318
mova m6, [src+0+mmsize]
319
paddw m6, m7 ; c1
320
FILT_H2 m1, m2, m3, m4, m5, m6
321
mova m7, [pw_1]
322
pxor m2, m2
323
FILT_PACK m1, m4, m7, 1
324
CLIPW m1, m2, m0
325
CLIPW m4, m2, m0
326
mova [r0+r2], m1
327
mova [r0+r2+mmsize], m4
328
add r2, mmsize*2
329
jl .loop
330
RET
331
%endmacro ; HPEL_FILTER
332
333
INIT_MMX mmx2
334
HPEL_FILTER
335
INIT_XMM sse2
336
HPEL_FILTER
337
%endif ; HIGH_BIT_DEPTH
338
339
%if HIGH_BIT_DEPTH == 0
340
%macro HPEL_V 1
341
;-----------------------------------------------------------------------------
342
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
343
;-----------------------------------------------------------------------------
344
cglobal hpel_filter_v, 5,6,%1
345
lea r5, [r1+r3]
346
sub r1, r3
347
sub r1, r3
348
add r0, r4
349
lea r2, [r2+r4*2]
350
neg r4
351
%if cpuflag(ssse3)
352
mova m0, [filt_mul15]
353
%else
354
pxor m0, m0
355
%endif
356
.loop:
357
%if cpuflag(ssse3)
358
mova m1, [r1]
359
mova m4, [r1+r3]
360
mova m2, [r5+r3*2]
361
mova m5, [r5+r3]
362
mova m3, [r1+r3*2]
363
mova m6, [r5]
364
SBUTTERFLY bw, 1, 4, 7
365
SBUTTERFLY bw, 2, 5, 7
366
SBUTTERFLY bw, 3, 6, 7
367
pmaddubsw m1, m0
368
pmaddubsw m4, m0
369
pmaddubsw m2, m0
370
pmaddubsw m5, m0
371
pmaddubsw m3, [filt_mul20]
372
pmaddubsw m6, [filt_mul20]
373
paddw m1, m2
374
paddw m4, m5
375
paddw m1, m3
376
paddw m4, m6
377
mova m7, [pw_1024]
378
%else
379
LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
380
LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
381
LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
382
LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
383
FILT_V2 m1, m2, m3, m4, m5, m6
384
mova m7, [pw_16]
385
%endif
386
%if mmsize==32
387
mova [r2+r4*2], xm1
388
mova [r2+r4*2+mmsize/2], xm4
389
vextracti128 [r2+r4*2+mmsize], m1, 1
390
vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
391
%else
392
mova [r2+r4*2], m1
393
mova [r2+r4*2+mmsize], m4
394
%endif
395
FILT_PACK m1, m4, m7, 5
396
movnta [r0+r4], m1
397
add r1, mmsize
398
add r5, mmsize
399
add r4, mmsize
400
jl .loop
401
RET
402
%endmacro
403
404
;-----------------------------------------------------------------------------
405
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
406
;-----------------------------------------------------------------------------
407
INIT_MMX mmx2
408
cglobal hpel_filter_c, 3,3
409
add r0, r2
410
lea r1, [r1+r2*2]
411
neg r2
412
%define src r1+r2*2
413
movq m7, [pw_32]
414
.loop:
415
movq m1, [src-4]
416
movq m2, [src-2]
417
movq m3, [src ]
418
movq m4, [src+4]
419
movq m5, [src+6]
420
paddw m3, [src+2] ; c0
421
paddw m2, m4 ; b0
422
paddw m1, m5 ; a0
423
movq m6, [src+8]
424
paddw m4, [src+14] ; a1
425
paddw m5, [src+12] ; b1
426
paddw m6, [src+10] ; c1
427
FILT_H2 m1, m2, m3, m4, m5, m6
428
FILT_PACK m1, m4, m7, 6
429
movntq [r0+r2], m1
430
add r2, 8
431
jl .loop
432
RET
433
434
;-----------------------------------------------------------------------------
435
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
436
;-----------------------------------------------------------------------------
437
INIT_MMX mmx2
438
cglobal hpel_filter_h, 3,3
439
add r0, r2
440
add r1, r2
441
neg r2
442
%define src r1+r2
443
pxor m0, m0
444
.loop:
445
movd m1, [src-2]
446
movd m2, [src-1]
447
movd m3, [src ]
448
movd m6, [src+1]
449
movd m4, [src+2]
450
movd m5, [src+3]
451
punpcklbw m1, m0
452
punpcklbw m2, m0
453
punpcklbw m3, m0
454
punpcklbw m6, m0
455
punpcklbw m4, m0
456
punpcklbw m5, m0
457
paddw m3, m6 ; c0
458
paddw m2, m4 ; b0
459
paddw m1, m5 ; a0
460
movd m7, [src+7]
461
movd m6, [src+6]
462
punpcklbw m7, m0
463
punpcklbw m6, m0
464
paddw m4, m7 ; c1
465
paddw m5, m6 ; b1
466
movd m7, [src+5]
467
movd m6, [src+4]
468
punpcklbw m7, m0
469
punpcklbw m6, m0
470
paddw m6, m7 ; a1
471
movq m7, [pw_1]
472
FILT_H2 m1, m2, m3, m4, m5, m6
473
FILT_PACK m1, m4, m7, 1
474
movntq [r0+r2], m1
475
add r2, 8
476
jl .loop
477
RET
478
479
%macro HPEL_C 0
480
;-----------------------------------------------------------------------------
481
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
482
;-----------------------------------------------------------------------------
483
cglobal hpel_filter_c, 3,3,9
484
add r0, r2
485
lea r1, [r1+r2*2]
486
neg r2
487
%define src r1+r2*2
488
%ifnidn cpuname, sse2
489
%if cpuflag(ssse3)
490
mova m7, [pw_512]
491
%else
492
mova m7, [pw_32]
493
%endif
494
%define pw_rnd m7
495
%elif ARCH_X86_64
496
mova m8, [pw_32]
497
%define pw_rnd m8
498
%else
499
%define pw_rnd [pw_32]
500
%endif
501
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
502
%if mmsize==32
503
.loop:
504
movu m4, [src-4]
505
movu m5, [src-2]
506
mova m6, [src+0]
507
movu m3, [src-4+mmsize]
508
movu m2, [src-2+mmsize]
509
mova m1, [src+0+mmsize]
510
paddw m4, [src+6]
511
paddw m5, [src+4]
512
paddw m6, [src+2]
513
paddw m3, [src+6+mmsize]
514
paddw m2, [src+4+mmsize]
515
paddw m1, [src+2+mmsize]
516
FILT_H2 m4, m5, m6, m3, m2, m1
517
%else
518
mova m0, [src-16]
519
mova m1, [src]
520
.loop:
521
mova m2, [src+16]
522
PALIGNR m4, m1, m0, 12, m7
523
PALIGNR m5, m1, m0, 14, m0
524
PALIGNR m0, m2, m1, 6, m7
525
paddw m4, m0
526
PALIGNR m0, m2, m1, 4, m7
527
paddw m5, m0
528
PALIGNR m6, m2, m1, 2, m7
529
paddw m6, m1
530
FILT_H m4, m5, m6
531
532
mova m0, m2
533
mova m5, m2
534
PALIGNR m2, m1, 12, m7
535
PALIGNR m5, m1, 14, m1
536
mova m1, [src+32]
537
PALIGNR m3, m1, m0, 6, m7
538
paddw m3, m2
539
PALIGNR m6, m1, m0, 4, m7
540
paddw m5, m6
541
PALIGNR m6, m1, m0, 2, m7
542
paddw m6, m0
543
FILT_H m3, m5, m6
544
%endif
545
FILT_PACK m4, m3, pw_rnd, 6
546
%if mmsize==32
547
vpermq m4, m4, q3120
548
%endif
549
movnta [r0+r2], m4
550
add r2, mmsize
551
jl .loop
552
RET
553
%endmacro
554
555
;-----------------------------------------------------------------------------
556
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
557
;-----------------------------------------------------------------------------
558
INIT_XMM sse2
559
cglobal hpel_filter_h, 3,3,8
560
add r0, r2
561
add r1, r2
562
neg r2
563
%define src r1+r2
564
pxor m0, m0
565
.loop:
566
movh m1, [src-2]
567
movh m2, [src-1]
568
movh m3, [src ]
569
movh m4, [src+1]
570
movh m5, [src+2]
571
movh m6, [src+3]
572
punpcklbw m1, m0
573
punpcklbw m2, m0
574
punpcklbw m3, m0
575
punpcklbw m4, m0
576
punpcklbw m5, m0
577
punpcklbw m6, m0
578
paddw m3, m4 ; c0
579
paddw m2, m5 ; b0
580
paddw m1, m6 ; a0
581
movh m4, [src+6]
582
movh m5, [src+7]
583
movh m6, [src+10]
584
movh m7, [src+11]
585
punpcklbw m4, m0
586
punpcklbw m5, m0
587
punpcklbw m6, m0
588
punpcklbw m7, m0
589
paddw m5, m6 ; b1
590
paddw m4, m7 ; a1
591
movh m6, [src+8]
592
movh m7, [src+9]
593
punpcklbw m6, m0
594
punpcklbw m7, m0
595
paddw m6, m7 ; c1
596
mova m7, [pw_1] ; FIXME xmm8
597
FILT_H2 m1, m2, m3, m4, m5, m6
598
FILT_PACK m1, m4, m7, 1
599
movntps [r0+r2], m1
600
add r2, 16
601
jl .loop
602
RET
603
604
;-----------------------------------------------------------------------------
605
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
606
;-----------------------------------------------------------------------------
607
%macro HPEL_H 0
608
cglobal hpel_filter_h, 3,3
609
add r0, r2
610
add r1, r2
611
neg r2
612
%define src r1+r2
613
mova m0, [src-16]
614
mova m1, [src]
615
mova m7, [pw_1024]
616
.loop:
617
mova m2, [src+16]
618
; Using unaligned loads instead of palignr is marginally slower on SB and significantly
619
; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
620
; the repeated loads of constants for pmaddubsw.
621
palignr m3, m1, m0, 14
622
palignr m4, m1, m0, 15
623
palignr m0, m2, m1, 2
624
pmaddubsw m3, [filt_mul15]
625
pmaddubsw m4, [filt_mul15]
626
pmaddubsw m0, [filt_mul51]
627
palignr m5, m2, m1, 1
628
palignr m6, m2, m1, 3
629
paddw m3, m0
630
mova m0, m1
631
pmaddubsw m1, [filt_mul20]
632
pmaddubsw m5, [filt_mul20]
633
pmaddubsw m6, [filt_mul51]
634
paddw m3, m1
635
paddw m4, m5
636
paddw m4, m6
637
FILT_PACK m3, m4, m7, 5
638
pshufb m3, [hpel_shuf]
639
mova m1, m2
640
movntps [r0+r2], m3
641
add r2, 16
642
jl .loop
643
RET
644
%endmacro
645
646
INIT_MMX mmx2
647
HPEL_V 0
648
INIT_XMM sse2
649
HPEL_V 8
650
%if ARCH_X86_64 == 0
651
INIT_XMM sse2
652
HPEL_C
653
INIT_XMM ssse3
654
HPEL_C
655
HPEL_V 0
656
HPEL_H
657
INIT_XMM avx
658
HPEL_C
659
HPEL_V 0
660
HPEL_H
661
INIT_YMM avx2
662
HPEL_V 8
663
HPEL_C
664
665
INIT_YMM avx2
666
cglobal hpel_filter_h, 3,3,8
667
add r0, r2
668
add r1, r2
669
neg r2
670
%define src r1+r2
671
mova m5, [filt_mul15]
672
mova m6, [filt_mul20]
673
mova m7, [filt_mul51]
674
.loop:
675
movu m0, [src-2]
676
movu m1, [src-1]
677
movu m2, [src+2]
678
pmaddubsw m0, m5
679
pmaddubsw m1, m5
680
pmaddubsw m2, m7
681
paddw m0, m2
682
683
mova m2, [src+0]
684
movu m3, [src+1]
685
movu m4, [src+3]
686
pmaddubsw m2, m6
687
pmaddubsw m3, m6
688
pmaddubsw m4, m7
689
paddw m0, m2
690
paddw m1, m3
691
paddw m1, m4
692
693
mova m2, [pw_1024]
694
FILT_PACK m0, m1, m2, 5
695
pshufb m0, [hpel_shuf]
696
movnta [r0+r2], m0
697
add r2, mmsize
698
jl .loop
699
RET
700
%endif
701
702
%if ARCH_X86_64
703
%macro DO_FILT_V 5
704
;The optimum prefetch distance is difficult to determine in checkasm:
705
;any prefetch seems slower than not prefetching.
706
;In real use, the prefetch seems to be a slight win.
707
;+mmsize is picked somewhat arbitrarily here based on the fact that even one
708
;loop iteration is going to take longer than the prefetch.
709
prefetcht0 [r1+r2*2+mmsize]
710
%if cpuflag(ssse3)
711
mova m1, [r3]
712
mova m2, [r3+r2]
713
mova %3, [r3+r2*2]
714
mova m3, [r1]
715
mova %1, [r1+r2]
716
mova %2, [r1+r2*2]
717
punpckhbw m4, m1, m2
718
punpcklbw m1, m2
719
punpckhbw m2, %1, %2
720
punpcklbw %1, %2
721
punpckhbw %2, m3, %3
722
punpcklbw m3, %3
723
724
pmaddubsw m1, m12
725
pmaddubsw m4, m12
726
pmaddubsw %1, m0
727
pmaddubsw m2, m0
728
pmaddubsw m3, m14
729
pmaddubsw %2, m14
730
731
paddw m1, %1
732
paddw m4, m2
733
paddw m1, m3
734
paddw m4, %2
735
%else
736
LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
737
LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
738
LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
739
packuswb %3, %4
740
FILT_V2 m1, m2, m3, m4, m5, m6
741
%endif
742
add r3, mmsize
743
add r1, mmsize
744
%if mmsize==32
745
vinserti128 %1, m1, xm4, 1
746
vperm2i128 %2, m1, m4, q0301
747
%else
748
mova %1, m1
749
mova %2, m4
750
%endif
751
FILT_PACK m1, m4, m15, 5
752
movntps [r8+r4+%5], m1
753
%endmacro
754
755
%macro FILT_C 3
756
%if mmsize==32
757
vperm2i128 m3, %2, %1, q0003
758
%endif
759
PALIGNR m1, %2, %1, (mmsize-4), m3
760
PALIGNR m2, %2, %1, (mmsize-2), m3
761
%if mmsize==32
762
vperm2i128 %1, %3, %2, q0003
763
%endif
764
PALIGNR m3, %3, %2, 4, %1
765
PALIGNR m4, %3, %2, 2, %1
766
paddw m3, m2
767
%if mmsize==32
768
mova m2, %1
769
%endif
770
mova %1, %3
771
PALIGNR %3, %3, %2, 6, m2
772
paddw m4, %2
773
paddw %3, m1
774
FILT_H %3, m3, m4
775
%endmacro
776
777
%macro DO_FILT_C 4
778
FILT_C %1, %2, %3
779
FILT_C %2, %1, %4
780
FILT_PACK %3, %4, m15, 6
781
%if mmsize==32
782
vpermq %3, %3, q3120
783
%endif
784
movntps [r5+r4], %3
785
%endmacro
786
787
%macro ADD8TO16 5
788
punpckhbw %3, %1, %5
789
punpcklbw %1, %5
790
punpcklbw %4, %2, %5
791
punpckhbw %2, %5
792
paddw %2, %3
793
paddw %1, %4
794
%endmacro
795
796
%macro DO_FILT_H 3
797
%if mmsize==32
798
vperm2i128 m3, %2, %1, q0003
799
%endif
800
PALIGNR m1, %2, %1, (mmsize-2), m3
801
PALIGNR m2, %2, %1, (mmsize-1), m3
802
%if mmsize==32
803
vperm2i128 m3, %3, %2, q0003
804
%endif
805
PALIGNR m4, %3, %2, 1 , m3
806
PALIGNR m5, %3, %2, 2 , m3
807
PALIGNR m6, %3, %2, 3 , m3
808
mova %1, %2
809
%if cpuflag(ssse3)
810
pmaddubsw m1, m12
811
pmaddubsw m2, m12
812
pmaddubsw %2, m14
813
pmaddubsw m4, m14
814
pmaddubsw m5, m0
815
pmaddubsw m6, m0
816
paddw m1, %2
817
paddw m2, m4
818
paddw m1, m5
819
paddw m2, m6
820
FILT_PACK m1, m2, m15, 5
821
pshufb m1, [hpel_shuf]
822
%else ; ssse3, avx
823
ADD8TO16 m1, m6, m12, m3, m0 ; a
824
ADD8TO16 m2, m5, m12, m3, m0 ; b
825
ADD8TO16 %2, m4, m12, m3, m0 ; c
826
FILT_V2 m1, m2, %2, m6, m5, m4
827
FILT_PACK m1, m6, m15, 5
828
%endif
829
movntps [r0+r4], m1
830
mova %2, %3
831
%endmacro
832
833
%macro HPEL 0
834
;-----------------------------------------------------------------------------
835
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
836
; uint8_t *src, intptr_t stride, int width, int height )
837
;-----------------------------------------------------------------------------
838
cglobal hpel_filter, 7,9,16
839
mov r7, r3
840
sub r5d, mmsize
841
mov r8, r1
842
and r7, mmsize-1
843
sub r3, r7
844
add r0, r5
845
add r8, r5
846
add r7, r5
847
add r5, r2
848
mov r2, r4
849
neg r7
850
lea r1, [r3+r2]
851
sub r3, r2
852
sub r3, r2
853
mov r4, r7
854
%if cpuflag(ssse3)
855
mova m0, [filt_mul51]
856
mova m12, [filt_mul15]
857
mova m14, [filt_mul20]
858
mova m15, [pw_1024]
859
%else
860
pxor m0, m0
861
mova m15, [pw_16]
862
%endif
863
;ALIGN 16
864
.loopy:
865
; first filter_v
866
DO_FILT_V m8, m7, m13, m12, 0
867
;ALIGN 16
868
.loopx:
869
DO_FILT_V m6, m5, m11, m12, mmsize
870
.lastx:
871
%if cpuflag(ssse3)
872
psrlw m15, 1 ; pw_512
873
%else
874
paddw m15, m15 ; pw_32
875
%endif
876
DO_FILT_C m9, m8, m7, m6
877
%if cpuflag(ssse3)
878
paddw m15, m15 ; pw_1024
879
%else
880
psrlw m15, 1 ; pw_16
881
%endif
882
mova m7, m5
883
DO_FILT_H m10, m13, m11
884
add r4, mmsize
885
jl .loopx
886
cmp r4, mmsize
887
jl .lastx
888
; setup regs for next y
889
sub r4, r7
890
sub r4, r2
891
sub r1, r4
892
sub r3, r4
893
add r0, r2
894
add r8, r2
895
add r5, r2
896
mov r4, r7
897
sub r6d, 1
898
jg .loopy
899
sfence
900
RET
901
%endmacro
902
903
INIT_XMM sse2
904
HPEL
905
INIT_XMM ssse3
906
HPEL
907
INIT_XMM avx
908
HPEL
909
INIT_YMM avx2
910
HPEL
911
%endif ; ARCH_X86_64
912
913
%undef movntq
914
%undef movntps
915
%undef sfence
916
%endif ; !HIGH_BIT_DEPTH
917
918
%macro PREFETCHNT_ITER 2 ; src, bytes/iteration
919
%assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
920
%rep (%2+63) / 64 ; assume 64 byte cache lines
921
prefetchnta [%1+%%i]
922
%assign %%i %%i + 64
923
%endrep
924
%endmacro
925
926
;-----------------------------------------------------------------------------
927
; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst,
928
; pixel *src, intptr_t i_src, int w, int h )
929
;-----------------------------------------------------------------------------
930
; assumes i_dst and w are multiples of mmsize, and i_dst>w
931
%macro PLANE_COPY_CORE 1 ; swap
932
%if %1
933
cglobal plane_copy_swap_core, 6,7
934
mova m4, [copy_swap_shuf]
935
%else
936
cglobal plane_copy_core, 6,7
937
%endif
938
FIX_STRIDES r1, r3
939
%if %1 && HIGH_BIT_DEPTH
940
shl r4d, 2
941
%elif %1 || HIGH_BIT_DEPTH
942
add r4d, r4d
943
%else
944
movsxdifnidn r4, r4d
945
%endif
946
add r0, r4
947
add r2, r4
948
neg r4
949
.loopy:
950
lea r6, [r4+4*mmsize]
951
%if %1
952
test r6d, r6d
953
jg .skip
954
%endif
955
.loopx:
956
PREFETCHNT_ITER r2+r6, 4*mmsize
957
movu m0, [r2+r6-4*mmsize]
958
movu m1, [r2+r6-3*mmsize]
959
movu m2, [r2+r6-2*mmsize]
960
movu m3, [r2+r6-1*mmsize]
961
%if %1
962
pshufb m0, m4
963
pshufb m1, m4
964
pshufb m2, m4
965
pshufb m3, m4
966
%endif
967
movnta [r0+r6-4*mmsize], m0
968
movnta [r0+r6-3*mmsize], m1
969
movnta [r0+r6-2*mmsize], m2
970
movnta [r0+r6-1*mmsize], m3
971
add r6, 4*mmsize
972
jle .loopx
973
.skip:
974
PREFETCHNT_ITER r2+r6, 4*mmsize
975
sub r6, 4*mmsize
976
jz .end
977
.loop_end:
978
movu m0, [r2+r6]
979
%if %1
980
pshufb m0, m4
981
%endif
982
movnta [r0+r6], m0
983
add r6, mmsize
984
jl .loop_end
985
.end:
986
add r0, r1
987
add r2, r3
988
dec r5d
989
jg .loopy
990
sfence
991
RET
992
%endmacro
993
994
INIT_XMM sse
995
PLANE_COPY_CORE 0
996
INIT_XMM ssse3
997
PLANE_COPY_CORE 1
998
INIT_YMM avx
999
PLANE_COPY_CORE 0
1000
INIT_YMM avx2
1001
PLANE_COPY_CORE 1
1002
1003
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
1004
%if HIGH_BIT_DEPTH
1005
%assign x 0
1006
%rep 16/mmsize
1007
mov%4 m0, [%2+(x/2)*mmsize]
1008
mov%4 m1, [%3+(x/2)*mmsize]
1009
punpckhwd m2, m0, m1
1010
punpcklwd m0, m1
1011
mov%5a [%1+(x+0)*mmsize], m0
1012
mov%5a [%1+(x+1)*mmsize], m2
1013
%assign x (x+2)
1014
%endrep
1015
%else
1016
movq m0, [%2]
1017
%if mmsize==16
1018
%ifidn %4, a
1019
punpcklbw m0, [%3]
1020
%else
1021
movq m1, [%3]
1022
punpcklbw m0, m1
1023
%endif
1024
mov%5a [%1], m0
1025
%else
1026
movq m1, [%3]
1027
punpckhbw m2, m0, m1
1028
punpcklbw m0, m1
1029
mov%5a [%1+0], m0
1030
mov%5a [%1+8], m2
1031
%endif
1032
%endif ; HIGH_BIT_DEPTH
1033
%endmacro
1034
1035
%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
1036
%if HIGH_BIT_DEPTH
1037
%assign n 0
1038
%rep 16/mmsize
1039
mova m0, [%3+(n+0)*mmsize]
1040
mova m1, [%3+(n+1)*mmsize]
1041
psrld m2, m0, 16
1042
psrld m3, m1, 16
1043
pand m0, %5
1044
pand m1, %5
1045
packssdw m0, m1
1046
packssdw m2, m3
1047
mov%6 [%1+(n/2)*mmsize], m0
1048
mov%6 [%2+(n/2)*mmsize], m2
1049
%assign n (n+2)
1050
%endrep
1051
%else ; !HIGH_BIT_DEPTH
1052
%if mmsize==16
1053
mova m0, [%3]
1054
%if cpuflag(ssse3)
1055
pshufb m0, %5
1056
%else
1057
mova m1, m0
1058
pand m0, %5
1059
psrlw m1, 8
1060
packuswb m0, m1
1061
%endif
1062
%if %4
1063
mova [%1], m0
1064
%else
1065
movq [%1], m0
1066
movhps [%2], m0
1067
%endif
1068
%else
1069
mova m0, [%3]
1070
mova m1, [%3+8]
1071
mova m2, m0
1072
mova m3, m1
1073
pand m0, %5
1074
pand m1, %5
1075
psrlw m2, 8
1076
psrlw m3, 8
1077
packuswb m0, m1
1078
packuswb m2, m3
1079
mova [%1], m0
1080
mova [%2], m2
1081
%endif ; mmsize == 16
1082
%endif ; HIGH_BIT_DEPTH
1083
%endmacro
1084
1085
%macro PLANE_INTERLEAVE 0
1086
;-----------------------------------------------------------------------------
1087
; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
1088
; uint8_t *srcu, intptr_t i_srcu,
1089
; uint8_t *srcv, intptr_t i_srcv, int w, int h )
1090
;-----------------------------------------------------------------------------
1091
; assumes i_dst and w are multiples of 16, and i_dst>2*w
1092
cglobal plane_copy_interleave_core, 6,9
1093
mov r6d, r6m
1094
%if HIGH_BIT_DEPTH
1095
FIX_STRIDES r1, r3, r5, r6d
1096
movifnidn r1mp, r1
1097
movifnidn r3mp, r3
1098
mov r6m, r6d
1099
%endif
1100
lea r0, [r0+r6*2]
1101
add r2, r6
1102
add r4, r6
1103
%if ARCH_X86_64
1104
DECLARE_REG_TMP 7,8
1105
%else
1106
DECLARE_REG_TMP 1,3
1107
%endif
1108
mov t1, r1
1109
shr t1, SIZEOF_PIXEL
1110
sub t1, r6
1111
mov t0d, r7m
1112
.loopy:
1113
mov r6d, r6m
1114
neg r6
1115
.prefetch:
1116
prefetchnta [r2+r6]
1117
prefetchnta [r4+r6]
1118
add r6, 64
1119
jl .prefetch
1120
mov r6d, r6m
1121
neg r6
1122
.loopx:
1123
INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
1124
INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
1125
add r6, 16*SIZEOF_PIXEL
1126
jl .loopx
1127
.pad:
1128
%assign n 0
1129
%rep SIZEOF_PIXEL
1130
%if mmsize==8
1131
movntq [r0+r6*2+(n+ 0)], m0
1132
movntq [r0+r6*2+(n+ 8)], m0
1133
movntq [r0+r6*2+(n+16)], m0
1134
movntq [r0+r6*2+(n+24)], m0
1135
%else
1136
movntdq [r0+r6*2+(n+ 0)], m0
1137
movntdq [r0+r6*2+(n+16)], m0
1138
%endif
1139
%assign n n+32
1140
%endrep
1141
add r6, 16*SIZEOF_PIXEL
1142
cmp r6, t1
1143
jl .pad
1144
add r0, r1mp
1145
add r2, r3mp
1146
add r4, r5
1147
dec t0d
1148
jg .loopy
1149
sfence
1150
emms
1151
RET
1152
1153
;-----------------------------------------------------------------------------
1154
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
1155
;-----------------------------------------------------------------------------
1156
cglobal store_interleave_chroma, 5,5
1157
FIX_STRIDES r1
1158
.loop:
1159
INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
1160
INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
1161
add r2, FDEC_STRIDEB*2
1162
add r3, FDEC_STRIDEB*2
1163
lea r0, [r0+r1*2]
1164
sub r4d, 2
1165
jg .loop
1166
RET
1167
%endmacro ; PLANE_INTERLEAVE
1168
1169
%macro DEINTERLEAVE_START 0
1170
%if HIGH_BIT_DEPTH
1171
mova m4, [pd_ffff]
1172
%elif cpuflag(ssse3)
1173
mova m4, [deinterleave_shuf]
1174
%else
1175
mova m4, [pw_00ff]
1176
%endif ; HIGH_BIT_DEPTH
1177
%endmacro
1178
1179
%macro PLANE_DEINTERLEAVE 0
1180
;-----------------------------------------------------------------------------
1181
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
1182
; pixel *dstv, intptr_t i_dstv,
1183
; pixel *src, intptr_t i_src, int w, int h )
1184
;-----------------------------------------------------------------------------
1185
cglobal plane_copy_deinterleave, 6,7
1186
DEINTERLEAVE_START
1187
mov r6d, r6m
1188
FIX_STRIDES r1, r3, r5, r6d
1189
%if HIGH_BIT_DEPTH
1190
mov r6m, r6d
1191
%endif
1192
add r0, r6
1193
add r2, r6
1194
lea r4, [r4+r6*2]
1195
.loopy:
1196
mov r6d, r6m
1197
neg r6
1198
.loopx:
1199
DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
1200
DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
1201
add r6, 16*SIZEOF_PIXEL
1202
jl .loopx
1203
add r0, r1
1204
add r2, r3
1205
add r4, r5
1206
dec dword r7m
1207
jg .loopy
1208
RET
1209
1210
;-----------------------------------------------------------------------------
1211
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
1212
;-----------------------------------------------------------------------------
1213
cglobal load_deinterleave_chroma_fenc, 4,4
1214
DEINTERLEAVE_START
1215
FIX_STRIDES r2
1216
.loop:
1217
DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
1218
DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
1219
add r0, FENC_STRIDEB*2
1220
lea r1, [r1+r2*2]
1221
sub r3d, 2
1222
jg .loop
1223
RET
1224
1225
;-----------------------------------------------------------------------------
1226
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
1227
;-----------------------------------------------------------------------------
1228
cglobal load_deinterleave_chroma_fdec, 4,4
1229
DEINTERLEAVE_START
1230
FIX_STRIDES r2
1231
.loop:
1232
DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
1233
DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
1234
add r0, FDEC_STRIDEB*2
1235
lea r1, [r1+r2*2]
1236
sub r3d, 2
1237
jg .loop
1238
RET
1239
%endmacro ; PLANE_DEINTERLEAVE
1240
1241
;-----------------------------------------------------------------------------
; Inner loops for plane_copy_deinterleave_rgb: split packed BGR (pw=3) or
; BGRA (pw=4) pixels into three separate B/G/R planes, 8 pixels per x step.
; %1 = pw (3 or 4); %2-%4 = dst strides; %5 = src stride; %6 = negated width
; (counts up to 0); %7 = height counter reg; %8/%9 = scratch (src ptr, x).
; Expects r0/r2/r4 = plane dst ptrs (pre-advanced past row end), r6 = src.
;-----------------------------------------------------------------------------
%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
%if cpuflag(ssse3)
    mova        m3, [deinterleave_rgb_shuf+(%1-3)*16]
%endif
%%loopy:
    mov         %8, r6
    mov         %9, %6
%%loopx:
    movu        m0, [%8]
    movu        m1, [%8+%1*mmsize/4]
%if cpuflag(ssse3)
    pshufb      m0, m3        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    pshufb      m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
%elif %1 == 3
    ; SSE2 path for 3-byte-per-pixel BGR: regroup via shifts and unpacks
    psrldq      m2, m0, 6
    punpcklqdq  m0, m1        ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5
    psrldq      m1, 6
    punpcklqdq  m2, m1        ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7
    psrlq       m3, m0, 24
    psrlq       m4, m2, 24
    punpckhbw   m1, m0, m3    ; b4 b5 g4 g5 r4 r5
    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
    punpckhbw   m3, m2, m4    ; b6 b7 g6 g7 r6 r7
    punpcklbw   m2, m4        ; b2 b3 g2 g3 r2 r3
    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
%else
    ; SSE2 path for 4-byte-per-pixel BGRA
    pshufd      m3, m0, q2301
    pshufd      m4, m1, q2301
    punpckhbw   m2, m0, m3    ; b2 b3 g2 g3 r2 r3
    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
    punpckhbw   m3, m1, m4    ; b6 b7 g6 g7 r6 r7
    punpcklbw   m1, m4        ; b4 b5 g4 g5 r4 r5
    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
%endif
    punpckldq   m2, m0, m1    ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
    punpckhdq   m0, m1        ; r0 r1 r2 r3 r4 r5 r6 r7
    ; store 8 bytes to each of the three planes (x offset %9 is negative)
    movh        [r0+%9], m2
    movhps      [r2+%9], m2
    movh        [r4+%9], m0
    add         %8, %1*mmsize/2
    add         %9, mmsize/2
    jl %%loopx
    ; advance all four row pointers and loop over height
    add         r0, %2
    add         r2, %3
    add         r4, %4
    add         r6, %5
    dec         %7d
    jg %%loopy
%endmacro
%macro PLANE_DEINTERLEAVE_RGB 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
;                                        pixel *dstb, intptr_t i_dstb,
;                                        pixel *dstc, intptr_t i_dstc,
;                                        pixel *src,  intptr_t i_src, int pw, int w, int h )
; Dispatches to PLANE_DEINTERLEAVE_RGB_CORE with pw=3 (BGR) or pw=4 (BGRA).
; Dst pointers are advanced to the end of each row; the x offset (negated
; width) counts up toward zero inside the core loop.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal plane_copy_deinterleave_rgb, 8,12
    %define %%args r1, r3, r5, r7, r8, r9, r10, r11
    mov        r8d, r9m       ; w
    mov        r9d, r10m      ; h
    add         r0, r8
    add         r2, r8
    add         r4, r8
    neg         r8            ; negated width for the core loop
%else
cglobal plane_copy_deinterleave_rgb, 1,7
    ; x86_32 has too few registers: strides stay in their stack arg slots
    %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
    mov         r1, r9m       ; w
    mov         r2, r2m
    mov         r4, r4m
    mov         r6, r6m
    add         r0, r1
    add         r2, r1
    add         r4, r1
    neg         r1
    mov        r9m, r1        ; spill negated width (reused each row)
    mov         r1, r10m      ; h
%endif
    cmp  dword r8m, 4
    je .pw4
    PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
    jmp .ret
.pw4:
    PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA
.ret:
    REP_RET
%endmacro

%if HIGH_BIT_DEPTH == 0
INIT_XMM sse2
PLANE_DEINTERLEAVE_RGB
INIT_XMM ssse3
PLANE_DEINTERLEAVE_RGB
%endif ; !HIGH_BIT_DEPTH
%macro PLANE_DEINTERLEAVE_V210 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
;                                         uint16_t *dstc, intptr_t i_dstc,
;                                         uint32_t *src, intptr_t i_src, int w, int h )
; Unpacks v210 (10-bit 4:2:2, three samples per 32-bit word) into separate
; luma and interleaved-chroma planes using mask/shuffle/pmulhrsw.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal plane_copy_deinterleave_v210, 8,10,7
%define src   r8
%define org_w r9
%define h     r7d
%else
cglobal plane_copy_deinterleave_v210, 7,7,7
%define src   r4m
%define org_w r6m
%define h     dword r7m
%endif
    FIX_STRIDES r1, r3, r6d
    shl         r5, 2         ; src stride is in 32-bit words
    add         r0, r6
    add         r2, r6
    neg         r6            ; width offset counts up to zero
    mov        src, r4
    mov      org_w, r6
    mova        m2, [v210_mask]
    mova        m3, [v210_luma_shuf]
    mova        m4, [v210_chroma_shuf]
    mova        m5, [v210_mult]      ; also functions as vpermd index for avx2
    pshufd      m6, m5, q1102

ALIGN 16
.loop:
    movu        m1, [r4]
    pandn       m0, m2, m1    ; select luma bitfields
    pand        m1, m2        ; select chroma bitfields
    pshufb      m0, m3
    pshufb      m1, m4
    pmulhrsw    m0, m5        ; y0 y1 y2 y3 y4 y5 __ __
    pmulhrsw    m1, m6        ; u0 v0 u1 v1 u2 v2 __ __
%if mmsize == 32
    ; avx2: compact the per-lane results into contiguous output
    vpermd      m0, m5, m0
    vpermd      m1, m5, m1
%endif
    movu   [r0+r6], m0
    movu   [r2+r6], m1
    add         r4, mmsize
    add         r6, 3*mmsize/4 ; 6 luma + 6 chroma samples per mmsize of input
    jl .loop
    add         r0, r1
    add         r2, r3
    add        src, r5
    mov         r4, src
    mov         r6, org_w
    dec         h
    jg .loop
    RET
%endmacro ; PLANE_DEINTERLEAVE_V210

%if HIGH_BIT_DEPTH
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_MMX mmx
PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM ssse3
PLANE_DEINTERLEAVE_V210
INIT_XMM avx
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
PLANE_DEINTERLEAVE_V210
%else
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_MMX mmx
PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM ssse3
PLANE_DEINTERLEAVE
%endif
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
; Copies backwards from the end; peels off a 16-byte then a 2*mmsize chunk so
; the main loop can always copy 4*mmsize per iteration.
;-----------------------------------------------------------------------------
%macro MEMCPY 0
cglobal memcpy_aligned, 3,3
%if mmsize == 16
    ; peel an odd 16-byte tail (size may be mod 16 but not mod 32)
    test r2d, 16
    jz .copy2
    mova  m0, [r1+r2-16]
    mova [r0+r2-16], m0
    sub  r2d, 16
.copy2:
%endif
    ; peel an odd 2*mmsize chunk so the remainder is a multiple of 4*mmsize
    test r2d, 2*mmsize
    jz .copy4start
    mova  m0, [r1+r2-1*mmsize]
    mova  m1, [r1+r2-2*mmsize]
    mova [r0+r2-1*mmsize], m0
    mova [r0+r2-2*mmsize], m1
    sub  r2d, 2*mmsize
.copy4start:
    test r2d, r2d
    jz .ret
.copy4:
    ; main loop: 4 registers per iteration
    mova  m0, [r1+r2-1*mmsize]
    mova  m1, [r1+r2-2*mmsize]
    mova  m2, [r1+r2-3*mmsize]
    mova  m3, [r1+r2-4*mmsize]
    mova [r0+r2-1*mmsize], m0
    mova [r0+r2-2*mmsize], m1
    mova [r0+r2-3*mmsize], m2
    mova [r0+r2-4*mmsize], m3
    sub  r2d, 4*mmsize
    jg .copy4
.ret:
    REP_RET
%endmacro

INIT_MMX mmx
MEMCPY
INIT_XMM sse
MEMCPY
1473
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
; %1 = number of stores unrolled per loop iteration (n must be a multiple of
; %1*mmsize — see the non-general-use note above).
;-----------------------------------------------------------------------------
%macro MEMZERO 1
cglobal memzero_aligned, 2,2
    add  r0, r1
    neg  r1                   ; offset counts up from -n to 0
%if mmsize == 8
    pxor m0, m0
%else
    xorps m0, m0
%endif
.loop:
%assign i 0
%rep %1
    mova [r0 + r1 + i], m0
%assign i i+mmsize
%endrep
    add r1, mmsize*%1
    jl .loop
    RET
%endmacro

INIT_MMX mmx
MEMZERO 8
INIT_XMM sse
MEMZERO 8
INIT_YMM avx
MEMZERO 4
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
; Accumulates 4-wide horizontal sums of pixels into the running integral row,
; using mpsadbw against a zero register to form the 4-tap sums.
;-----------------------------------------------------------------------------
%macro INTEGRAL_INIT4H 0
cglobal integral_init4h, 3,4
    lea     r3, [r0+r2*2]     ; output row (sum is uint16_t, hence *2)
    add     r1, r2
    neg     r2                ; offset counts up to zero
    pxor    m4, m4            ; zero reference for mpsadbw
.loop:
    mova   xm0, [r1+r2]
    mova   xm1, [r1+r2+16]
%if mmsize==32
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
%else
    palignr m1, m0, 8
%endif
    mpsadbw m0, m4, 0         ; 4-pixel sliding sums vs zero
    mpsadbw m1, m4, 0
    paddw   m0, [r0+r2*2]     ; add previous row's integral
    paddw   m1, [r0+r2*2+mmsize]
    mova  [r3+r2*2   ], m0
    mova  [r3+r2*2+mmsize], m1
    add     r2, mmsize
    jl .loop
    RET
%endmacro

INIT_XMM sse4
INTEGRAL_INIT4H
INIT_YMM avx2
INTEGRAL_INIT4H

; Same as integral_init4h but produces 8-wide sums: two mpsadbw results with
; different block offsets are summed to make an 8-tap window.
%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    lea     r3, [r0+r2*2]
    add     r1, r2
    neg     r2
    pxor    m4, m4
.loop:
    mova   xm0, [r1+r2]
    mova   xm1, [r1+r2+16]
%if mmsize==32
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
    mpsadbw m2, m0, m4, 100100b
    mpsadbw m3, m1, m4, 100100b
%else
    palignr m1, m0, 8
    mpsadbw m2, m0, m4, 100b
    mpsadbw m3, m1, m4, 100b
%endif
    mpsadbw m0, m4, 0
    mpsadbw m1, m4, 0
    paddw   m0, [r0+r2*2]
    paddw   m1, [r0+r2*2+mmsize]
    paddw   m0, m2            ; combine the two 4-wide windows into 8-wide
    paddw   m1, m3
    mova  [r3+r2*2   ], m0
    mova  [r3+r2*2+mmsize], m1
    add     r2, mmsize
    jl .loop
    RET
%endmacro

INIT_XMM sse4
INTEGRAL_INIT8H
INIT_XMM avx
INTEGRAL_INIT8H
INIT_YMM avx2
INTEGRAL_INIT8H
%endif ; !HIGH_BIT_DEPTH
1578
%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
; In place: sum8[x] = sum8[x + 8*stride] - sum8[x], turning row sums into
; 8-row vertical differences.
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
    add   r1, r1              ; stride in bytes (elements are uint16_t)
    add   r0, r1
    lea   r2, [r0+r1*8]       ; row 8 rows below
    neg   r1                  ; offset counts up to zero
.loop:
    mova  m0, [r2+r1]
    mova  m1, [r2+r1+mmsize]
    psubw m0, [r0+r1]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1], m0
    mova  [r0+r1+mmsize], m1
    add   r1, 2*mmsize
    jl .loop
    RET
%endmacro

INIT_MMX mmx
INTEGRAL_INIT_8V
INIT_XMM sse2
INTEGRAL_INIT_8V
INIT_YMM avx2
INTEGRAL_INIT_8V
;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
; Computes 4-row vertical differences (into sum4) and 8-row differences
; (in place into sum8). MMX version: walks backwards 8 bytes at a time.
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal integral_init4v, 3,5
    shl   r2, 1               ; stride in bytes
    lea   r3, [r0+r2*4]       ; 4 rows below
    lea   r4, [r0+r2*8]       ; 8 rows below
    mova  m0, [r0+r2]
    mova  m4, [r4+r2]
.loop:
    mova  m1, m4
    psubw m1, m0
    mova  m4, [r4+r2-8]
    mova  m0, [r0+r2-8]
    paddw m1, m4
    mova  m3, [r3+r2-8]
    psubw m1, m0              ; 8-row difference
    psubw m3, m0              ; 4-row difference
    mova  [r0+r2-8], m1
    mova  [r1+r2-8], m3
    sub   r2, 8
    jge .loop
    RET
; sse2 integral_init4v: same contract as the mmx version above, but iterates
; forward with negative offsets and uses shufpd to access the x+8 columns.
INIT_XMM sse2
cglobal integral_init4v, 3,5
    shl     r2, 1
    add     r0, r2
    add     r1, r2
    lea     r3, [r0+r2*4]
    lea     r4, [r0+r2*8]
    neg     r2
.loop:
    mova    m0, [r0+r2]
    mova    m1, [r4+r2]
    mova    m2, m0            ; keep unshifted copies for the subtraction
    mova    m4, m1
    shufpd  m0, [r0+r2+16], 1 ; m0 = row data shifted by 8 bytes
    shufpd  m1, [r4+r2+16], 1
    paddw   m0, m2            ; horizontal pair sums
    paddw   m1, m4
    mova    m3, [r3+r2]
    psubw   m1, m0            ; 8-row difference
    psubw   m3, m2            ; 4-row difference
    mova    [r0+r2], m1
    mova    [r1+r2], m3
    add     r2, 16
    jl .loop
    RET
; ssse3 integral_init4v: identical to sse2 but uses palignr instead of shufpd
; for the 8-byte-shifted loads.
INIT_XMM ssse3
cglobal integral_init4v, 3,5
    shl     r2, 1
    add     r0, r2
    add     r1, r2
    lea     r3, [r0+r2*4]
    lea     r4, [r0+r2*8]
    neg     r2
.loop:
    mova    m2, [r0+r2]
    mova    m0, [r0+r2+16]
    mova    m4, [r4+r2]
    mova    m1, [r4+r2+16]
    palignr m0, m2, 8         ; 8-byte-shifted view of the row
    palignr m1, m4, 8
    paddw   m0, m2
    paddw   m1, m4
    mova    m3, [r3+r2]
    psubw   m1, m0            ; 8-row difference
    psubw   m3, m2            ; 4-row difference
    mova    [r0+r2], m1
    mova    [r1+r2], m3
    add     r2, 16
    jl .loop
    RET
; avx2 integral_init4v: 32 bytes per iteration; uses unaligned +8 loads
; instead of register shifts for the shifted view.
INIT_YMM avx2
cglobal integral_init4v, 3,5
    add     r2, r2
    add     r0, r2
    add     r1, r2
    lea     r3, [r0+r2*4]
    lea     r4, [r0+r2*8]
    neg     r2
.loop:
    mova    m2, [r0+r2]
    movu    m1, [r4+r2+8]
    paddw   m0, m2, [r0+r2+8]
    paddw   m1, [r4+r2]
    mova    m3, [r3+r2]
    psubw   m1, m0            ; 8-row difference
    psubw   m3, m2            ; 4-row difference
    mova    [r0+r2], m1
    mova    [r1+r2], m3
    add     r2, 32
    jl .loop
    RET
; FILT8x4: lowres downscale filter helper (8-bit). Averages 2x2 pixel blocks
; for two output rows at once. %1/%2 = outputs, %3/%4 = temporaries,
; %5/%6 = odd-byte outputs (non-xop only), %7 = source x offset.
; On xop the even/odd split is deferred to vpperm in the caller, so the
; %else branch additionally separates even/odd bytes here.
%macro FILT8x4 7
    mova      %3, [r0+%7]
    mova      %4, [r0+r5+%7]
    pavgb     %3, %4
    pavgb     %4, [r0+r5*2+%7]
    PALIGNR   %1, %3, 1, m6
    PALIGNR   %2, %4, 1, m6
%if cpuflag(xop)
    pavgb     %1, %3
    pavgb     %2, %4
%else
    pavgb     %1, %3
    pavgb     %2, %4
    psrlw     %5, %1, 8       ; odd bytes
    psrlw     %6, %2, 8
    pand      %1, m7          ; even bytes (m7 = 0x00ff mask)
    pand      %2, m7
%endif
%endmacro
; FILT32x4U: avx2 lowres filter. Produces one 32-byte row for each of the four
; lowres planes (%1-%4) from unaligned source reads; pshufb m7 deinterleaves
; even/odd bytes and vpermq fixes up the lane order.
%macro FILT32x4U 4
    mova      m1, [r0+r5]
    pavgb     m0, m1, [r0]
    movu      m3, [r0+r5+1]
    pavgb     m2, m3, [r0+1]
    pavgb     m1, [r0+r5*2]
    pavgb     m3, [r0+r5*2+1]
    pavgb     m0, m2
    pavgb     m1, m3

    mova      m3, [r0+r5+mmsize]
    pavgb     m2, m3, [r0+mmsize]
    movu      m5, [r0+r5+1+mmsize]
    pavgb     m4, m5, [r0+1+mmsize]
    pavgb     m3, [r0+r5*2+mmsize]
    pavgb     m5, [r0+r5*2+1+mmsize]
    pavgb     m2, m4
    pavgb     m3, m5

    pshufb    m0, m7          ; group even/odd bytes within each lane
    pshufb    m1, m7
    pshufb    m2, m7
    pshufb    m3, m7
    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq    m0, m0, q3120   ; undo the cross-lane interleave
    vpermq    m1, m4, q3120
    vpermq    m2, m2, q3120
    vpermq    m3, m5, q3120
    mova    [%1], m0
    mova    [%2], m1
    mova    [%3], m2
    mova    [%4], m3
%endmacro
; FILT16x2: sse 8-bit lowres filter, two planes per call. %1 = carried
; register holding the previous column's data, %2/%3 = dst ptrs, %4 = src
; x offset. m7 = 0x00ff mask (or vpperm selector on xop).
%macro FILT16x2 4
    mova      m3, [r0+%4+mmsize]
    mova      m2, [r0+%4]
    pavgb     m3, [r0+%4+r5+mmsize]
    pavgb     m2, [r0+%4+r5]
    PALIGNR   %1, m3, 1, m6
    pavgb     %1, m3
    PALIGNR   m3, m2, 1, m6
    pavgb     m3, m2
%if cpuflag(xop)
    vpperm    m5, m3, %1, m7  ; odd bytes of both halves
    vpperm    m3, m3, %1, m6  ; even bytes of both halves
%else
    psrlw     m5, m3, 8       ; odd bytes
    psrlw     m4, %1, 8
    pand      m3, m7          ; even bytes
    pand      %1, m7
    packuswb  m3, %1
    packuswb  m5, m4
%endif
    mova    [%2], m3
    mova    [%3], m5
    mova      %1, m2          ; carry left column into the next iteration
%endmacro
; FILT8x2U: mmx2 8-bit lowres filter using unaligned +1 loads instead of
; palignr. %1/%2 = dst ptrs, %3 = src x offset. m7 = 0x00ff mask.
%macro FILT8x2U 3
    mova      m3, [r0+%3+8]
    mova      m2, [r0+%3]
    pavgb     m3, [r0+%3+r5+8]
    pavgb     m2, [r0+%3+r5]
    mova      m1, [r0+%3+9]
    mova      m0, [r0+%3+1]
    pavgb     m1, [r0+%3+r5+9]
    pavgb     m0, [r0+%3+r5+1]
    pavgb     m1, m3
    pavgb     m0, m2
    psrlw     m3, m1, 8       ; odd bytes
    psrlw     m2, m0, 8
    pand      m1, m7          ; even bytes
    pand      m0, m7
    packuswb  m0, m1
    packuswb  m2, m3
    mova    [%1], m0
    mova    [%2], m2
%endmacro
; FILT8xU: mmx2 16-bit (HIGH_BIT_DEPTH) lowres filter; word-granular version
; of FILT8x2U. %1/%2 = dst ptrs, %3 = src x offset. m7 = 0x0000ffff mask.
%macro FILT8xU 3
    mova      m3, [r0+%3+8]
    mova      m2, [r0+%3]
    pavgw     m3, [r0+%3+r5+8]
    pavgw     m2, [r0+%3+r5]
    movu      m1, [r0+%3+10]
    movu      m0, [r0+%3+2]
    pavgw     m1, [r0+%3+r5+10]
    pavgw     m0, [r0+%3+r5+2]
    pavgw     m1, m3
    pavgw     m0, m2
    psrld     m3, m1, 16      ; odd words
    psrld     m2, m0, 16
    pand      m1, m7          ; even words
    pand      m0, m7
    packssdw  m0, m1
    packssdw  m2, m3
    movu    [%1], m0
    mova    [%2], m2
%endmacro
; FILT8xA: sse 16-bit (HIGH_BIT_DEPTH) lowres filter; word-granular version
; of FILT16x2. %1 = carried register, %2/%3 = dst ptrs, %4 = src x offset.
%macro FILT8xA 4
    mova      m3, [r0+%4+mmsize]
    mova      m2, [r0+%4]
    pavgw     m3, [r0+%4+r5+mmsize]
    pavgw     m2, [r0+%4+r5]
    PALIGNR   %1, m3, 2, m6
    pavgw     %1, m3
    PALIGNR   m3, m2, 2, m6
    pavgw     m3, m2
%if cpuflag(xop)
    vpperm    m5, m3, %1, m7  ; odd words
    vpperm    m3, m3, %1, m6  ; even words
%else
    psrld     m5, m3, 16      ; odd words
    psrld     m4, %1, 16
    pand      m3, m7          ; even words
    pand      %1, m7
    packssdw  m3, %1
    packssdw  m5, m4
%endif
    mova    [%2], m3
    mova    [%3], m5
    mova      %1, m2          ; carry left column into the next iteration
%endmacro
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
; Builds the four half-pel-shifted lowres planes used by lookahead. All
; pointers are advanced past the last row/column and the loops walk backwards.
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%if HIGH_BIT_DEPTH
    shl   dword r6m, 1
    FIX_STRIDES r5
    shl   dword r7m, 1
%endif
%if mmsize >= 16
    ; round width up to a multiple of mmsize
    add   dword r7m, mmsize-1
    and   dword r7m, ~(mmsize-1)
%endif
    ; src += 2*(height-1)*stride + 2*width
    mov      r6d, r8m
    dec      r6d
    imul     r6d, r5d
    add      r6d, r7m
    lea       r0, [r0+r6*2]
    ; dst += (height-1)*stride + width
    mov      r6d, r8m
    dec      r6d
    imul     r6d, r6m
    add      r6d, r7m
    add       r1, r6
    add       r2, r6
    add       r3, r6
    add       r4, r6
    ; gap = stride - width
    mov      r6d, r6m
    sub      r6d, r7m
    PUSH      r6
    %define dst_gap [rsp+gprsize]
    mov      r6d, r5d
    sub      r6d, r7m
    shl      r6d, 1
    PUSH      r6
    %define src_gap [rsp]
%if HIGH_BIT_DEPTH
%if cpuflag(xop)
    mova      m6, [deinterleave_shuf32a]
    mova      m7, [deinterleave_shuf32b]
%else
    pcmpeqw   m7, m7
    psrld     m7, 16          ; m7 = 0x0000ffff word mask
%endif
.vloop:
    mov      r6d, r7m
%ifnidn cpuname, mmx2
    ; preload the rightmost column for the carried-register filters
    mova      m0, [r0]
    mova      m1, [r0+r5]
    pavgw     m0, m1
    pavgw     m1, [r0+r5*2]
%endif
.hloop:
    sub       r0, mmsize*2
    sub       r1, mmsize
    sub       r2, mmsize
    sub       r3, mmsize
    sub       r4, mmsize
%ifidn cpuname, mmx2
    FILT8xU r1, r2, 0
    FILT8xU r3, r4, r5
%else
    FILT8xA m0, r1, r2, 0
    FILT8xA m1, r3, r4, r5
%endif
    sub      r6d, mmsize
    jg .hloop
%else ; !HIGH_BIT_DEPTH
%if cpuflag(avx2)
    mova      m7, [deinterleave_shuf]
%elif cpuflag(xop)
    mova      m6, [deinterleave_shuf32a]
    mova      m7, [deinterleave_shuf32b]
%else
    pcmpeqb   m7, m7
    psrlw     m7, 8           ; m7 = 0x00ff byte mask
%endif
.vloop:
    mov      r6d, r7m
%ifnidn cpuname, mmx2
%if mmsize <= 16
    ; preload the rightmost column for the carried-register filters
    mova      m0, [r0]
    mova      m1, [r0+r5]
    pavgb     m0, m1
    pavgb     m1, [r0+r5*2]
%endif
%endif
.hloop:
    sub       r0, mmsize*2
    sub       r1, mmsize
    sub       r2, mmsize
    sub       r3, mmsize
    sub       r4, mmsize
%if mmsize==32
    FILT32x4U r1, r2, r3, r4
%elifdef m8
    ; x86_64: enough xmm regs to process both row pairs before packing
    FILT8x4   m0, m1, m2, m3, m10, m11, mmsize
    mova      m8, m0
    mova      m9, m1
    FILT8x4   m2, m3, m0, m1, m4, m5, 0
%if cpuflag(xop)
    vpperm    m4, m2, m8, m7
    vpperm    m2, m2, m8, m6
    vpperm    m5, m3, m9, m7
    vpperm    m3, m3, m9, m6
%else
    packuswb  m2, m8
    packuswb  m3, m9
    packuswb  m4, m10
    packuswb  m5, m11
%endif
    mova    [r1], m2
    mova    [r2], m4
    mova    [r3], m3
    mova    [r4], m5
%elifidn cpuname, mmx2
    FILT8x2U  r1, r2, 0
    FILT8x2U  r3, r4, r5
%else
    FILT16x2  m0, r1, r2, 0
    FILT16x2  m1, r3, r4, r5
%endif
    sub      r6d, mmsize
    jg .hloop
%endif ; HIGH_BIT_DEPTH
.skip:
    ; step all pointers back to the previous row
    mov       r6, dst_gap
    sub       r0, src_gap
    sub       r1, r6
    sub       r2, r6
    sub       r3, r6
    sub       r4, r6
    dec    dword r8m
    jg .vloop
    ADD      rsp, 2*gprsize
    emms
    RET
%endmacro ; FRAME_INIT_LOWRES

INIT_MMX mmx2
FRAME_INIT_LOWRES
%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2
FRAME_INIT_LOWRES
%endif
INIT_XMM sse2
FRAME_INIT_LOWRES
INIT_XMM ssse3
FRAME_INIT_LOWRES
INIT_XMM avx
FRAME_INIT_LOWRES
INIT_XMM xop
FRAME_INIT_LOWRES
%if HIGH_BIT_DEPTH==0
INIT_YMM avx2
FRAME_INIT_LOWRES
%endif
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
; Computes (prop + intra*invq*fps_factor>>8) * (intra - inter) / intra for
; 4 macroblocks per iteration, using rcpps + one Newton-Raphson step for
; the division.
;-----------------------------------------------------------------------------
%macro MBTREE 0
cglobal mbtree_propagate_cost, 6,6,7
    movss     m6, [r5]        ; fps_factor
    mov      r5d, r6m         ; len
    lea       r0, [r0+r5*2]
    add      r5d, r5d
    add       r1, r5
    add       r2, r5
    add       r3, r5
    add       r4, r5
    neg       r5              ; offset counts up to zero
    pxor      m4, m4
    shufps    m6, m6, 0       ; broadcast fps_factor
    mova      m5, [pw_3fff]
.loop:
    movq      m2, [r2+r5] ; intra
    movq      m0, [r4+r5] ; invq
    movq      m3, [r3+r5] ; inter
    movq      m1, [r1+r5] ; prop
    pand      m3, m5          ; mask out the lists_used bits
    pminsw    m3, m2          ; inter = min(inter, intra)
    punpcklwd m2, m4
    punpcklwd m0, m4
    pmaddwd   m0, m2          ; intra*invq
    punpcklwd m1, m4
    punpcklwd m3, m4
%if cpuflag(fma4)
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    fmaddps   m0, m0, m6, m1
    cvtdq2ps  m1, m2
    psubd     m2, m3
    cvtdq2ps  m2, m2
    rcpps     m3, m1
    mulps     m1, m3
    mulps     m0, m2
    addps     m2, m3, m3
    fnmaddps  m3, m1, m3, m2
    mulps     m0, m3
%else
    cvtdq2ps  m0, m0
    mulps     m0, m6    ; intra*invq*fps_factor>>8
    cvtdq2ps  m1, m1    ; prop
    addps     m0, m1    ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps  m1, m2    ; intra
    psubd     m2, m3    ; intra - inter
    cvtdq2ps  m2, m2    ; intra - inter
    rcpps     m3, m1    ; 1 / intra 1st approximation
    mulps     m1, m3    ; intra * (1/intra 1st approx)
    mulps     m1, m3    ; intra * (1/intra 1st approx)^2
    mulps     m0, m2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps     m3, m3    ; 2 * (1/intra 1st approx)
    subps     m3, m1    ; 2nd approximation for 1/intra
    mulps     m0, m3    ; / intra
%endif
    cvtps2dq  m0, m0
    packssdw  m0, m0
    movh [r0+r5], m0
    add       r5, 8
    jl .loop
    RET
%endmacro

INIT_XMM sse2
MBTREE
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
INIT_XMM fma4
MBTREE
; INT16_UNPACK: widen 8 packed words in xm%1 to 8 dwords spread across the
; full ymm register %1 (low 4 in lane 0, high 4 in lane 1). xm7 must be zero.
%macro INT16_UNPACK 1
    punpckhwd   xm4, xm%1, xm7
    punpcklwd   xm%1, xm7
    vinsertf128 m%1, m%1, xm4, 1
%endmacro
; FIXME: align loads to 16 bytes
; AVX/AVX2 versions of mbtree_propagate_cost: 8 macroblocks per iteration.
; Same math as MBTREE above (rcpps + one Newton-Raphson step for 1/intra).
%macro MBTREE_AVX 0
cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2)
    vbroadcastss m6, [r5]     ; fps_factor
    mov         r5d, r6m      ; len
    lea          r0, [r0+r5*2]
    add         r5d, r5d
    add          r1, r5
    add          r2, r5
    add          r3, r5
    add          r4, r5
    neg          r5           ; offset counts up to zero
    mova        xm5, [pw_3fff]
%if notcpuflag(avx2)
    pxor        xm7, xm7      ; zero for INT16_UNPACK
%endif
.loop:
%if cpuflag(avx2)
    pmovzxwd     m0, [r2+r5] ; intra
    pmovzxwd     m1, [r4+r5] ; invq
    pmovzxwd     m2, [r1+r5] ; prop
    pand        xm3, xm5, [r3+r5] ; inter
    pmovzxwd     m3, xm3
    pminsd       m3, m0       ; inter = min(inter, intra)
    pmaddwd      m1, m0       ; intra*invq
    psubd        m4, m0, m3   ; intra - inter
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    cvtdq2ps     m2, m2
    cvtdq2ps     m4, m4
    fmaddps      m1, m1, m6, m2 ; prop + (intra*invq*fps_factor>>8)
    rcpps        m3, m0       ; 1/intra 1st approximation
    mulps        m2, m0, m3
    mulps        m1, m4
    addps        m4, m3, m3
    fnmaddps     m4, m2, m3, m4 ; 2nd approximation for 1/intra
    mulps        m1, m4
%else
    movu        xm0, [r2+r5]
    movu        xm1, [r4+r5]
    movu        xm2, [r1+r5]
    pand        xm3, xm5, [r3+r5]
    pminsw      xm3, xm0
    INT16_UNPACK 0
    INT16_UNPACK 1
    INT16_UNPACK 2
    INT16_UNPACK 3
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    cvtdq2ps     m2, m2
    cvtdq2ps     m3, m3
    mulps        m1, m0
    subps        m4, m0, m3
    mulps        m1, m6     ; intra*invq*fps_factor>>8
    addps        m1, m2     ; prop + (intra*invq*fps_factor>>8)
    rcpps        m3, m0     ; 1 / intra 1st approximation
    mulps        m2, m0, m3 ; intra * (1/intra 1st approx)
    mulps        m2, m3     ; intra * (1/intra 1st approx)^2
    mulps        m1, m4     ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps        m3, m3     ; 2 * (1/intra 1st approx)
    subps        m3, m2     ; 2nd approximation for 1/intra
    mulps        m1, m3     ; / intra
%endif
    vcvtps2dq    m1, m1
    vextractf128 xm2, m1, 1
    packssdw    xm1, xm2
    mova    [r0+r5], xm1
    add          r5, 16
    jl .loop
    RET
%endmacro

INIT_YMM avx
MBTREE_AVX
INIT_YMM avx2
MBTREE_AVX
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
;                                      int16_t *output, int bipred_weight, int mb_y, int len )
; For each motion vector, computes the four target macroblock coordinates and
; the four bilinear propagation weights, scaled by propagate_amount (halved
; via bipred_weight when both lists are used).
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
    movh     m6, [pw_0to15] ; mb_x
    movd     m7, r5m
    pshuflw  m7, m7, 0
    punpcklwd m6, m7        ; 0 y 1 y 2 y 3 y
    movd     m7, r4m
    SPLATW   m7, m7         ; bipred_weight
    psllw    m7, 9          ; bipred_weight << 9

    mov     r5d, r6m        ; len
    xor     r4d, r4d
.loop:
    mova     m3, [r1+r4*2]
    movu     m4, [r2+r4*2]
    mova     m5, [pw_0xc000]
    pand     m4, m5
    pcmpeqw  m4, m5         ; mask: lists_used == 3 (biprediction)
    pmulhrsw m5, m3, m7     ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
%if cpuflag(avx)
    pblendvb m5, m3, m5, m4
%else
    pand     m5, m4
    pandn    m4, m3
    por      m5, m4         ; if( lists_used == 3 )
                            ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
%endif

    movu     m0, [r0+r4*4] ; x,y
    movu     m1, [r0+r4*4+mmsize]

    psraw    m2, m0, 5
    psraw    m3, m1, 5
    mova     m4, [pd_4]
    paddw    m2, m6         ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw    m6, m4         ; {mbx, mby} += {4, 0}
    paddw    m3, m6         ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw    m6, m4         ; {mbx, mby} += {4, 0}

    mova [r3+mmsize*0], m2
    mova [r3+mmsize*1], m3

    mova     m3, [pw_31]
    pand     m0, m3         ; x &= 31
    pand     m1, m3         ; y &= 31
    packuswb m0, m1
    psrlw    m1, m0, 3
    pand     m0, m3         ; x
    SWAP      1, 3
    pandn    m1, m3         ; y premultiplied by (1<<5) for later use of pmulhrsw

    mova     m3, [pw_32]
    psubw    m3, m0         ; 32 - x
    mova     m4, [pw_1024]
    psubw    m4, m1         ; (32 - y) << 5

    pmullw   m2, m3, m4     ; idx0weight = (32-y)*(32-x) << 5
    pmullw   m4, m0         ; idx1weight = (32-y)*x << 5
    pmullw   m0, m1         ; idx3weight = y*x << 5
    pmullw   m1, m3         ; idx2weight = y*(32-x) << 5

    ; avoid overflow in the input to pmulhrsw
    psrlw    m3, m2, 15
    psubw    m2, m3         ; idx0weight -= (idx0weight == 32768)

    pmulhrsw m2, m5         ; idx0weight * propagate_amount + 512 >> 10
    pmulhrsw m4, m5         ; idx1weight * propagate_amount + 512 >> 10
    pmulhrsw m1, m5         ; idx2weight * propagate_amount + 512 >> 10
    pmulhrsw m0, m5         ; idx3weight * propagate_amount + 512 >> 10

    SBUTTERFLY wd, 2, 4, 3
    SBUTTERFLY wd, 1, 0, 3
    mova [r3+mmsize*2], m2
    mova [r3+mmsize*3], m4
    mova [r3+mmsize*4], m1
    mova [r3+mmsize*5], m0
    add     r4d, mmsize/2
    add      r3, mmsize*6
    cmp     r4d, r5d
    jl .loop
    REP_RET
%endmacro

INIT_XMM ssse3
MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST