1
;*****************************************************************************
2
;* deblock-a.asm: x86 deblocking
3
;*****************************************************************************
4
;* Copyright (C) 2005-2016 x264 project
5
;*
6
;* Authors: Loren Merritt <lorenm@u.washington.edu>
7
;* Fiona Glaser <fiona@x264.com>
8
;* Oskar Arvidsson <oskar@irock.se>
9
;*
10
;* This program is free software; you can redistribute it and/or modify
11
;* it under the terms of the GNU General Public License as published by
12
;* the Free Software Foundation; either version 2 of the License, or
13
;* (at your option) any later version.
14
;*
15
;* This program is distributed in the hope that it will be useful,
16
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
;* GNU General Public License for more details.
19
;*
20
;* You should have received a copy of the GNU General Public License
21
;* along with this program; if not, write to the Free Software
22
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23
;*
24
;* This program is also available under a commercial proprietary license.
25
;* For more information, contact us at licensing@x264.com.
26
;*****************************************************************************
27
28
%include "x86inc.asm"
29
%include "x86util.asm"
30
31
SECTION_RODATA 32
32
33
load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15
34
insert_top_shuf: dd 0,1,4,5,7,2,3,6
35
transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
36
37
SECTION .text
38
39
cextern pb_0
40
cextern pb_1
41
cextern pb_3
42
cextern pb_a1
43
cextern pw_2
44
cextern pw_4
45
cextern pw_00ff
46
cextern pw_pixel_max
47
cextern pb_unpackbd1
48
49
%if HIGH_BIT_DEPTH
50
; out: %4 = |%1-%2|-%3
51
; clobbers: %5
52
%macro ABS_SUB 5
53
psubusw %5, %2, %1
54
psubusw %4, %1, %2
55
por %4, %5
56
psubw %4, %3
57
%endmacro
58
59
; out: %4 = |%1-%2|<%3
60
%macro DIFF_LT 5
61
psubusw %4, %2, %1
62
psubusw %5, %1, %2
63
por %5, %4 ; |%1-%2|
64
pxor %4, %4
65
psubw %5, %3 ; |%1-%2|-%3
66
pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
67
%endmacro
68
69
%macro LOAD_AB 4
70
movd %1, %3
71
movd %2, %4
72
SPLATW %1, %1
73
SPLATW %2, %2
74
%endmacro
75
76
; in: %2=tc reg
77
; out: %1=splatted tc
78
%macro LOAD_TC 2
79
%if mmsize == 8
80
pshufw %1, [%2-1], 0
81
%else
82
movd %1, [%2]
83
punpcklbw %1, %1
84
pshuflw %1, %1, q1100
85
pshufd %1, %1, q1100
86
%endif
87
psraw %1, 8
88
%endmacro
89
90
; in: %1=p1, %2=p0, %3=q0, %4=q1
91
; %5=alpha, %6=beta, %7-%9=tmp
92
; out: %7=mask
93
%macro LOAD_MASK 9
94
ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
95
ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
96
pand %8, %9
97
ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
98
pxor %7, %7
99
pand %8, %9
100
pcmpgtw %7, %8
101
%endmacro
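; Reference note (added for clarity): the mask built above is the standard
; filter-enable test, roughly the scalar condition
;   |p0-q0| < alpha  &&  |p1-p0| < beta  &&  |q1-q0| < beta
; with %7 set to all-ones words where the edge should be filtered.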
102
103
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
104
; out: %1=p0', %2=q0'
105
%macro DEBLOCK_P0_Q0 7
106
psubw %3, %4
107
pxor %7, %7
108
paddw %3, [pw_4]
109
psubw %7, %5
110
psubw %6, %2, %1
111
psllw %6, 2
112
paddw %3, %6
113
psraw %3, 3
114
mova %6, [pw_pixel_max]
115
CLIPW %3, %7, %5
116
pxor %7, %7
117
paddw %1, %3
118
psubw %2, %3
119
CLIPW %1, %7, %6
120
CLIPW %2, %7, %6
121
%endmacro
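; Reference sketch of the scalar logic the macro above implements (the normal
; bS<4 luma filter from the H.264 spec), with names as in the macro arguments:
;   delta = ((q0 - p0)*4 + (p1 - q1) + 4) >> 3
;   delta = clip3( -tc, tc, delta )
;   p0'   = clip( p0 + delta, 0, pixel_max )
;   q0'   = clip( q0 - delta, 0, pixel_max )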
122
123
; in: %1=p2 (or q2), %2=p1 (or q1), %3=p0, %4=q0, %5=mask&tc, %6=tmp
124
%macro LUMA_Q1 6
125
pavgw %6, %3, %4 ; (p0+q0+1)>>1
126
paddw %1, %6
127
pxor %6, %6
128
psraw %1, 1
129
psubw %6, %5
130
psubw %1, %2
131
CLIPW %1, %6, %5
132
paddw %1, %2
133
%endmacro
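; Reference note: with %1 = p2 (resp. q2) and %2 = p1 (resp. q1), the value written
; back to %1 is the spec's p1'/q1' update, roughly
;   p1' = p1 + clip3( -tc0, tc0, ((p2 + ((p0+q0+1)>>1)) >> 1) - p1 )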
134
135
%macro LUMA_DEBLOCK_ONE 3
136
DIFF_LT m5, %1, bm, m4, m6
137
pxor m6, m6
138
mova %3, m4
139
pcmpgtw m6, tcm
140
pand m4, tcm
141
pandn m6, m7
142
pand m4, m6
143
LUMA_Q1 m5, %2, m1, m2, m4, m6
144
%endmacro
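; Reference note: one LUMA_DEBLOCK_ONE call handles the p (or q) side of the p1/q1
; update in the high-bit-depth path.  m5 holds p2 (resp. q2); the unmasked
; |p2-p0| < beta result is saved to %3 as 0/-1 words so the caller can later subtract
; it from tc (acting as tc++ per side), and p1/q1 is filtered with LUMA_Q1 only where
; that condition, tc >= 0 and the main mask all hold.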
145
146
%macro LUMA_H_STORE 2
147
%if mmsize == 8
148
movq [r0-4], m0
149
movq [r0+r1-4], m1
150
movq [r0+r1*2-4], m2
151
movq [r0+%2-4], m3
152
%else
153
movq [r0-4], m0
154
movhps [r0+r1-4], m0
155
movq [r0+r1*2-4], m1
156
movhps [%1-4], m1
157
movq [%1+r1-4], m2
158
movhps [%1+r1*2-4], m2
159
movq [%1+%2-4], m3
160
movhps [%1+r1*4-4], m3
161
%endif
162
%endmacro
163
164
%macro DEBLOCK_LUMA 0
165
;-----------------------------------------------------------------------------
166
; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
167
;-----------------------------------------------------------------------------
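; Reference note on the high-bit-depth luma entry points below: pix points at the q0
; row, alpha/beta are the spec thresholds, and tc0 holds one threshold per group of
; four pixels along the edge.  "add r1, r1" converts the element stride into a byte
; stride for uint16_t pixels, and the loop walks the 16-pixel edge in 32/mmsize chunks.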
168
cglobal deblock_v_luma, 5,5,8,0-5*mmsize
169
%define tcm [rsp]
170
%define ms1 [rsp+mmsize]
171
%define ms2 [rsp+mmsize*2]
172
%define am [rsp+mmsize*3]
173
%define bm [rsp+mmsize*4]
174
add r1, r1
175
LOAD_AB m4, m5, r2d, r3d
176
mov r3, 32/mmsize
177
mov r2, r0
178
sub r0, r1
179
mova am, m4
180
sub r0, r1
181
mova bm, m5
182
sub r0, r1
183
.loop:
184
mova m0, [r0+r1]
185
mova m1, [r0+r1*2]
186
mova m2, [r2]
187
mova m3, [r2+r1]
188
189
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
190
LOAD_TC m6, r4
191
mova tcm, m6
192
193
mova m5, [r0]
194
LUMA_DEBLOCK_ONE m1, m0, ms1
195
mova [r0+r1], m5
196
197
mova m5, [r2+r1*2]
198
LUMA_DEBLOCK_ONE m2, m3, ms2
199
mova [r2+r1], m5
200
201
pxor m5, m5
202
mova m6, tcm
203
pcmpgtw m5, tcm
204
psubw m6, ms1
205
pandn m5, m7
206
psubw m6, ms2
207
pand m5, m6
208
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
209
mova [r0+r1*2], m1
210
mova [r2], m2
211
212
add r0, mmsize
213
add r2, mmsize
214
add r4, mmsize/8
215
dec r3
216
jg .loop
217
RET
218
219
cglobal deblock_h_luma, 5,6,8,0-7*mmsize
220
%define tcm [rsp]
221
%define ms1 [rsp+mmsize]
222
%define ms2 [rsp+mmsize*2]
223
%define p1m [rsp+mmsize*3]
224
%define p2m [rsp+mmsize*4]
225
%define am [rsp+mmsize*5]
226
%define bm [rsp+mmsize*6]
227
add r1, r1
228
LOAD_AB m4, m5, r2d, r3d
229
mov r3, r1
230
mova am, m4
231
add r3, r1
232
mov r5, 32/mmsize
233
mova bm, m5
234
add r3, r1
235
%if mmsize == 16
236
mov r2, r0
237
add r2, r3
238
%endif
239
.loop:
240
%if mmsize == 8
241
movq m2, [r0-8] ; y q2 q1 q0
242
movq m7, [r0+0]
243
movq m5, [r0+r1-8]
244
movq m3, [r0+r1+0]
245
movq m0, [r0+r1*2-8]
246
movq m6, [r0+r1*2+0]
247
movq m1, [r0+r3-8]
248
TRANSPOSE4x4W 2, 5, 0, 1, 4
249
SWAP 2, 7
250
movq m7, [r0+r3]
251
TRANSPOSE4x4W 2, 3, 6, 7, 4
252
%else
253
movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
254
movu m0, [r0+r1-8]
255
movu m2, [r0+r1*2-8]
256
movu m3, [r2-8]
257
TRANSPOSE4x4W 5, 0, 2, 3, 6
258
mova tcm, m3
259
260
movu m4, [r2+r1-8]
261
movu m1, [r2+r1*2-8]
262
movu m3, [r2+r3-8]
263
movu m7, [r2+r1*4-8]
264
TRANSPOSE4x4W 4, 1, 3, 7, 6
265
266
mova m6, tcm
267
punpcklqdq m6, m7
268
punpckhqdq m5, m4
269
SBUTTERFLY qdq, 0, 1, 7
270
SBUTTERFLY qdq, 2, 3, 7
271
%endif
272
273
mova p2m, m6
274
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
275
LOAD_TC m6, r4
276
mova tcm, m6
277
278
LUMA_DEBLOCK_ONE m1, m0, ms1
279
mova p1m, m5
280
281
mova m5, p2m
282
LUMA_DEBLOCK_ONE m2, m3, ms2
283
mova p2m, m5
284
285
pxor m5, m5
286
mova m6, tcm
287
pcmpgtw m5, tcm
288
psubw m6, ms1
289
pandn m5, m7
290
psubw m6, ms2
291
pand m5, m6
292
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
293
mova m0, p1m
294
mova m3, p2m
295
TRANSPOSE4x4W 0, 1, 2, 3, 4
296
LUMA_H_STORE r2, r3
297
298
add r4, mmsize/8
299
lea r0, [r0+r1*(mmsize/2)]
300
lea r2, [r2+r1*(mmsize/2)]
301
dec r5
302
jg .loop
303
RET
304
%endmacro
305
306
%if ARCH_X86_64
307
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
308
; m12=alpha, m13=beta
309
; out: m0=p1', m3=q1', m1=p0', m2=q0'
310
; clobbers: m4, m5, m6, m7, m10, m11, m14
311
%macro DEBLOCK_LUMA_INTER_SSE2 0
312
LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
313
LOAD_TC m6, r4
314
DIFF_LT m8, m1, m13, m10, m4
315
DIFF_LT m9, m2, m13, m11, m4
316
pand m6, m7
317
318
mova m14, m6
319
pxor m4, m4
320
pcmpgtw m6, m4
321
pand m6, m14
322
323
mova m5, m10
324
pand m5, m6
325
LUMA_Q1 m8, m0, m1, m2, m5, m4
326
327
mova m5, m11
328
pand m5, m6
329
LUMA_Q1 m9, m3, m1, m2, m5, m4
330
331
pxor m4, m4
332
psubw m6, m10
333
pcmpgtw m4, m14
334
pandn m4, m7
335
psubw m6, m11
336
pand m4, m6
337
DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
338
339
SWAP 0, 8
340
SWAP 3, 9
341
%endmacro
342
343
%macro DEBLOCK_LUMA_64 0
344
cglobal deblock_v_luma, 5,5,15
345
%define p2 m8
346
%define p1 m0
347
%define p0 m1
348
%define q0 m2
349
%define q1 m3
350
%define q2 m9
351
%define mask0 m7
352
%define mask1 m10
353
%define mask2 m11
354
add r1, r1
355
LOAD_AB m12, m13, r2d, r3d
356
mov r2, r0
357
sub r0, r1
358
sub r0, r1
359
sub r0, r1
360
mov r3, 2
361
.loop:
362
mova p2, [r0]
363
mova p1, [r0+r1]
364
mova p0, [r0+r1*2]
365
mova q0, [r2]
366
mova q1, [r2+r1]
367
mova q2, [r2+r1*2]
368
DEBLOCK_LUMA_INTER_SSE2
369
mova [r0+r1], p1
370
mova [r0+r1*2], p0
371
mova [r2], q0
372
mova [r2+r1], q1
373
add r0, mmsize
374
add r2, mmsize
375
add r4, 2
376
dec r3
377
jg .loop
378
RET
379
380
cglobal deblock_h_luma, 5,7,15
381
add r1, r1
382
LOAD_AB m12, m13, r2d, r3d
383
mov r2, r1
384
add r2, r1
385
add r2, r1
386
mov r5, r0
387
add r5, r2
388
mov r6, 2
389
.loop:
390
movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
391
movu m0, [r0+r1-8]
392
movu m2, [r0+r1*2-8]
393
movu m9, [r5-8]
394
movu m5, [r5+r1-8]
395
movu m1, [r5+r1*2-8]
396
movu m3, [r5+r2-8]
397
movu m7, [r5+r1*4-8]
398
399
TRANSPOSE4x4W 8, 0, 2, 9, 10
400
TRANSPOSE4x4W 5, 1, 3, 7, 10
401
402
punpckhqdq m8, m5
403
SBUTTERFLY qdq, 0, 1, 10
404
SBUTTERFLY qdq, 2, 3, 10
405
punpcklqdq m9, m7
406
407
DEBLOCK_LUMA_INTER_SSE2
408
409
TRANSPOSE4x4W 0, 1, 2, 3, 4
410
LUMA_H_STORE r5, r2
411
add r4, 2
412
lea r0, [r0+r1*8]
413
lea r5, [r5+r1*8]
414
dec r6
415
jg .loop
416
RET
417
%endmacro
418
419
INIT_XMM sse2
420
DEBLOCK_LUMA_64
421
INIT_XMM avx
422
DEBLOCK_LUMA_64
423
%endif
424
425
%macro SWAPMOVA 2
426
%ifid %1
427
SWAP %1, %2
428
%else
429
mova %1, %2
430
%endif
431
%endmacro
432
433
; in: t0-t2: tmp registers
434
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
435
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
436
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
437
%if ARCH_X86_64
438
paddw t0, %3, %2
439
mova t2, %4
440
paddw t2, %3
441
%else
442
mova t0, %3
443
mova t2, %4
444
paddw t0, %2
445
paddw t2, %3
446
%endif
447
paddw t0, %1
448
paddw t2, t2
449
paddw t0, %5
450
paddw t2, %9
451
paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
452
paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
453
454
psrlw t2, 3
455
psrlw t1, t0, 2
456
psubw t2, %3
457
psubw t1, %2
458
pand t2, %8
459
pand t1, %8
460
paddw t2, %3
461
paddw t1, %2
462
SWAPMOVA %11, t1
463
464
psubw t1, t0, %3
465
paddw t0, t0
466
psubw t1, %5
467
psubw t0, %3
468
paddw t1, %6
469
paddw t1, %2
470
paddw t0, %6
471
psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
472
psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
473
474
pxor t0, t1
475
pxor t1, %1
476
pand t0, %8
477
pand t1, %7
478
pxor t0, t1
479
pxor t0, %1
480
SWAPMOVA %10, t0
481
SWAPMOVA %12, t2
482
%endmacro
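; Reference sketch (scalar form of the strong intra filter computed above, per the
; H.264 spec); shown for the p side, the q side is symmetric:
;   if( mask1p )                               ; strong-filter condition
;       p0' = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3
;       p1' = ( p2 + p1 + p0 + q0 + 2 ) >> 2
;       p2' = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3
;   else if( mask0 )
;       p0' = ( 2*p1 + p0 + q1 + 2 ) >> 2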
483
484
%macro LUMA_INTRA_INIT 1
485
%define t0 m4
486
%define t1 m5
487
%define t2 m6
488
%define t3 m7
489
%assign i 4
490
%rep %1
491
CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
492
%assign i i+1
493
%endrep
494
add r1, r1
495
%endmacro
496
497
; in: %1-%3=tmp, %4=p2, %5=q2
498
%macro LUMA_INTRA_INTER 5
499
LOAD_AB t0, t1, r2d, r3d
500
mova %1, t0
501
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
502
%if ARCH_X86_64
503
mova %2, t0 ; mask0
504
psrlw t3, %1, 2
505
%else
506
mova t3, %1
507
mova %2, t0 ; mask0
508
psrlw t3, 2
509
%endif
510
paddw t3, [pw_2] ; alpha/4+2
511
DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
512
pand t2, %2
513
mova t3, %5 ; q2
514
mova %1, t2 ; mask1
515
DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
516
pand t2, %1
517
mova t3, %4 ; p2
518
mova %3, t2 ; mask1q
519
DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
520
pand t2, %1
521
mova %1, t2 ; mask1p
522
%endmacro
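; Reference note: after LUMA_INTRA_INTER, %2 (mask0) is the ordinary filter-enable
; mask and %1/%3 are the strong-filter masks, roughly
;   mask1p = mask0 && |p0-q0| < alpha/4+2 && |p2-p0| < beta
;   mask1q = mask0 && |p0-q0| < alpha/4+2 && |q2-q0| < beta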
523
524
%macro LUMA_H_INTRA_LOAD 0
525
%if mmsize == 8
526
movu t0, [r0-8]
527
movu t1, [r0+r1-8]
528
movu m0, [r0+r1*2-8]
529
movu m1, [r0+r4-8]
530
TRANSPOSE4x4W 4, 5, 0, 1, 2
531
mova t4, t0 ; p3
532
mova t5, t1 ; p2
533
534
movu m2, [r0]
535
movu m3, [r0+r1]
536
movu t0, [r0+r1*2]
537
movu t1, [r0+r4]
538
TRANSPOSE4x4W 2, 3, 4, 5, 6
539
mova t6, t0 ; q2
540
mova t7, t1 ; q3
541
%else
542
movu t0, [r0-8]
543
movu t1, [r0+r1-8]
544
movu m0, [r0+r1*2-8]
545
movu m1, [r0+r5-8]
546
movu m2, [r4-8]
547
movu m3, [r4+r1-8]
548
movu t2, [r4+r1*2-8]
549
movu t3, [r4+r5-8]
550
TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
551
mova t4, t0 ; p3
552
mova t5, t1 ; p2
553
mova t6, t2 ; q2
554
mova t7, t3 ; q3
555
%endif
556
%endmacro
557
558
; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
559
%macro LUMA_H_INTRA_STORE 9
560
%if mmsize == 8
561
TRANSPOSE4x4W %1, %2, %3, %4, %9
562
movq [r0-8], m%1
563
movq [r0+r1-8], m%2
564
movq [r0+r1*2-8], m%3
565
movq [r0+r4-8], m%4
566
movq m%1, %8
567
TRANSPOSE4x4W %5, %6, %7, %1, %9
568
movq [r0], m%5
569
movq [r0+r1], m%6
570
movq [r0+r1*2], m%7
571
movq [r0+r4], m%1
572
%else
573
TRANSPOSE2x4x4W %1, %2, %3, %4, %9
574
movq [r0-8], m%1
575
movq [r0+r1-8], m%2
576
movq [r0+r1*2-8], m%3
577
movq [r0+r5-8], m%4
578
movhps [r4-8], m%1
579
movhps [r4+r1-8], m%2
580
movhps [r4+r1*2-8], m%3
581
movhps [r4+r5-8], m%4
582
%ifnum %8
583
SWAP %1, %8
584
%else
585
mova m%1, %8
586
%endif
587
TRANSPOSE2x4x4W %5, %6, %7, %1, %9
588
movq [r0], m%5
589
movq [r0+r1], m%6
590
movq [r0+r1*2], m%7
591
movq [r0+r5], m%1
592
movhps [r4], m%5
593
movhps [r4+r1], m%6
594
movhps [r4+r1*2], m%7
595
movhps [r4+r5], m%1
596
%endif
597
%endmacro
598
599
%if ARCH_X86_64
600
;-----------------------------------------------------------------------------
601
; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
602
;-----------------------------------------------------------------------------
603
%macro DEBLOCK_LUMA_INTRA_64 0
604
cglobal deblock_v_luma_intra, 4,7,16
605
%define t0 m1
606
%define t1 m2
607
%define t2 m4
608
%define p2 m8
609
%define p1 m9
610
%define p0 m10
611
%define q0 m11
612
%define q1 m12
613
%define q2 m13
614
%define aa m5
615
%define bb m14
616
add r1, r1
617
lea r4, [r1*4]
618
lea r5, [r1*3] ; 3*stride
619
neg r4
620
add r4, r0 ; pix-4*stride
621
mov r6, 2
622
mova m0, [pw_2]
623
LOAD_AB aa, bb, r2d, r3d
624
.loop:
625
mova p2, [r4+r1]
626
mova p1, [r4+2*r1]
627
mova p0, [r4+r5]
628
mova q0, [r0]
629
mova q1, [r0+r1]
630
mova q2, [r0+2*r1]
631
632
LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
633
mova t2, aa
634
psrlw t2, 2
635
paddw t2, m0 ; alpha/4+2
636
DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
637
DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
638
DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
639
pand m6, m3
640
pand m7, m6
641
pand m6, t1
642
LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
643
LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
644
add r0, mmsize
645
add r4, mmsize
646
dec r6
647
jg .loop
648
RET
649
650
;-----------------------------------------------------------------------------
651
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
652
;-----------------------------------------------------------------------------
653
cglobal deblock_h_luma_intra, 4,7,16
654
%define t0 m15
655
%define t1 m14
656
%define t2 m2
657
%define q3 m5
658
%define q2 m8
659
%define q1 m9
660
%define q0 m10
661
%define p0 m11
662
%define p1 m12
663
%define p2 m13
664
%define p3 m4
665
%define spill [rsp]
666
%assign pad 24-(stack_offset&15)
667
SUB rsp, pad
668
add r1, r1
669
lea r4, [r1*4]
670
lea r5, [r1*3] ; 3*stride
671
add r4, r0 ; pix+4*stride
672
mov r6, 2
673
mova m0, [pw_2]
674
.loop:
675
movu q3, [r0-8]
676
movu q2, [r0+r1-8]
677
movu q1, [r0+r1*2-8]
678
movu q0, [r0+r5-8]
679
movu p0, [r4-8]
680
movu p1, [r4+r1-8]
681
movu p2, [r4+r1*2-8]
682
movu p3, [r4+r5-8]
683
TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
684
685
LOAD_AB m1, m2, r2d, r3d
686
LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
687
psrlw m1, 2
688
paddw m1, m0 ; alpha/4+2
689
DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
690
DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
691
DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
692
pand m6, m3
693
pand m7, m6
694
pand m6, t1
695
696
mova spill, q3
697
LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
698
LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
699
mova m7, spill
700
701
LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
702
703
lea r0, [r0+r1*8]
704
lea r4, [r4+r1*8]
705
dec r6
706
jg .loop
707
ADD rsp, pad
708
RET
709
%endmacro
710
711
INIT_XMM sse2
712
DEBLOCK_LUMA_INTRA_64
713
INIT_XMM avx
714
DEBLOCK_LUMA_INTRA_64
715
716
%endif
717
718
%macro DEBLOCK_LUMA_INTRA 0
719
;-----------------------------------------------------------------------------
720
; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
721
;-----------------------------------------------------------------------------
722
cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize
723
LUMA_INTRA_INIT 3
724
lea r4, [r1*4]
725
lea r5, [r1*3]
726
neg r4
727
add r4, r0
728
mov r6, 32/mmsize
729
.loop:
730
mova m0, [r4+r1*2] ; p1
731
mova m1, [r4+r5] ; p0
732
mova m2, [r0] ; q0
733
mova m3, [r0+r1] ; q1
734
LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
735
LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
736
mova t3, [r0+r1*2] ; q2
737
LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
738
add r0, mmsize
739
add r4, mmsize
740
dec r6
741
jg .loop
742
RET
743
744
;-----------------------------------------------------------------------------
745
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
746
;-----------------------------------------------------------------------------
747
cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize
748
LUMA_INTRA_INIT 8
749
%if mmsize == 8
750
lea r4, [r1*3]
751
mov r5, 32/mmsize
752
%else
753
lea r4, [r1*4]
754
lea r5, [r1*3] ; 3*stride
755
add r4, r0 ; pix+4*stride
756
mov r6, 32/mmsize
757
%endif
758
.loop:
759
LUMA_H_INTRA_LOAD
760
LUMA_INTRA_INTER t8, t9, t10, t5, t6
761
762
LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
763
mova t3, t6 ; q2
764
LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
765
766
mova m2, t4
767
mova m0, t11
768
mova m1, t5
769
mova m3, t8
770
mova m6, t6
771
772
LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
773
774
lea r0, [r0+r1*(mmsize/2)]
775
%if mmsize == 8
776
dec r5
777
%else
778
lea r4, [r4+r1*(mmsize/2)]
779
dec r6
780
%endif
781
jg .loop
782
RET
783
%endmacro
784
785
%if ARCH_X86_64 == 0
786
INIT_MMX mmx2
787
DEBLOCK_LUMA
788
DEBLOCK_LUMA_INTRA
789
INIT_XMM sse2
790
DEBLOCK_LUMA
791
DEBLOCK_LUMA_INTRA
792
INIT_XMM avx
793
DEBLOCK_LUMA
794
DEBLOCK_LUMA_INTRA
795
%endif
796
%endif ; HIGH_BIT_DEPTH
797
798
%if HIGH_BIT_DEPTH == 0
799
; expands to [base],...,[base+7*stride]
800
%define PASS8ROWS(base, base3, stride, stride3) \
801
[base], [base+stride], [base+stride*2], [base3], \
802
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
803
804
%define PASS8ROWS(base, base3, stride, stride3, offset) \
805
PASS8ROWS(base+offset, base3+offset, stride, stride3)
806
807
; in: 4 rows of 8 bytes in m0..m3
808
; out: 8 rows of 4 bytes in %1..%8
809
%macro TRANSPOSE8x4B_STORE 8
810
punpckhdq m4, m0, m0
811
punpckhdq m5, m1, m1
812
punpckhdq m6, m2, m2
813
814
punpcklbw m0, m1
815
punpcklbw m2, m3
816
punpcklwd m1, m0, m2
817
punpckhwd m0, m2
818
movd %1, m1
819
punpckhdq m1, m1
820
movd %2, m1
821
movd %3, m0
822
punpckhdq m0, m0
823
movd %4, m0
824
825
punpckhdq m3, m3
826
punpcklbw m4, m5
827
punpcklbw m6, m3
828
punpcklwd m5, m4, m6
829
punpckhwd m4, m6
830
movd %5, m5
831
punpckhdq m5, m5
832
movd %6, m5
833
movd %7, m4
834
punpckhdq m4, m4
835
movd %8, m4
836
%endmacro
837
838
; in: 8 rows of 4 bytes in %9..%10
839
; out: 8 rows of 4 bytes in %1..%8
840
%macro STORE_8x4B 10
841
movd %1, %9
842
pextrd %2, %9, 1
843
pextrd %3, %9, 2
844
pextrd %4, %9, 3
845
movd %5, %10
846
pextrd %6, %10, 1
847
pextrd %7, %10, 2
848
pextrd %8, %10, 3
849
%endmacro
850
851
; in: 4 rows of 4 words in %1..%4
852
; out: 4 rows of 4 words in m0..m3
853
; clobbers: m4
854
%macro TRANSPOSE4x4W_LOAD 4-8
855
%if mmsize==8
856
SWAP 1, 4, 2, 3
857
movq m0, %1
858
movq m1, %2
859
movq m2, %3
860
movq m3, %4
861
TRANSPOSE4x4W 0, 1, 2, 3, 4
862
%else
863
movq m0, %1
864
movq m2, %2
865
movq m1, %3
866
movq m3, %4
867
punpcklwd m0, m2
868
punpcklwd m1, m3
869
mova m2, m0
870
punpckldq m0, m1
871
punpckhdq m2, m1
872
MOVHL m1, m0
873
MOVHL m3, m2
874
%endif
875
%endmacro
876
877
; in: 2 rows of 4 words in m1..m2
878
; out: 4 rows of 2 words in %1..%4
879
; clobbers: m0, m1
880
%macro TRANSPOSE4x2W_STORE 4-8
881
%if mmsize==8
882
punpckhwd m0, m1, m2
883
punpcklwd m1, m2
884
%else
885
punpcklwd m1, m2
886
MOVHL m0, m1
887
%endif
888
movd %3, m0
889
movd %1, m1
890
psrlq m1, 32
891
psrlq m0, 32
892
movd %2, m1
893
movd %4, m0
894
%endmacro
895
896
; in: 4/8 rows of 4 words in %1..%8
897
; out: 4 rows of 4/8 words in m0..m3
898
; clobbers: m4, m5, m6, m7
899
%macro TRANSPOSE4x8W_LOAD 8
900
%if mmsize==8
901
TRANSPOSE4x4W_LOAD %1, %2, %3, %4
902
%else
903
movq m0, %1
904
movq m2, %2
905
movq m1, %3
906
movq m3, %4
907
punpcklwd m0, m2
908
punpcklwd m1, m3
909
mova m2, m0
910
punpckldq m0, m1
911
punpckhdq m2, m1
912
913
movq m4, %5
914
movq m6, %6
915
movq m5, %7
916
movq m7, %8
917
punpcklwd m4, m6
918
punpcklwd m5, m7
919
mova m6, m4
920
punpckldq m4, m5
921
punpckhdq m6, m5
922
923
punpckhqdq m1, m0, m4
924
punpckhqdq m3, m2, m6
925
punpcklqdq m0, m4
926
punpcklqdq m2, m6
927
%endif
928
%endmacro
929
930
; in: 2 rows of 4/8 words in m1..m2
931
; out: 4/8 rows of 2 words in %1..%8
932
; clobbers: m0, m1
933
%macro TRANSPOSE8x2W_STORE 8
934
%if mmsize==8
935
TRANSPOSE4x2W_STORE %1, %2, %3, %4
936
%else
937
punpckhwd m0, m1, m2
938
punpcklwd m1, m2
939
movd %5, m0
940
movd %1, m1
941
psrldq m1, 4
942
psrldq m0, 4
943
movd %2, m1
944
movd %6, m0
945
psrldq m1, 4
946
psrldq m0, 4
947
movd %3, m1
948
movd %7, m0
949
psrldq m1, 4
950
psrldq m0, 4
951
movd %4, m1
952
movd %8, m0
953
%endif
954
%endmacro
955
956
%macro SBUTTERFLY3 4
957
punpckh%1 %4, %2, %3
958
punpckl%1 %2, %3
959
%endmacro
960
961
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
962
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
963
%macro TRANSPOSE6x8_MEM 9
964
RESET_MM_PERMUTATION
965
%if cpuflag(avx)
966
; input:
967
; _ABCDEF_
968
; _GHIJKL_
969
; _MNOPQR_
970
; _STUVWX_
971
; _YZabcd_
972
; _efghij_
973
; _klmnop_
974
; _qrstuv_
975
976
movh m0, %1
977
movh m2, %2
978
movh m1, %3
979
movh m3, %4
980
punpcklbw m0, m2 ; __ AG BH CI DJ EK FL __
981
punpcklbw m1, m3 ; __ MS NT OU PV QW RX __
982
movh m2, %5
983
movh m3, %6
984
punpcklbw m2, m3 ; __ Ye Zf ag bh ci dj __
985
movh m3, %7
986
movh m4, %8
987
punpcklbw m3, m4 ; __ kq lr ms nt ou pv __
988
989
SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU
990
; DJ PV EK QW FL RX __ __
991
SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms
992
; bh nt ci ou dj pv __ __
993
SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq
994
; BH NT Zf lr CI FL OU RX
995
SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW Zf lr
996
; FL RX dj pv __ __ __ __
997
movhps [%9+0x00], m0
998
movh [%9+0x10], m2
999
movhps [%9+0x20], m2
1000
movh [%9+0x30], m1
1001
movhps [%9+0x40], m1
1002
movh [%9+0x50], m3
1003
%else
1004
movq m0, %1
1005
movq m1, %2
1006
movq m2, %3
1007
movq m3, %4
1008
movq m4, %5
1009
movq m5, %6
1010
movq m6, %7
1011
SBUTTERFLY bw, 0, 1, 7
1012
SBUTTERFLY bw, 2, 3, 7
1013
SBUTTERFLY bw, 4, 5, 7
1014
movq [%9+0x10], m3
1015
SBUTTERFLY3 bw, m6, %8, m7
1016
SBUTTERFLY wd, 0, 2, 3
1017
SBUTTERFLY wd, 4, 6, 3
1018
punpckhdq m0, m4
1019
movq [%9+0x00], m0
1020
SBUTTERFLY3 wd, m1, [%9+0x10], m3
1021
SBUTTERFLY wd, 5, 7, 0
1022
SBUTTERFLY dq, 1, 5, 0
1023
SBUTTERFLY dq, 2, 6, 0
1024
punpckldq m3, m7
1025
movq [%9+0x10], m2
1026
movq [%9+0x20], m6
1027
movq [%9+0x30], m1
1028
movq [%9+0x40], m5
1029
movq [%9+0x50], m3
1030
%endif
1031
RESET_MM_PERMUTATION
1032
%endmacro
1033
1034
1035
; in: 8 rows of 8 in %1..%8
1036
; out: 8 rows of 8 in %9..%16
1037
%macro TRANSPOSE8x8_MEM 16
1038
RESET_MM_PERMUTATION
1039
%if cpuflag(avx)
1040
movh m0, %1
1041
movh m4, %2
1042
movh m1, %3
1043
movh m5, %4
1044
movh m2, %5
1045
movh m3, %7
1046
punpcklbw m0, m4
1047
punpcklbw m1, m5
1048
movh m4, %6
1049
movh m5, %8
1050
punpcklbw m2, m4
1051
punpcklbw m3, m5
1052
SBUTTERFLY wd, 0, 1, 4
1053
SBUTTERFLY wd, 2, 3, 4
1054
SBUTTERFLY dq, 0, 2, 4
1055
SBUTTERFLY dq, 1, 3, 4
1056
movh %9, m0
1057
movhps %10, m0
1058
movh %11, m2
1059
movhps %12, m2
1060
movh %13, m1
1061
movhps %14, m1
1062
movh %15, m3
1063
movhps %16, m3
1064
%else
1065
movq m0, %1
1066
movq m1, %2
1067
movq m2, %3
1068
movq m3, %4
1069
movq m4, %5
1070
movq m5, %6
1071
movq m6, %7
1072
SBUTTERFLY bw, 0, 1, 7
1073
SBUTTERFLY bw, 2, 3, 7
1074
SBUTTERFLY bw, 4, 5, 7
1075
SBUTTERFLY3 bw, m6, %8, m7
1076
movq %9, m5
1077
SBUTTERFLY wd, 0, 2, 5
1078
SBUTTERFLY wd, 4, 6, 5
1079
SBUTTERFLY wd, 1, 3, 5
1080
movq %11, m6
1081
movq m6, %9
1082
SBUTTERFLY wd, 6, 7, 5
1083
SBUTTERFLY dq, 0, 4, 5
1084
SBUTTERFLY dq, 1, 6, 5
1085
movq %9, m0
1086
movq %10, m4
1087
movq %13, m1
1088
movq %14, m6
1089
SBUTTERFLY3 dq, m2, %11, m0
1090
SBUTTERFLY dq, 3, 7, 4
1091
movq %11, m2
1092
movq %12, m0
1093
movq %15, m3
1094
movq %16, m7
1095
%endif
1096
RESET_MM_PERMUTATION
1097
%endmacro
1098
1099
; out: %4 = |%1-%2|>%3
1100
; clobbers: %5
1101
%macro DIFF_GT 5
1102
%if avx_enabled == 0
1103
mova %5, %2
1104
mova %4, %1
1105
psubusb %5, %1
1106
psubusb %4, %2
1107
%else
1108
psubusb %5, %2, %1
1109
psubusb %4, %1, %2
1110
%endif
1111
por %4, %5
1112
psubusb %4, %3
1113
%endmacro
1114
1115
; out: %4 = 0xff where |%1-%2| <= %3 (callers use this as the "< beta" / "< alpha/4+2" mask)
1116
; clobbers: %5
1117
%macro DIFF_GT2 5-6
1118
%if %0<6
1119
psubusb %4, %1, %2
1120
psubusb %5, %2, %1
1121
%else
1122
mova %4, %1
1123
mova %5, %2
1124
psubusb %4, %2
1125
psubusb %5, %1
1126
%endif
1127
psubusb %5, %3
1128
psubusb %4, %3
1129
pcmpeqb %4, %5
1130
%endmacro
1131
1132
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha %2=beta
1133
; out: m5=beta-1, m7=mask, %3=alpha-1
1134
; clobbers: m4,m6
1135
%macro LOAD_MASK 2-3
1136
%if cpuflag(ssse3)
1137
movd m4, %1
1138
movd m5, %2
1139
pxor m6, m6
1140
pshufb m4, m6
1141
pshufb m5, m6
1142
%else
1143
movd m4, %1
1144
movd m5, %2
1145
punpcklbw m4, m4
1146
punpcklbw m5, m5
1147
SPLATW m4, m4
1148
SPLATW m5, m5
1149
%endif
1150
mova m6, [pb_1]
1151
psubusb m4, m6 ; alpha - 1
1152
psubusb m5, m6 ; beta - 1
1153
%if %0>2
1154
mova %3, m4
1155
%endif
1156
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
1157
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
1158
por m7, m4
1159
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
1160
por m7, m4
1161
pxor m6, m6
1162
pcmpeqb m7, m6
1163
%endmacro
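; Reference note: m7 ends up as a byte mask, 0xff where the edge passes the standard
; enable test |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta, 0x00 elsewhere.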
1164
1165
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
1166
; out: m1=p0' m2=q0'
1167
; clobbers: m0,3-6
1168
%macro DEBLOCK_P0_Q0 0
1169
pxor m5, m1, m2 ; p0^q0
1170
pand m5, [pb_1] ; (p0^q0)&1
1171
pcmpeqb m4, m4
1172
pxor m3, m4
1173
pavgb m3, m0 ; (p1 - q1 + 256)>>1
1174
pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
1175
pxor m4, m1
1176
pavgb m4, m2 ; (q0 - p0 + 256)>>1
1177
pavgb m3, m5
1178
paddusb m3, m4 ; d+128+33
1179
mova m6, [pb_a1]
1180
psubusb m6, m3
1181
psubusb m3, [pb_a1]
1182
pminub m6, m7
1183
pminub m3, m7
1184
psubusb m1, m6
1185
psubusb m2, m3
1186
paddusb m1, m3
1187
paddusb m2, m6
1188
%endmacro
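; Reference note: the pavgb/psubusb sequence above is intended to evaluate the same
; normal-filter update as the high-bit-depth DEBLOCK_P0_Q0, i.e. roughly
;   delta = clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )
;   p0'   = clip_uint8( p0 + delta ),  q0' = clip_uint8( q0 - delta )
; with m7 supplying tc already masked by the enable test.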
1189
1190
; in: m1=p0 m2=q0
1191
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
1192
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
1193
; clobbers: q2, tmp, tc0
1194
%macro LUMA_Q1 6
1195
pavgb %6, m1, m2
1196
pavgb %2, %6 ; avg(p2,avg(p0,q0))
1197
pxor %6, %3
1198
pand %6, [pb_1] ; (p2^avg(p0,q0))&1
1199
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
1200
psubusb %6, %1, %5
1201
paddusb %5, %1
1202
pmaxub %2, %6
1203
pminub %2, %5
1204
mova %4, %2
1205
%endmacro
1206
1207
%if ARCH_X86_64
1208
;-----------------------------------------------------------------------------
1209
; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1210
;-----------------------------------------------------------------------------
1211
%macro DEBLOCK_LUMA 0
1212
cglobal deblock_v_luma, 5,5,10
1213
movd m8, [r4] ; tc0
1214
lea r4, [r1*3]
1215
neg r4
1216
add r4, r0 ; pix-3*stride
1217
1218
mova m0, [r4+r1] ; p1
1219
mova m1, [r4+2*r1] ; p0
1220
mova m2, [r0] ; q0
1221
mova m3, [r0+r1] ; q1
1222
LOAD_MASK r2d, r3d
1223
1224
%if cpuflag(avx)
1225
pshufb m8, [pb_unpackbd1]
1226
pblendvb m9, m7, m6, m8
1227
%else
1228
punpcklbw m8, m8
1229
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
1230
pcmpeqb m9, m9
1231
pcmpeqb m9, m8
1232
pandn m9, m7
1233
%endif
1234
pand m8, m9
1235
1236
mova m3, [r4] ; p2
1237
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
1238
pand m6, m9
1239
psubb m7, m8, m6 ; tc++
1240
pand m6, m8
1241
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
1242
1243
mova m4, [r0+2*r1] ; q2
1244
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
1245
pand m6, m9
1246
pand m8, m6
1247
psubb m7, m6
1248
mova m3, [r0+r1]
1249
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
1250
1251
DEBLOCK_P0_Q0
1252
mova [r4+2*r1], m1
1253
mova [r0], m2
1254
RET
1255
1256
;-----------------------------------------------------------------------------
1257
; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1258
;-----------------------------------------------------------------------------
1259
1260
%if cpuflag(avx)
1261
INIT_XMM cpuname
1262
%else
1263
INIT_MMX cpuname
1264
%endif
1265
cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64
1266
lea r8, [r1*3]
1267
lea r6, [r0-4]
1268
lea r5, [r0-4+r8]
1269
%xdefine pix_tmp rsp+0x30*WIN64 ; shadow space + r4
1270
1271
; transpose 6x16 -> tmp space
1272
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp
1273
lea r6, [r6+r1*8]
1274
lea r5, [r5+r1*8]
1275
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8
1276
1277
; vertical filter
1278
; alpha, beta, tc0 are still in r2d, r3d, r4
1279
; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
1280
mov r7, r1
1281
lea r0, [pix_tmp+0x30]
1282
mov r1d, 0x10
1283
%if WIN64
1284
mov [rsp+0x20], r4
1285
%endif
1286
call deblock_v_luma
1287
1288
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
1289
add r6, 2
1290
add r5, 2
1291
%if cpuflag(sse4)
1292
mova m0, [pix_tmp+0x10]
1293
mova m1, [pix_tmp+0x20]
1294
mova m2, [pix_tmp+0x30]
1295
mova m3, [pix_tmp+0x40]
1296
SBUTTERFLY bw, 0, 1, 4
1297
SBUTTERFLY bw, 2, 3, 4
1298
SBUTTERFLY wd, 0, 2, 4
1299
SBUTTERFLY wd, 1, 3, 4
1300
STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m1, m3
1301
shl r7, 3
1302
sub r6, r7
1303
sub r5, r7
1304
shr r7, 3
1305
STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m0, m2
1306
%else
1307
movq m0, [pix_tmp+0x18]
1308
movq m1, [pix_tmp+0x28]
1309
movq m2, [pix_tmp+0x38]
1310
movq m3, [pix_tmp+0x48]
1311
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
1312
1313
shl r7, 3
1314
sub r6, r7
1315
sub r5, r7
1316
shr r7, 3
1317
movq m0, [pix_tmp+0x10]
1318
movq m1, [pix_tmp+0x20]
1319
movq m2, [pix_tmp+0x30]
1320
movq m3, [pix_tmp+0x40]
1321
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
1322
%endif
1323
1324
RET
1325
%endmacro
1326
1327
INIT_XMM sse2
1328
DEBLOCK_LUMA
1329
INIT_XMM avx
1330
DEBLOCK_LUMA
1331
1332
%else
1333
1334
%macro DEBLOCK_LUMA 2
1335
;-----------------------------------------------------------------------------
1336
; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1337
;-----------------------------------------------------------------------------
1338
cglobal deblock_%1_luma, 5,5,8,2*%2
1339
lea r4, [r1*3]
1340
neg r4
1341
add r4, r0 ; pix-3*stride
1342
1343
mova m0, [r4+r1] ; p1
1344
mova m1, [r4+2*r1] ; p0
1345
mova m2, [r0] ; q0
1346
mova m3, [r0+r1] ; q1
1347
LOAD_MASK r2d, r3d
1348
1349
mov r3, r4mp
1350
movd m4, [r3] ; tc0
1351
%if cpuflag(avx)
1352
pshufb m4, [pb_unpackbd1]
1353
mova [esp+%2], m4 ; tc
1354
pblendvb m4, m7, m6, m4
1355
%else
1356
punpcklbw m4, m4
1357
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
1358
mova [esp+%2], m4 ; tc
1359
pcmpeqb m3, m3
1360
pcmpgtb m4, m3
1361
pand m4, m7
1362
%endif
1363
mova [esp], m4 ; mask
1364
1365
mova m3, [r4] ; p2
1366
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
1367
pand m6, m4
1368
pand m4, [esp+%2] ; tc
1369
psubb m7, m4, m6
1370
pand m6, m4
1371
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
1372
1373
mova m4, [r0+2*r1] ; q2
1374
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
1375
mova m5, [esp] ; mask
1376
pand m6, m5
1377
mova m5, [esp+%2] ; tc
1378
pand m5, m6
1379
psubb m7, m6
1380
mova m3, [r0+r1]
1381
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
1382
1383
DEBLOCK_P0_Q0
1384
mova [r4+2*r1], m1
1385
mova [r0], m2
1386
RET
1387
1388
;-----------------------------------------------------------------------------
1389
; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1390
;-----------------------------------------------------------------------------
1391
%if cpuflag(avx)
1392
INIT_XMM cpuname
1393
%else
1394
INIT_MMX cpuname
1395
%endif
1396
cglobal deblock_h_luma, 1,5,8,0x60+12
1397
mov r3, r1m
1398
lea r4, [r3*3]
1399
sub r0, 4
1400
lea r1, [r0+r4]
1401
%define pix_tmp esp+12
1402
; esp is intentionally misaligned to make it aligned after pushing the arguments for deblock_%1_luma.
1403
1404
; transpose 6x16 -> tmp space
1405
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
1406
lea r0, [r0+r3*8]
1407
lea r1, [r1+r3*8]
1408
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
1409
1410
; vertical filter
1411
lea r0, [pix_tmp+0x30]
1412
PUSH dword r4m
1413
PUSH dword r3m
1414
PUSH dword r2m
1415
PUSH dword 16
1416
PUSH dword r0
1417
call deblock_%1_luma
1418
%ifidn %1, v8
1419
add dword [esp ], 8 ; pix_tmp+0x38
1420
add dword [esp+16], 2 ; tc0+2
1421
call deblock_%1_luma
1422
%endif
1423
ADD esp, 20
1424
1425
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
1426
mov r0, r0mp
1427
sub r0, 2
1428
lea r1, [r0+r4]
1429
1430
%if cpuflag(avx)
1431
mova m0, [pix_tmp+0x10]
1432
mova m1, [pix_tmp+0x20]
1433
mova m2, [pix_tmp+0x30]
1434
mova m3, [pix_tmp+0x40]
1435
SBUTTERFLY bw, 0, 1, 4
1436
SBUTTERFLY bw, 2, 3, 4
1437
SBUTTERFLY wd, 0, 2, 4
1438
SBUTTERFLY wd, 1, 3, 4
1439
STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m0, m2
1440
lea r0, [r0+r3*8]
1441
lea r1, [r1+r3*8]
1442
STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m1, m3
1443
%else
1444
movq m0, [pix_tmp+0x10]
1445
movq m1, [pix_tmp+0x20]
1446
movq m2, [pix_tmp+0x30]
1447
movq m3, [pix_tmp+0x40]
1448
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
1449
1450
lea r0, [r0+r3*8]
1451
lea r1, [r1+r3*8]
1452
movq m0, [pix_tmp+0x18]
1453
movq m1, [pix_tmp+0x28]
1454
movq m2, [pix_tmp+0x38]
1455
movq m3, [pix_tmp+0x48]
1456
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
1457
%endif
1458
1459
RET
1460
%endmacro ; DEBLOCK_LUMA
1461
1462
INIT_MMX mmx2
1463
DEBLOCK_LUMA v8, 8
1464
INIT_XMM sse2
1465
DEBLOCK_LUMA v, 16
1466
INIT_XMM avx
1467
DEBLOCK_LUMA v, 16
1468
1469
%endif ; ARCH
1470
1471
1472
1473
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
1474
%if ARCH_X86_64
1475
pavgb t0, p2, p1
1476
pavgb t1, p0, q0
1477
%else
1478
mova t0, p2
1479
mova t1, p0
1480
pavgb t0, p1
1481
pavgb t1, q0
1482
%endif
1483
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
1484
mova t5, t1
1485
%if ARCH_X86_64
1486
paddb t2, p2, p1
1487
paddb t3, p0, q0
1488
%else
1489
mova t2, p2
1490
mova t3, p0
1491
paddb t2, p1
1492
paddb t3, q0
1493
%endif
1494
paddb t2, t3
1495
mova t3, t2
1496
mova t4, t2
1497
psrlw t2, 1
1498
pavgb t2, mpb_0
1499
pxor t2, t0
1500
pand t2, mpb_1
1501
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
1502
1503
%if ARCH_X86_64
1504
pavgb t1, p2, q1
1505
psubb t2, p2, q1
1506
%else
1507
mova t1, p2
1508
mova t2, p2
1509
pavgb t1, q1
1510
psubb t2, q1
1511
%endif
1512
paddb t3, t3
1513
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
1514
pand t2, mpb_1
1515
psubb t1, t2
1516
pavgb t1, p1
1517
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
1518
psrlw t3, 2
1519
pavgb t3, mpb_0
1520
pxor t3, t1
1521
pand t3, mpb_1
1522
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
1523
1524
pxor t3, p0, q1
1525
pavgb t2, p0, q1
1526
pand t3, mpb_1
1527
psubb t2, t3
1528
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
1529
1530
pxor t1, t2
1531
pxor t2, p0
1532
pand t1, mask1p
1533
pand t2, mask0
1534
pxor t1, t2
1535
pxor t1, p0
1536
mova %1, t1 ; store p0
1537
1538
mova t1, %4 ; p3
1539
paddb t2, t1, p2
1540
pavgb t1, p2
1541
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
1542
paddb t2, t2
1543
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
1544
psrlw t2, 2
1545
pavgb t2, mpb_0
1546
pxor t2, t1
1547
pand t2, mpb_1
1548
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
1549
1550
pxor t0, p1
1551
pxor t1, p2
1552
pand t0, mask1p
1553
pand t1, mask1p
1554
pxor t0, p1
1555
pxor t1, p2
1556
mova %2, t0 ; store p1
1557
mova %3, t1 ; store p2
1558
%endmacro
1559
1560
%macro LUMA_INTRA_SWAP_PQ 0
1561
%define q1 m0
1562
%define q0 m1
1563
%define p0 m2
1564
%define p1 m3
1565
%define p2 q2
1566
%define mask1p mask1q
1567
%endmacro
1568
1569
%macro DEBLOCK_LUMA_INTRA 1
1570
%define p1 m0
1571
%define p0 m1
1572
%define q0 m2
1573
%define q1 m3
1574
%define t0 m4
1575
%define t1 m5
1576
%define t2 m6
1577
%define t3 m7
1578
%if ARCH_X86_64
1579
%define p2 m8
1580
%define q2 m9
1581
%define t4 m10
1582
%define t5 m11
1583
%define mask0 m12
1584
%define mask1p m13
1585
%if WIN64
1586
%define mask1q [rsp]
1587
%else
1588
%define mask1q [rsp-24]
1589
%endif
1590
%define mpb_0 m14
1591
%define mpb_1 m15
1592
%else
1593
%define spill(x) [esp+16*x]
1594
%define p2 [r4+r1]
1595
%define q2 [r0+2*r1]
1596
%define t4 spill(0)
1597
%define t5 spill(1)
1598
%define mask0 spill(2)
1599
%define mask1p spill(3)
1600
%define mask1q spill(4)
1601
%define mpb_0 [pb_0]
1602
%define mpb_1 [pb_1]
1603
%endif
1604
1605
;-----------------------------------------------------------------------------
1606
; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
1607
;-----------------------------------------------------------------------------
1608
cglobal deblock_%1_luma_intra, 4,6,16,0-(1-ARCH_X86_64)*0x50-WIN64*0x10
1609
lea r4, [r1*4]
1610
lea r5, [r1*3] ; 3*stride
1611
neg r4
1612
add r4, r0 ; pix-4*stride
1613
mova p1, [r4+2*r1]
1614
mova p0, [r4+r5]
1615
mova q0, [r0]
1616
mova q1, [r0+r1]
1617
%if ARCH_X86_64
1618
pxor mpb_0, mpb_0
1619
mova mpb_1, [pb_1]
1620
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
1621
SWAP 7, 12 ; m12=mask0
1622
pavgb t5, mpb_0
1623
pavgb t5, mpb_1 ; alpha/4+1
1624
movdqa p2, [r4+r1]
1625
movdqa q2, [r0+2*r1]
1626
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
1627
DIFF_GT2 p0, p2, m5, t2, t5, 1 ; mask1 = |p2-p0| > beta-1
1628
DIFF_GT2 q0, q2, m5, t4, t5, 1 ; t4 = |q2-q0| > beta-1
1629
pand t0, mask0
1630
pand t4, t0
1631
pand t2, t0
1632
mova mask1q, t4
1633
mova mask1p, t2
1634
%else
1635
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
1636
mova m4, t5
1637
mova mask0, m7
1638
pavgb m4, [pb_0]
1639
pavgb m4, [pb_1] ; alpha/4+1
1640
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
1641
pand m6, mask0
1642
DIFF_GT2 p0, p2, m5, m4, m7, 1 ; m4 = |p2-p0| > beta-1
1643
pand m4, m6
1644
mova mask1p, m4
1645
DIFF_GT2 q0, q2, m5, m4, m7, 1 ; m4 = |q2-q0| > beta-1
1646
pand m4, m6
1647
mova mask1q, m4
1648
%endif
1649
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
1650
LUMA_INTRA_SWAP_PQ
1651
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
1652
.end:
1653
REP_RET
1654
1655
%if cpuflag(avx)
1656
INIT_XMM cpuname
1657
%else
1658
INIT_MMX cpuname
1659
%endif
1660
%if ARCH_X86_64
1661
;-----------------------------------------------------------------------------
1662
; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
1663
;-----------------------------------------------------------------------------
1664
cglobal deblock_h_luma_intra, 4,9,0,0x80
1665
lea r8, [r1*3]
1666
lea r6, [r0-4]
1667
lea r5, [r0-4+r8]
1668
%if WIN64
1669
%define pix_tmp rsp+0x20 ; shadow space
1670
%else
1671
%define pix_tmp rsp
1672
%endif
1673
1674
; transpose 8x16 -> tmp space
1675
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
1676
lea r6, [r6+r1*8]
1677
lea r5, [r5+r1*8]
1678
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
1679
1680
mov r7, r1
1681
lea r0, [pix_tmp+0x40]
1682
mov r1, 0x10
1683
call deblock_v_luma_intra
1684
1685
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
1686
lea r5, [r6+r8]
1687
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
1688
shl r7, 3
1689
sub r6, r7
1690
sub r5, r7
1691
shr r7, 3
1692
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
1693
RET
1694
%else
1695
cglobal deblock_h_luma_intra, 2,4,8,0x80
1696
lea r3, [r1*3]
1697
sub r0, 4
1698
lea r2, [r0+r3]
1699
%define pix_tmp rsp
1700
1701
; transpose 8x16 -> tmp space
1702
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
1703
lea r0, [r0+r1*8]
1704
lea r2, [r2+r1*8]
1705
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
1706
1707
lea r0, [pix_tmp+0x40]
1708
PUSH dword r3m
1709
PUSH dword r2m
1710
PUSH dword 16
1711
PUSH r0
1712
call deblock_%1_luma_intra
1713
%ifidn %1, v8
1714
add dword [rsp], 8 ; pix_tmp+8
1715
call deblock_%1_luma_intra
1716
%endif
1717
ADD esp, 16
1718
1719
mov r1, r1m
1720
mov r0, r0mp
1721
lea r3, [r1*3]
1722
sub r0, 4
1723
lea r2, [r0+r3]
1724
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
1725
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
1726
lea r0, [r0+r1*8]
1727
lea r2, [r2+r1*8]
1728
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
1729
RET
1730
%endif ; ARCH_X86_64
1731
%endmacro ; DEBLOCK_LUMA_INTRA
1732
1733
INIT_XMM sse2
1734
DEBLOCK_LUMA_INTRA v
1735
INIT_XMM avx
1736
DEBLOCK_LUMA_INTRA v
1737
%if ARCH_X86_64 == 0
1738
INIT_MMX mmx2
1739
DEBLOCK_LUMA_INTRA v8
1740
%endif
1741
%endif ; !HIGH_BIT_DEPTH
1742
1743
%if HIGH_BIT_DEPTH
1744
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
1745
; out: %1=p0', %2=q0'
1746
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
1747
mova %6, [pw_2]
1748
paddw %6, %3
1749
paddw %6, %4
1750
paddw %7, %6, %2
1751
paddw %6, %1
1752
paddw %6, %3
1753
paddw %7, %4
1754
psraw %6, 2
1755
psraw %7, 2
1756
psubw %6, %1
1757
psubw %7, %2
1758
pand %6, %5
1759
pand %7, %5
1760
paddw %1, %6
1761
paddw %2, %7
1762
%endmacro
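; Reference sketch of the scalar chroma intra filter computed above (applied only
; where the mask %5 is set):
;   p0' = ( 2*p1 + p0 + q1 + 2 ) >> 2
;   q0' = ( 2*q1 + q0 + p1 + 2 ) >> 2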
1763
1764
; out: m0-m3
1765
; clobbers: m4-m7
1766
%macro CHROMA_H_LOAD 0-1
1767
movq m0, [r0-8] ; p1 p1 p0 p0
1768
movq m2, [r0] ; q0 q0 q1 q1
1769
movq m5, [r0+r1-8]
1770
movq m7, [r0+r1]
1771
%if mmsize == 8
1772
mova m1, m0
1773
mova m3, m2
1774
punpckldq m0, m5 ; p1
1775
punpckhdq m1, m5 ; p0
1776
punpckldq m2, m7 ; q0
1777
punpckhdq m3, m7 ; q1
1778
%else
1779
movq m4, [r0+r1*2-8]
1780
movq m6, [r0+r1*2]
1781
movq m1, [r0+%1-8]
1782
movq m3, [r0+%1]
1783
punpckldq m0, m5 ; p1 ... p0 ...
1784
punpckldq m2, m7 ; q0 ... q1 ...
1785
punpckldq m4, m1
1786
punpckldq m6, m3
1787
punpckhqdq m1, m0, m4 ; p0
1788
punpcklqdq m0, m4 ; p1
1789
punpckhqdq m3, m2, m6 ; q1
1790
punpcklqdq m2, m6 ; q0
1791
%endif
1792
%endmacro
1793
1794
%macro CHROMA_V_LOAD 1
1795
mova m0, [r0] ; p1
1796
mova m1, [r0+r1] ; p0
1797
mova m2, [%1] ; q0
1798
mova m3, [%1+r1] ; q1
1799
%endmacro
1800
1801
; clobbers: m1, m2, m3
1802
%macro CHROMA_H_STORE 0-1
1803
SBUTTERFLY dq, 1, 2, 3
1804
%if mmsize == 8
1805
movq [r0-4], m1
1806
movq [r0+r1-4], m2
1807
%else
1808
movq [r0-4], m1
1809
movq [r0+r1*2-4], m2
1810
movhps [r0+r1-4], m1
1811
movhps [r0+%1-4], m2
1812
%endif
1813
%endmacro
1814
1815
%macro CHROMA_V_STORE 0
1816
mova [r0+1*r1], m1
1817
mova [r0+2*r1], m2
1818
%endmacro
1819
1820
%macro DEBLOCK_CHROMA 0
1821
cglobal deblock_inter_body
1822
LOAD_AB m4, m5, r2d, r3d
1823
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
1824
pxor m4, m4
1825
LOAD_TC m6, r4
1826
pmaxsw m6, m4
1827
pand m7, m6
1828
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
1829
ret
1830
1831
;-----------------------------------------------------------------------------
1832
; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1833
;-----------------------------------------------------------------------------
1834
cglobal deblock_v_chroma, 5,7,8
1835
FIX_STRIDES r1
1836
mov r5, r0
1837
sub r0, r1
1838
sub r0, r1
1839
mov r6, 32/mmsize
1840
.loop:
1841
CHROMA_V_LOAD r5
1842
call deblock_inter_body
1843
CHROMA_V_STORE
1844
add r0, mmsize
1845
add r5, mmsize
1846
add r4, mmsize/8
1847
dec r6
1848
jg .loop
1849
RET
1850
1851
;-----------------------------------------------------------------------------
1852
; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1853
;-----------------------------------------------------------------------------
1854
cglobal deblock_h_chroma, 5,7,8
1855
add r1, r1
1856
mov r5, 32/mmsize
1857
%if mmsize == 16
1858
lea r6, [r1*3]
1859
%endif
1860
.loop:
1861
CHROMA_H_LOAD r6
1862
call deblock_inter_body
1863
CHROMA_H_STORE r6
1864
lea r0, [r0+r1*(mmsize/4)]
1865
add r4, mmsize/8
1866
dec r5
1867
jg .loop
1868
RET
1869
1870
1871
cglobal deblock_intra_body
1872
LOAD_AB m4, m5, r2d, r3d
1873
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
1874
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
1875
ret
1876
1877
;-----------------------------------------------------------------------------
1878
; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
1879
;-----------------------------------------------------------------------------
1880
cglobal deblock_v_chroma_intra, 4,6,8
1881
add r1, r1
1882
mov r5, 32/mmsize
1883
movd m5, r3d
1884
mov r4, r0
1885
sub r0, r1
1886
sub r0, r1
1887
SPLATW m5, m5
1888
.loop:
1889
CHROMA_V_LOAD r4
1890
call deblock_intra_body
1891
CHROMA_V_STORE
1892
add r0, mmsize
1893
add r4, mmsize
1894
dec r5
1895
jg .loop
1896
RET
1897
1898
;-----------------------------------------------------------------------------
1899
; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
1900
;-----------------------------------------------------------------------------
1901
cglobal deblock_h_chroma_intra, 4,6,8
1902
add r1, r1
1903
mov r4, 32/mmsize
1904
%if mmsize == 16
1905
lea r5, [r1*3]
1906
%endif
1907
.loop:
1908
CHROMA_H_LOAD r5
1909
call deblock_intra_body
1910
CHROMA_H_STORE r5
1911
lea r0, [r0+r1*(mmsize/4)]
1912
dec r4
1913
jg .loop
1914
RET
1915
1916
;-----------------------------------------------------------------------------
1917
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
1918
;-----------------------------------------------------------------------------
1919
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
1920
add r1, r1
1921
%if mmsize == 8
1922
mov r4, 16/mmsize
1923
.loop:
1924
%else
1925
lea r5, [r1*3]
1926
%endif
1927
CHROMA_H_LOAD r5
1928
LOAD_AB m4, m5, r2d, r3d
1929
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
1930
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
1931
CHROMA_H_STORE r5
1932
%if mmsize == 8
1933
lea r0, [r0+r1*(mmsize/4)]
1934
dec r4
1935
jg .loop
1936
%endif
1937
RET
1938
1939
;-----------------------------------------------------------------------------
1940
; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1941
;-----------------------------------------------------------------------------
1942
cglobal deblock_h_chroma_mbaff, 5,7,8
1943
add r1, r1
1944
lea r6, [r1*3]
1945
%if mmsize == 8
1946
mov r5, 16/mmsize
1947
.loop:
1948
%endif
1949
CHROMA_H_LOAD r6
1950
LOAD_AB m4, m5, r2d, r3d
1951
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
1952
movd m6, [r4]
1953
punpcklbw m6, m6
1954
psraw m6, 8
1955
punpcklwd m6, m6
1956
pand m7, m6
1957
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
1958
CHROMA_H_STORE r6
1959
%if mmsize == 8
1960
lea r0, [r0+r1*(mmsize/4)]
1961
add r4, mmsize/4
1962
dec r5
1963
jg .loop
1964
%endif
1965
RET
1966
1967
;-----------------------------------------------------------------------------
1968
; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
1969
;-----------------------------------------------------------------------------
1970
cglobal deblock_h_chroma_422_intra, 4,6,8
1971
add r1, r1
1972
mov r4, 64/mmsize
1973
%if mmsize == 16
1974
lea r5, [r1*3]
1975
%endif
1976
.loop:
1977
CHROMA_H_LOAD r5
1978
call deblock_intra_body
1979
CHROMA_H_STORE r5
1980
lea r0, [r0+r1*(mmsize/4)]
1981
dec r4
1982
jg .loop
1983
RET
1984
1985
;-----------------------------------------------------------------------------
1986
; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
1987
;-----------------------------------------------------------------------------
1988
cglobal deblock_h_chroma_422, 5,7,8
1989
add r1, r1
1990
mov r5, 64/mmsize
1991
lea r6, [r1*3]
1992
.loop:
1993
CHROMA_H_LOAD r6
1994
LOAD_AB m4, m5, r2m, r3d
1995
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
1996
pxor m4, m4
1997
movd m6, [r4-1]
1998
psraw m6, 8
1999
SPLATW m6, m6
2000
pmaxsw m6, m4
2001
pand m7, m6
2002
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
2003
CHROMA_H_STORE r6
2004
lea r0, [r0+r1*(mmsize/4)]
2005
%if mmsize == 16
2006
inc r4
2007
%else
2008
mov r2, r5
2009
and r2, 1
2010
add r4, r2 ; increment once every 2 iterations
2011
%endif
2012
dec r5
2013
jg .loop
2014
RET
2015
%endmacro ; DEBLOCK_CHROMA
2016
2017
%if ARCH_X86_64 == 0
2018
INIT_MMX mmx2
2019
DEBLOCK_CHROMA
2020
%endif
2021
INIT_XMM sse2
2022
DEBLOCK_CHROMA
2023
INIT_XMM avx
2024
DEBLOCK_CHROMA
2025
%endif ; HIGH_BIT_DEPTH
2026
2027
%if HIGH_BIT_DEPTH == 0
2028
%macro CHROMA_V_START 0
2029
mov t5, r0
2030
sub t5, r1
2031
sub t5, r1
2032
%if mmsize==8
2033
mov dword r0m, 2
2034
.loop:
2035
%endif
2036
%endmacro
2037
2038
%macro CHROMA_H_START 0
2039
sub r0, 4
2040
lea t6, [r1*3]
2041
mov t5, r0
2042
add r0, t6
2043
%endmacro
2044
2045
%macro CHROMA_V_LOOP 1
2046
%if mmsize==8
2047
add r0, 8
2048
add t5, 8
2049
%if %1
2050
add r4, 2
2051
%endif
2052
dec dword r0m
2053
jg .loop
2054
%endif
2055
%endmacro
2056
2057
%macro CHROMA_H_LOOP 1
2058
%if mmsize==8
2059
lea r0, [r0+r1*4]
2060
lea t5, [t5+r1*4]
2061
%if %1
2062
add r4, 2
2063
%endif
2064
dec dword r0m
2065
jg .loop
2066
%endif
2067
%endmacro
2068
2069
%define t5 r5
2070
%define t6 r6
2071
2072
%macro DEBLOCK_CHROMA 0
2073
cglobal chroma_inter_body
2074
LOAD_MASK r2d, r3d
2075
movd m6, [r4] ; tc0
2076
punpcklbw m6, m6
2077
punpcklbw m6, m6
2078
pand m7, m6
2079
DEBLOCK_P0_Q0
2080
ret
2081
2082
;-----------------------------------------------------------------------------
2083
; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
2084
;-----------------------------------------------------------------------------
2085
cglobal deblock_v_chroma, 5,6,8
2086
CHROMA_V_START
2087
mova m0, [t5]
2088
mova m1, [t5+r1]
2089
mova m2, [r0]
2090
mova m3, [r0+r1]
2091
call chroma_inter_body
2092
mova [t5+r1], m1
2093
mova [r0], m2
2094
CHROMA_V_LOOP 1
2095
RET
2096
2097
;-----------------------------------------------------------------------------
2098
; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
2099
;-----------------------------------------------------------------------------
2100
cglobal deblock_h_chroma, 5,7,8
2101
CHROMA_H_START
2102
%if mmsize==8
2103
mov dword r0m, 2
2104
.loop:
2105
%endif
2106
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
2107
call chroma_inter_body
2108
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
2109
CHROMA_H_LOOP 1
2110
RET
2111
%endmacro ; DEBLOCK_CHROMA
2112
2113
INIT_XMM sse2
2114
DEBLOCK_CHROMA
2115
INIT_XMM avx
2116
DEBLOCK_CHROMA
2117
%if ARCH_X86_64 == 0
2118
INIT_MMX mmx2
2119
DEBLOCK_CHROMA
2120
%endif
2121
2122
;-----------------------------------------------------------------------------
2123
; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
2124
;-----------------------------------------------------------------------------
2125
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
2126
cglobal deblock_h_chroma_mbaff, 5,7,8
2127
CHROMA_H_START
2128
TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
2129
LOAD_MASK r2d, r3d
2130
movd m6, [r4] ; tc0
2131
punpcklbw m6, m6
2132
pand m7, m6
2133
DEBLOCK_P0_Q0
2134
TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
2135
RET
2136
%endmacro
2137
2138
INIT_XMM sse2
2139
DEBLOCK_H_CHROMA_420_MBAFF
2140
%if ARCH_X86_64 == 0
2141
INIT_MMX mmx2
2142
DEBLOCK_H_CHROMA_420_MBAFF
2143
%endif
2144
2145
%macro DEBLOCK_H_CHROMA_422 0
2146
cglobal deblock_h_chroma_422, 5,8,8
2147
%if ARCH_X86_64
2148
%define cntr r7
2149
%else
2150
%define cntr dword r0m
2151
%endif
2152
CHROMA_H_START
2153
mov cntr, 32/mmsize
2154
.loop:
2155
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
2156
LOAD_MASK r2d, r3d
2157
movd m6, [r4] ; tc0
2158
punpcklbw m6, m6
2159
%if mmsize == 16
2160
punpcklbw m6, m6
2161
punpcklbw m6, m6
2162
%else
2163
pshufw m6, m6, q0000
2164
%endif
2165
pand m7, m6
2166
DEBLOCK_P0_Q0
2167
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
2168
lea r0, [r0+r1*(mmsize/2)]
2169
lea t5, [t5+r1*(mmsize/2)]
2170
add r4, mmsize/8
2171
dec cntr
2172
jg .loop
2173
RET
2174
%endmacro
2175
2176
INIT_MMX mmx2
2177
DEBLOCK_H_CHROMA_422
2178
INIT_XMM sse2
2179
DEBLOCK_H_CHROMA_422
2180
INIT_XMM avx
2181
DEBLOCK_H_CHROMA_422
2182
2183
; in: %1=p0 %2=p1 %3=q1
2184
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
2185
%macro CHROMA_INTRA_P0 3
2186
pxor m4, %1, %3
2187
pand m4, [pb_1] ; m4 = (p0^q1)&1
2188
pavgb %1, %3
2189
psubusb %1, m4
2190
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
2191
%endmacro
2192
2193
%define t5 r4
2194
%define t6 r5
2195
2196
%macro DEBLOCK_CHROMA_INTRA_BODY 0
2197
cglobal chroma_intra_body
2198
LOAD_MASK r2d, r3d
2199
mova m5, m1
2200
mova m6, m2
2201
CHROMA_INTRA_P0 m1, m0, m3
2202
CHROMA_INTRA_P0 m2, m3, m0
2203
psubb m1, m5
2204
psubb m2, m6
2205
pand m1, m7
2206
pand m2, m7
2207
paddb m1, m5
2208
paddb m2, m6
2209
ret
2210
%endmacro
2211
2212
%macro DEBLOCK_CHROMA_INTRA 0
2213
;-----------------------------------------------------------------------------
2214
; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
2215
;-----------------------------------------------------------------------------
2216
cglobal deblock_v_chroma_intra, 4,5,8
2217
CHROMA_V_START
2218
mova m0, [t5]
2219
mova m1, [t5+r1]
2220
mova m2, [r0]
2221
mova m3, [r0+r1]
2222
call chroma_intra_body
2223
mova [t5+r1], m1
2224
mova [r0], m2
2225
CHROMA_V_LOOP 0
2226
RET
2227
2228
;-----------------------------------------------------------------------------
2229
; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
2230
;-----------------------------------------------------------------------------
2231
cglobal deblock_h_chroma_intra, 4,6,8
2232
CHROMA_H_START
2233
%if mmsize==8
2234
mov dword r0m, 2
2235
.loop:
2236
%endif
2237
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
2238
call chroma_intra_body
2239
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
2240
CHROMA_H_LOOP 0
2241
RET
2242
2243
cglobal deblock_h_chroma_422_intra, 4,7,8
2244
CHROMA_H_START
2245
mov r6d, 32/mmsize
2246
.loop:
2247
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
2248
call chroma_intra_body
2249
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
2250
lea r0, [r0+r1*(mmsize/2)]
2251
lea t5, [t5+r1*(mmsize/2)]
2252
dec r6d
2253
jg .loop
2254
RET
2255
%endmacro ; DEBLOCK_CHROMA_INTRA
2256
2257
INIT_XMM sse2
2258
DEBLOCK_CHROMA_INTRA_BODY
2259
DEBLOCK_CHROMA_INTRA
2260
INIT_XMM avx
2261
DEBLOCK_CHROMA_INTRA_BODY
2262
DEBLOCK_CHROMA_INTRA
2263
INIT_MMX mmx2
2264
DEBLOCK_CHROMA_INTRA_BODY
2265
%if ARCH_X86_64 == 0
2266
DEBLOCK_CHROMA_INTRA
2267
%endif
2268
2269
;-----------------------------------------------------------------------------
2270
; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta )
2271
;-----------------------------------------------------------------------------
2272
INIT_MMX mmx2
2273
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
2274
CHROMA_H_START
2275
TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
2276
call chroma_intra_body
2277
TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
2278
RET
2279
%endif ; !HIGH_BIT_DEPTH
2280
2281
2282
2283
;-----------------------------------------------------------------------------
2284
; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
2285
; uint8_t bs[2][4][4], int mvy_limit, int bframe )
2286
;-----------------------------------------------------------------------------
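; Reference note: deblock_strength fills bs with the boundary strength of each 4x4
; edge, roughly
;   bs = 2  if either adjacent block has non-zero coefficients (nnz)
;   bs = 1  else if the refs differ, or an mv component differs by >= 4 quarter-pels
;           (>= mvy_limit for the vertical component)
;   bs = 0  otherwise
; Judging from the left/top-neighbour loads, bs[0] covers vertical edges and bs[1]
; horizontal edges.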
2287
2288
%define scan8start (4+1*8)
2289
%define nnz r0+scan8start
2290
%define ref r1+scan8start
2291
%define mv r2+scan8start*4
2292
%define bs0 r3
2293
%define bs1 r3+32
2294
2295
%macro LOAD_BYTES_MMX 1
2296
movd m2, [%1+8*0-1]
2297
movd m0, [%1+8*0]
2298
movd m3, [%1+8*2-1]
2299
movd m1, [%1+8*2]
2300
punpckldq m2, [%1+8*1-1]
2301
punpckldq m0, [%1+8*1]
2302
punpckldq m3, [%1+8*3-1]
2303
punpckldq m1, [%1+8*3]
2304
%endmacro
2305
2306
%macro DEBLOCK_STRENGTH_REFS_MMX 0
2307
LOAD_BYTES_MMX ref
2308
pxor m2, m0
2309
pxor m3, m1
2310
por m2, [bs0+0]
2311
por m3, [bs0+8]
2312
movq [bs0+0], m2
2313
movq [bs0+8], m3
2314
2315
movd m2, [ref-8*1]
2316
movd m3, [ref+8*1]
2317
punpckldq m2, m0 ; row -1, row 0
2318
punpckldq m3, m1 ; row 1, row 2
2319
pxor m0, m2
2320
pxor m1, m3
2321
por m0, [bs1+0]
2322
por m1, [bs1+8]
2323
movq [bs1+0], m0
2324
movq [bs1+8], m1
2325
%endmacro
2326
2327
%macro DEBLOCK_STRENGTH_MVS_MMX 2
2328
mova m0, [mv-%2]
2329
mova m1, [mv-%2+8]
2330
psubw m0, [mv]
2331
psubw m1, [mv+8]
2332
packsswb m0, m1
2333
ABSB m0, m1
2334
psubusb m0, m7
2335
packsswb m0, m0
2336
por m0, [%1]
2337
movd [%1], m0
2338
%endmacro
2339
2340
%macro DEBLOCK_STRENGTH_NNZ_MMX 1
2341
por m2, m0
2342
por m3, m1
2343
mova m4, [%1]
2344
mova m5, [%1+8]
2345
pminub m2, m6
2346
pminub m3, m6
2347
pminub m4, m6 ; mv ? 1 : 0
2348
pminub m5, m6
2349
paddb m2, m2 ; nnz ? 2 : 0
2350
paddb m3, m3
2351
pmaxub m2, m4
2352
pmaxub m3, m5
2353
%endmacro
2354
2355
%macro LOAD_BYTES_XMM 1
2356
movu m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
2357
movu m1, [%1+12]
2358
pslldq m0, m2, 1
2359
shufps m2, m1, q3131 ; cur nnz, all rows
2360
pslldq m1, 1
2361
shufps m0, m1, q3131 ; left neighbors
2362
pslldq m1, m2, 4
2363
movd m3, [%1-8] ; could be palignr if nnz was aligned
2364
por m1, m3 ; top neighbors
2365
%endmacro
2366
2367
INIT_MMX mmx2
2368
cglobal deblock_strength, 6,6
2369
; Prepare mv comparison register
2370
shl r4d, 8
2371
add r4d, 3 - (1<<8)
2372
movd m7, r4d
2373
SPLATW m7, m7
2374
mova m6, [pb_1]
2375
pxor m0, m0
2376
mova [bs0+0], m0
2377
mova [bs0+8], m0
2378
mova [bs1+0], m0
2379
mova [bs1+8], m0
2380
2381
.lists:
2382
DEBLOCK_STRENGTH_REFS_MMX
2383
mov r4d, 4
2384
.mvs:
2385
DEBLOCK_STRENGTH_MVS_MMX bs0, 4
2386
DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
2387
add r2, 4*8
2388
add r3, 4
2389
dec r4d
2390
jg .mvs
2391
add r1, 40
2392
add r2, 4*8
2393
sub r3, 16
2394
dec r5d
2395
jge .lists
2396
2397
; Check nnz
2398
LOAD_BYTES_MMX nnz
2399
DEBLOCK_STRENGTH_NNZ_MMX bs0
2400
; Transpose column output
2401
SBUTTERFLY bw, 2, 3, 4
2402
SBUTTERFLY bw, 2, 3, 4
2403
mova [bs0+0], m2
2404
mova [bs0+8], m3
2405
movd m2, [nnz-8*1]
2406
movd m3, [nnz+8*1]
2407
punpckldq m2, m0 ; row -1, row 0
2408
punpckldq m3, m1 ; row 1, row 2
2409
DEBLOCK_STRENGTH_NNZ_MMX bs1
2410
mova [bs1+0], m2
2411
mova [bs1+8], m3
2412
RET
2413
2414
%macro DEBLOCK_STRENGTH_XMM 0
2415
cglobal deblock_strength, 6,6,7
2416
; Prepare mv comparison register
2417
shl r4d, 8
2418
add r4d, 3 - (1<<8)
2419
movd m6, r4d
2420
SPLATW m6, m6
2421
pxor m4, m4 ; bs0
2422
pxor m5, m5 ; bs1
2423
2424
.lists:
2425
; Check refs
2426
LOAD_BYTES_XMM ref
2427
pxor m0, m2
2428
pxor m1, m2
2429
por m4, m0
2430
por m5, m1
2431
2432
; Check mvs
2433
%if cpuflag(ssse3)
2434
mova m0, [mv+4*8*0]
2435
mova m1, [mv+4*8*1]
2436
palignr m3, m0, [mv+4*8*0-16], 12
2437
palignr m2, m1, [mv+4*8*1-16], 12
2438
psubw m0, m3
2439
psubw m1, m2
2440
packsswb m0, m1
2441
2442
mova m2, [mv+4*8*2]
2443
mova m1, [mv+4*8*3]
2444
palignr m3, m2, [mv+4*8*2-16], 12
2445
psubw m2, m3
2446
palignr m3, m1, [mv+4*8*3-16], 12
2447
psubw m1, m3
2448
packsswb m2, m1
2449
%else
2450
movu m0, [mv-4+4*8*0]
2451
movu m1, [mv-4+4*8*1]
2452
movu m2, [mv-4+4*8*2]
2453
movu m3, [mv-4+4*8*3]
2454
psubw m0, [mv+4*8*0]
2455
psubw m1, [mv+4*8*1]
2456
psubw m2, [mv+4*8*2]
2457
psubw m3, [mv+4*8*3]
2458
packsswb m0, m1
2459
packsswb m2, m3
2460
%endif
2461
ABSB m0, m1
2462
ABSB m2, m3
2463
psubusb m0, m6
2464
psubusb m2, m6
2465
packsswb m0, m2
2466
por m4, m0
2467
2468
mova m0, [mv+4*8*-1]
2469
mova m1, [mv+4*8* 0]
2470
mova m2, [mv+4*8* 1]
2471
mova m3, [mv+4*8* 2]
2472
psubw m0, m1
2473
psubw m1, m2
2474
psubw m2, m3
2475
psubw m3, [mv+4*8* 3]
2476
packsswb m0, m1
2477
packsswb m2, m3
2478
ABSB m0, m1
2479
ABSB m2, m3
2480
psubusb m0, m6
2481
psubusb m2, m6
2482
packsswb m0, m2
2483
por m5, m0
2484
add r1, 40
2485
add r2, 4*8*5
2486
dec r5d
2487
jge .lists
2488
2489
; Check nnz
2490
LOAD_BYTES_XMM nnz
2491
por m0, m2
2492
por m1, m2
2493
mova m6, [pb_1]
2494
pminub m0, m6
2495
pminub m1, m6
2496
pminub m4, m6 ; mv ? 1 : 0
2497
pminub m5, m6
2498
paddb m0, m0 ; nnz ? 2 : 0
2499
paddb m1, m1
2500
pmaxub m4, m0
2501
pmaxub m5, m1
2502
%if cpuflag(ssse3)
2503
pshufb m4, [transpose_shuf]
2504
%else
2505
movhlps m3, m4
2506
punpcklbw m4, m3
2507
movhlps m3, m4
2508
punpcklbw m4, m3
2509
%endif
2510
mova [bs1], m5
2511
mova [bs0], m4
2512
RET
2513
%endmacro
2514
2515
INIT_XMM sse2
2516
DEBLOCK_STRENGTH_XMM
2517
INIT_XMM ssse3
2518
DEBLOCK_STRENGTH_XMM
2519
INIT_XMM avx
2520
DEBLOCK_STRENGTH_XMM
2521
2522
%macro LOAD_BYTES_YMM 1
2523
movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
2524
pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
2525
mova m2, [insert_top_shuf]
2526
vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2
2527
vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS
2528
vpbroadcastd m2, [%1-8] ; ABCD ....
2529
vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
2530
%endmacro
2531
2532
INIT_YMM avx2
2533
cglobal deblock_strength, 6,6,7
2534
; Prepare mv comparison register
2535
shl r4d, 8
2536
add r4d, 3 - (1<<8)
2537
movd xm6, r4d
2538
vpbroadcastw m6, xm6
2539
pxor m5, m5 ; bs0,bs1
2540
2541
.lists:
2542
; Check refs
2543
LOAD_BYTES_YMM ref
2544
pxor m0, m1
2545
por m5, m0
2546
2547
; Check mvs
2548
movu xm0, [mv-4+4*8*0]
2549
vinserti128 m0, m0, [mv+4*8*-1], 1
2550
vbroadcasti128 m2, [mv+4*8* 0]
2551
vinserti128 m1, m2, [mv-4+4*8*1], 0
2552
vbroadcasti128 m3, [mv+4*8* 1]
2553
psubw m0, m2
2554
psubw m1, m3
2555
2556
vinserti128 m2, m3, [mv-4+4*8*2], 0
2557
vbroadcasti128 m4, [mv+4*8* 2]
2558
vinserti128 m3, m4, [mv-4+4*8*3], 0
2559
psubw m2, m4
2560
vbroadcasti128 m4, [mv+4*8* 3]
2561
psubw m3, m4
2562
packsswb m0, m1
2563
packsswb m2, m3
2564
pabsb m0, m0
2565
pabsb m2, m2
2566
psubusb m0, m6
2567
psubusb m2, m6
2568
packsswb m0, m2
2569
por m5, m0
2570
2571
add r1, 40
2572
add r2, 4*8*5
2573
dec r5d
2574
jge .lists
2575
2576
; Check nnz
2577
LOAD_BYTES_YMM nnz
2578
por m0, m1
2579
mova m6, [pb_1]
2580
pminub m0, m6
2581
pminub m5, m6 ; mv ? 1 : 0
2582
paddb m0, m0 ; nnz ? 2 : 0
2583
pmaxub m5, m0
2584
vextracti128 [bs1], m5, 1
2585
pshufb xm5, [transpose_shuf]
2586
mova [bs0], xm5
2587
RET
2588
2589