;*****************************************************************************
;* quant-a.asm: x86 quantization and level-run
;*****************************************************************************
;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Christian Heine <sennindemokrit@gmx.net>
;* Oskar Arvidsson <oskar@irock.se>
;* Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
%endmacro
%macro DQM8 6
dw %1, %4, %5, %4, %1, %4, %5, %4
dw %4, %2, %6, %2, %4, %2, %6, %2
dw %5, %6, %3, %6, %5, %6, %3, %6
dw %4, %2, %6, %2, %4, %2, %6, %2
%endmacro

dequant4_scale:
DQM4 10, 13, 16
DQM4 11, 14, 18
DQM4 13, 16, 20
DQM4 14, 18, 23
DQM4 16, 20, 25
DQM4 18, 23, 29

dequant8_scale:
DQM8 20, 18, 32, 19, 25, 24
DQM8 22, 19, 35, 21, 28, 26
DQM8 26, 23, 42, 24, 33, 31
DQM8 28, 25, 45, 26, 35, 33
DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43

decimate_mask_table4:
db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24

chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1

%if HIGH_BIT_DEPTH==0
dct_coef_shuffle:
%macro DCT_COEF_SHUFFLE 8
%assign y x
%rep 8
%rep 7
%rotate (~(y>>7))&1
%assign y y<<((~(y>>7))&1)
%endrep
db %1*2
%rotate 1
%assign y y<<1
%endrep
%endmacro
%assign x 0
%rep 256
DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
%assign x x+1
%endrep
%endif

SECTION .text

cextern pb_1
cextern pw_1
cextern pw_2
cextern pw_256
cextern pd_1
cextern pb_01
cextern pd_1024
cextern deinterleave_shufd
cextern popcnt_table

%macro QUANT_DC_START 2
movd xm%1, r1m ; mf
movd xm%2, r2m ; bias
%if cpuflag(avx2)
vpbroadcastdct m%1, xm%1
vpbroadcastdct m%2, xm%2
%elif HIGH_BIT_DEPTH
SPLATD m%1, m%1
SPLATD m%2, m%2
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
mova m5, [pb_01]
pshufb m%1, m5
pshufb m%2, m5
%else
SPLATW m%1, m%1
SPLATW m%2, m%2
%endif
%endmacro

%macro QUANT_END 0
xor eax, eax
%if cpuflag(sse4)
ptest m5, m5
%else ; !sse4
%if ARCH_X86_64
%if mmsize == 16
packsswb m5, m5
%endif
movq rcx, m5
test rcx, rcx
%else
%if mmsize == 16
pxor m4, m4
pcmpeqb m5, m4
pmovmskb ecx, m5
cmp ecx, (1<<mmsize)-1
%else
packsswb m5, m5
movd ecx, m5
test ecx, ecx
%endif
%endif
%endif ; cpuflag
setne al
%endmacro

%if HIGH_BIT_DEPTH
%macro QUANT_ONE_DC 4
%if cpuflag(sse4)
mova m0, [%1]
ABSD m1, m0
paddd m1, %3
pmulld m1, %2
psrad m1, 16
%else ; !sse4
mova m0, [%1]
ABSD m1, m0
paddd m1, %3
mova m2, m1
psrlq m2, 32
pmuludq m1, %2
pmuludq m2, %2
psllq m2, 32
paddd m1, m2
psrld m1, 16
%endif ; cpuflag
PSIGND m1, m0
mova [%1], m1
ACCUM por, 5, 1, %4
%endmacro

%macro QUANT_TWO_DC 4
%if cpuflag(sse4)
mova m0, [%1 ]
mova m1, [%1+mmsize]
ABSD m2, m0
ABSD m3, m1
paddd m2, %3
paddd m3, %3
pmulld m2, %2
pmulld m3, %2
psrad m2, 16
psrad m3, 16
PSIGND m2, m0
PSIGND m3, m1
mova [%1 ], m2
mova [%1+mmsize], m3
ACCUM por, 5, 2, %4
por m5, m3
%else ; !sse4
QUANT_ONE_DC %1, %2, %3, %4
QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
%endif ; cpuflag
%endmacro

%macro QUANT_ONE_AC_MMX 5
mova m0, [%1]
mova m2, [%2]
ABSD m1, m0
mova m4, m2
paddd m1, [%3]
mova m3, m1
psrlq m4, 32
psrlq m3, 32
pmuludq m1, m2
pmuludq m3, m4
psllq m3, 32
paddd m1, m3
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
ACCUM por, %5, 1, %4
%endmacro

%macro QUANT_TWO_AC 5
%if cpuflag(sse4)
mova m0, [%1 ]
mova m1, [%1+mmsize]
ABSD m2, m0
ABSD m3, m1
paddd m2, [%3 ]
paddd m3, [%3+mmsize]
pmulld m2, [%2 ]
pmulld m3, [%2+mmsize]
psrad m2, 16
psrad m3, 16
PSIGND m2, m0
PSIGND m3, m1
mova [%1 ], m2
mova [%1+mmsize], m3
ACCUM por, %5, 2, %4
por m%5, m3
%else ; !sse4
QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5
%endif ; cpuflag
%endmacro

;-----------------------------------------------------------------------------
; int quant_2x2( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2
cglobal quant_%1x%2_dc, 3,3,8
QUANT_DC_START 6,7
%if %1*%2 <= mmsize/4
QUANT_ONE_DC r0, m6, m7, 0
%else
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_DC r0+x, m6, m7, x
%assign x x+mmsize*2
%endrep
%endif
QUANT_END
RET
%endmacro

;-----------------------------------------------------------------------------
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal quant_%1x%2, 3,3,8
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
%assign x x+mmsize*2
%endrep
QUANT_END
RET
%endmacro

%macro QUANT_4x4 2
QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2
QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2
%endmacro

%macro QUANT_4x4x4 0
cglobal quant_4x4x4, 3,3,8
QUANT_4x4 0, 5
QUANT_4x4 64, 6
add r0, 128
packssdw m5, m6
QUANT_4x4 0, 6
QUANT_4x4 64, 7
packssdw m6, m7
packssdw m5, m6 ; AAAA BBBB CCCC DDDD
pxor m4, m4
pcmpeqd m5, m4
movmskps eax, m5
xor eax, 0xf
RET
%endmacro

INIT_XMM sse2
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8
QUANT_4x4x4

INIT_XMM ssse3
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8
QUANT_4x4x4

INIT_XMM sse4
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8
QUANT_4x4x4

INIT_YMM avx2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8

INIT_YMM avx2
cglobal quant_4x4x4, 3,3,6
QUANT_TWO_AC r0, r1, r2, 0, 4
QUANT_TWO_AC r0+64, r1, r2, 0, 5
add r0, 128
packssdw m4, m5
QUANT_TWO_AC r0, r1, r2, 0, 5
QUANT_TWO_AC r0+64, r1, r2, 0, 1
packssdw m5, m1
packssdw m4, m5
pxor m3, m3
pcmpeqd m4, m3
movmskps eax, m4
mov edx, eax
shr eax, 4
and eax, edx
xor eax, 0xf
RET

%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
%macro QUANT_ONE 5
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
mova m1, %1 ; load dct coeffs
ABSW m0, m1, sign
paddusw m0, %3 ; round
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
ACCUM por, %5, 0, %4
%endmacro
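; Scalar sketch of the per-coefficient operation above (the SIMD code is the
; authoritative definition):
;   z      = (abs(dct[i]) + bias[i]) * mf[i] >> 16   ; pmulhuw keeps the high word
;   dct[i] = dct[i] < 0 ? -z : z                     ; PSIGNW restores the sign
; The OR-accumulator register (m5 in most callers) ends up nonzero iff any
; quantized coefficient is nonzero, which QUANT_END turns into the return value.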

%macro QUANT_TWO 8
mova m1, %1
mova m3, %2
ABSW m0, m1, sign
ABSW m2, m3, sign
paddusw m0, %5
paddusw m2, %6
pmulhuw m0, %3
pmulhuw m2, %4
PSIGNW m0, m1
PSIGNW m2, m3
mova %1, m0
mova %2, m2
ACCUM por, %8, 0, %7
ACCUM por, %8, 2, %7+mmsize
%endmacro

;-----------------------------------------------------------------------------
; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2-3 0
cglobal %1, 1,1,%3
%if %2==1
QUANT_DC_START 2,3
QUANT_ONE [r0], m2, m3, 0, 5
%else
QUANT_DC_START 4,6
%assign x 0
%rep %2/2
QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5
%assign x x+mmsize*2
%endrep
%endif
QUANT_END
RET
%endmacro

;-----------------------------------------------------------------------------
; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
%if %2==1
QUANT_ONE [r0], [r1], [r2], 0, 5
%else
%assign x 0
%rep %2/2
QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
%assign x x+mmsize*2
%endrep
%endif
QUANT_END
RET
%endmacro

%macro QUANT_4x4 2
%if UNIX64
QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
%else
QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
%if mmsize==8
QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
%endif
%endif
%endmacro

%macro QUANT_4x4x4 0
cglobal quant_4x4x4, 3,3,7
%if UNIX64
mova m8, [r1+mmsize*0]
mova m9, [r1+mmsize*1]
mova m10, [r2+mmsize*0]
mova m11, [r2+mmsize*1]
%endif
QUANT_4x4 0, 4
QUANT_4x4 32, 5
packssdw m4, m5
QUANT_4x4 64, 5
QUANT_4x4 96, 6
packssdw m5, m6
packssdw m4, m5 ; AAAA BBBB CCCC DDDD
pxor m3, m3
pcmpeqd m4, m3
movmskps eax, m4
xor eax, 0xf
RET
%endmacro

INIT_MMX mmx2
QUANT_DC quant_2x2_dc, 1
%if ARCH_X86_64 == 0 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
INIT_MMX mmx2
QUANT_AC quant_4x4, 4
QUANT_AC quant_8x8, 16
%endif

INIT_XMM sse2
QUANT_DC quant_4x4_dc, 2, 7
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_4x4x4

INIT_XMM ssse3
QUANT_DC quant_4x4_dc, 2, 7
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_4x4x4

INIT_MMX ssse3
QUANT_DC quant_2x2_dc, 1

INIT_XMM sse4
;Not faster on Conroe, so only used in SSE4 versions
QUANT_DC quant_4x4_dc, 2, 7
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8

INIT_YMM avx2
QUANT_AC quant_4x4, 1
QUANT_AC quant_8x8, 4
QUANT_DC quant_4x4_dc, 1, 6

INIT_YMM avx2
cglobal quant_4x4x4, 3,3,6
mova m2, [r1]
mova m3, [r2]
QUANT_ONE [r0+ 0], m2, m3, 0, 4
QUANT_ONE [r0+32], m2, m3, 0, 5
packssdw m4, m5
QUANT_ONE [r0+64], m2, m3, 0, 5
QUANT_ONE [r0+96], m2, m3, 0, 1
packssdw m5, m1
packssdw m4, m5
pxor m3, m3
pcmpeqd m4, m3
movmskps eax, m4
mov edx, eax
shr eax, 4
and eax, edx
xor eax, 0xf
RET
%endif ; !HIGH_BIT_DEPTH


;=============================================================================
; dequant
;=============================================================================

%macro DEQUANT16_L 4
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
%if HIGH_BIT_DEPTH
mova m0, %1
mova m1, %4
pmaddwd m0, %2
pmaddwd m1, %3
pslld m0, xm2
pslld m1, xm2
mova %1, m0
mova %4, m1
%else
mova m0, %2
packssdw m0, %3
%if mmsize==32
vpermq m0, m0, q3120
%endif
pmullw m0, %1
psllw m0, xm2
mova %1, m0
%endif
%endmacro

%macro DEQUANT32_R 4
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 -i_qbits
;;; m3 f
;;; m4 0
%if HIGH_BIT_DEPTH
mova m0, %1
mova m1, %4
pmadcswd m0, m0, %2, m3
pmadcswd m1, m1, %3, m3
psrad m0, xm2
psrad m1, xm2
mova %1, m0
mova %4, m1
%else
%if mmsize == 32
pmovzxwd m0, %1
pmovzxwd m1, %4
%else
mova m0, %1
punpckhwd m1, m0, m4
punpcklwd m0, m4
%endif
pmadcswd m0, m0, %2, m3
pmadcswd m1, m1, %3, m3
psrad m0, xm2
psrad m1, xm2
packssdw m0, m1
%if mmsize == 32
vpermq m0, m0, q3120
%endif
mova %1, m0
%endif
%endmacro

%macro DEQUANT_LOOP 3
%if 8*(%2-2*%3) > 0
mov t0d, 8*(%2-2*%3)
%%loop:
%1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL]
%1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL]
sub t0d, 16*%3
jge %%loop
RET
%else
%if mmsize < 32
%1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL]
%endif
%1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL]
RET
%endif
%endmacro

%macro DEQUANT16_FLAT 2-5
mova m0, %1
psllw m0, m4
%assign i %0-2
%rep %0-1
%if i
mova m %+ i, [r0+%2]
pmullw m %+ i, m0
%else
pmullw m0, [r0+%2]
%endif
mova [r0+%2], m %+ i
%assign i i-1
%rotate 1
%endrep
%endmacro

%if ARCH_X86_64
DECLARE_REG_TMP 6,3,2
%else
DECLARE_REG_TMP 2,0,1
%endif

%macro DEQUANT_START 2
movifnidn t2d, r2m
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
lea t1d, [t0*5]
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%if ARCH_X86_64
add r1, t2 ; dequant_mf[i_mf]
%else
add r1, r1mp ; dequant_mf[i_mf]
mov r0, r0mp ; dct
%endif
sub t0d, %2
jl .rshift32 ; negative qbits => rightshift
%endmacro

;-----------------------------------------------------------------------------
; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
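; Scalar sketch of what the dequant paths below compute (not authoritative):
;   i_qbits = i_qp/6 - 4 for 4x4 blocks (i_qp/6 - 6 for 8x8)
;   i_qbits >= 0:  dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits
;   i_qbits <  0:  dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits)
;                  with f = 1 << (-i_qbits-1)  (the .rshift32 path)
; DEQUANT_START evaluates i_qp/6 as (i_qp*0x2b)>>8: 43/256 is close enough to
; 1/6 that the division is exact for 0 <= i_qp < 128, which covers the valid
; qp range.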
%macro DEQUANT 3
cglobal dequant_%1x%1, 0,3,6
.skip_prologue:
DEQUANT_START %2+2, %2

.lshift:
movd xm2, t0d
DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3

.rshift32:
neg t0d
mova m3, [pd_1]
movd xm2, t0d
pslld m3, xm2
pxor m4, m4
psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3

%if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32)
cglobal dequant_%1x%1_flat16, 0,3
movifnidn t2d, r2m
%if %1 == 8
cmp t2d, 12
jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
sub t2d, 12
%endif
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
lea t1d, [t0*5]
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %2
%ifdef PIC
lea r1, [dequant%1_scale]
add r1, t2
%else
lea r1, [dequant%1_scale + t2]
%endif
movifnidn r0, r0mp
movd xm4, t0d
%if %1 == 4
%if mmsize == 8
DEQUANT16_FLAT [r1], 0, 16
DEQUANT16_FLAT [r1+8], 8, 24
%elif mmsize == 16
DEQUANT16_FLAT [r1], 0, 16
%else
vbroadcasti128 m0, [r1]
psllw m0, xm4
pmullw m0, [r0]
mova [r0], m0
%endif
%elif mmsize == 8
DEQUANT16_FLAT [r1], 0, 8, 64, 72
DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
%elif mmsize == 16
DEQUANT16_FLAT [r1], 0, 64
DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
DEQUANT16_FLAT [r1+32], 32, 96
%else
mova m1, [r1+ 0]
mova m2, [r1+32]
psllw m1, xm4
psllw m2, xm4
pmullw m0, m1, [r0+ 0]
pmullw m3, m2, [r0+32]
pmullw m4, m1, [r0+64]
pmullw m5, m2, [r0+96]
mova [r0+ 0], m0
mova [r0+32], m3
mova [r0+64], m4
mova [r0+96], m5
%endif
RET
%endif ; !HIGH_BIT_DEPTH && !AVX
%endmacro ; DEQUANT

%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
INIT_XMM xop
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
INIT_YMM avx2
DEQUANT 4, 4, 4
DEQUANT 8, 6, 4
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
%endif
INIT_XMM sse2
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
INIT_XMM avx
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
INIT_XMM xop
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
INIT_YMM avx2
DEQUANT 4, 4, 4
DEQUANT 8, 6, 4
%endif

%macro DEQUANT_DC 2
cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6

.lshift:
%if cpuflag(avx2)
vpbroadcastdct m3, [r1]
%else
movd xm3, [r1]
SPLAT%1 m3, xm3
%endif
movd xm2, t0d
pslld m3, xm2
%assign %%x 0
%rep SIZEOF_PIXEL*32/mmsize
%2 m0, m3, [r0+%%x]
mova [r0+%%x], m0
%assign %%x %%x+mmsize
%endrep
RET

.rshift32:
neg t0d
%if cpuflag(avx2)
vpbroadcastdct m2, [r1]
%else
movd xm2, [r1]
%endif
mova m5, [p%1_1]
movd xm3, t0d
pslld m4, m5, xm3
psrld m4, 1
%if HIGH_BIT_DEPTH
%if notcpuflag(avx2)
pshufd m2, m2, 0
%endif
%assign %%x 0
%rep SIZEOF_PIXEL*32/mmsize
pmadcswd m0, m2, [r0+%%x], m4
psrad m0, xm3
mova [r0+%%x], m0
%assign %%x %%x+mmsize
%endrep

%else ; !HIGH_BIT_DEPTH
%if notcpuflag(avx2)
PSHUFLW m2, m2, 0
%endif
punpcklwd m2, m4
%assign %%x 0
%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+%%x]
punpckhwd m1, m0, m5
punpcklwd m0, m5
pmaddwd m0, m2
pmaddwd m1, m2
psrad m0, xm3
psrad m1, xm3
packssdw m0, m1
mova [r0+%%x], m0
%assign %%x %%x+mmsize
%endrep
%endif ; !HIGH_BIT_DEPTH
RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT_DC d, pmaddwd
INIT_XMM xop
DEQUANT_DC d, pmaddwd
INIT_YMM avx2
DEQUANT_DC d, pmaddwd
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEQUANT_DC w, pmullw
%endif
INIT_XMM sse2
DEQUANT_DC w, pmullw
INIT_XMM avx
DEQUANT_DC w, pmullw
INIT_YMM avx2
DEQUANT_DC w, pmullw
%endif

; t4 is eax for return value.
%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
%else
DECLARE_REG_TMP 4,1,2,3,0,5
%endif

;-----------------------------------------------------------------------------
; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------

%macro OPTIMIZE_CHROMA_2x2_DC 0
cglobal optimize_chroma_2x2_dc, 0,6-cpuflag(sse4),7
movifnidn t0, r0mp
movd m2, r1m
movq m1, [t0]
%if cpuflag(sse4)
pcmpeqb m4, m4
pslld m4, 11
%else
pxor m4, m4
%endif
%if cpuflag(ssse3)
mova m3, [chroma_dc_dct_mask]
mova m5, [chroma_dc_dmf_mask]
%else
mova m3, [chroma_dc_dct_mask_mmx]
mova m5, [chroma_dc_dmf_mask_mmx]
%endif
pshuflw m2, m2, 0
pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
punpcklqdq m2, m2
punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
PSIGNW m2, m5 ; + - - + - - + +
paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
punpcklwd m1, m1
psrad m2, 16 ; + - - +
mov t1d, 3
paddd m0, m6
xor t4d, t4d
%if notcpuflag(ssse3)
psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
%endif
%if cpuflag(sse4)
ptest m0, m4
%else
mova m6, m0
SWAP 0, 6
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
%endif
jz .ret ; if the DC coefficients already round to zero, terminate early
mova m3, m0
.outer_loop:
movsx t3d, word [t0+2*t1] ; dct[coeff]
pshufd m6, m1, q3333
pshufd m1, m1, q2100 ; move the next element to high dword
PSIGND m5, m2, m6
test t3d, t3d
jz .loop_end
.outer_loop_0:
mov t2d, t3d
sar t3d, 31
or t3d, 1
.inner_loop:
psubd m3, m5 ; coeff -= sign
pxor m6, m0, m3
%if cpuflag(sse4)
ptest m6, m4
%else
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
%endif
jz .round_coeff
paddd m3, m5 ; coeff += sign
mov t4d, 1
.loop_end:
dec t1d
jz .last_coeff
pshufd m2, m2, q1320 ; - + - + / - - + +
jg .outer_loop
.ret:
REP_RET
.round_coeff:
sub t2d, t3d
mov [t0+2*t1], t2w
jnz .inner_loop
jmp .loop_end
.last_coeff:
movsx t3d, word [t0]
punpcklqdq m2, m2 ; + + + +
PSIGND m5, m2, m1
test t3d, t3d
jnz .outer_loop_0
RET
%endmacro

%if HIGH_BIT_DEPTH == 0
INIT_XMM sse2
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM ssse3
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM sse4
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM avx
OPTIMIZE_CHROMA_2x2_DC
%endif ; !HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
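; Per-coefficient behaviour, roughly (scalar sketch):
;   sum[i] += abs(dct[i])
;   level   = max(abs(dct[i]) - offset[i], 0)
;   dct[i]  = dct[i] < 0 ? -level : level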
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,6
pxor m5, m5
movsxdifnidn r3, r3d
.loop:
mova m2, [r0+r3*4-2*mmsize]
mova m3, [r0+r3*4-1*mmsize]
ABSD m0, m2
ABSD m1, m3
paddd m4, m0, [r1+r3*4-2*mmsize]
psubd m0, [r2+r3*4-2*mmsize]
mova [r1+r3*4-2*mmsize], m4
paddd m4, m1, [r1+r3*4-1*mmsize]
psubd m1, [r2+r3*4-1*mmsize]
mova [r1+r3*4-1*mmsize], m4
pcmpgtd m4, m0, m5
pand m0, m4
pcmpgtd m4, m1, m5
pand m1, m4
PSIGND m0, m2
PSIGND m1, m3
mova [r0+r3*4-2*mmsize], m0
mova [r0+r3*4-1*mmsize], m1
sub r3d, mmsize/2
jg .loop
RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
INIT_XMM sse2
DENOISE_DCT
INIT_XMM ssse3
DENOISE_DCT
INIT_XMM avx
DENOISE_DCT
INIT_YMM avx2
DENOISE_DCT

%else ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,7
pxor m6, m6
movsxdifnidn r3, r3d
.loop:
mova m2, [r0+r3*2-2*mmsize]
mova m3, [r0+r3*2-1*mmsize]
ABSW m0, m2, sign
ABSW m1, m3, sign
psubusw m4, m0, [r2+r3*2-2*mmsize]
psubusw m5, m1, [r2+r3*2-1*mmsize]
PSIGNW m4, m2
PSIGNW m5, m3
mova [r0+r3*2-2*mmsize], m4
mova [r0+r3*2-1*mmsize], m5
punpcklwd m2, m0, m6
punpcklwd m3, m1, m6
punpckhwd m0, m6
punpckhwd m1, m6
paddd m2, [r1+r3*4-4*mmsize]
paddd m0, [r1+r3*4-3*mmsize]
paddd m3, [r1+r3*4-2*mmsize]
paddd m1, [r1+r3*4-1*mmsize]
mova [r1+r3*4-4*mmsize], m2
mova [r1+r3*4-3*mmsize], m0
mova [r1+r3*4-2*mmsize], m3
mova [r1+r3*4-1*mmsize], m1
sub r3, mmsize
jg .loop
RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
INIT_XMM sse2
DENOISE_DCT
INIT_XMM ssse3
DENOISE_DCT
INIT_XMM avx
DENOISE_DCT

INIT_YMM avx2
cglobal denoise_dct, 4,4,4
pxor m3, m3
movsxdifnidn r3, r3d
.loop:
mova m1, [r0+r3*2-mmsize]
pabsw m0, m1
psubusw m2, m0, [r2+r3*2-mmsize]
vpermq m0, m0, q3120
psignw m2, m1
mova [r0+r3*2-mmsize], m2
punpcklwd m1, m0, m3
punpckhwd m0, m3
paddd m1, [r1+r3*4-2*mmsize]
paddd m0, [r1+r3*4-1*mmsize]
mova [r1+r3*4-2*mmsize], m1
mova [r1+r3*4-1*mmsize], m0
sub r3, mmsize/2
jg .loop
RET

%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
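; Rough outline of the scoring rule (a sketch): scan from the last nonzero
; coefficient downwards; any |coeff| > 1 makes the score 9 immediately,
; otherwise each nonzero coefficient adds decimate_table[run of zeros
; preceding it] to the score.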

%macro DECIMATE_MASK 5
%if mmsize==16
%if HIGH_BIT_DEPTH
movdqa m0, [%3+ 0]
movdqa m1, [%3+32]
packssdw m0, [%3+16]
packssdw m1, [%3+48]
ABSW2 m0, m1, m0, m1, m3, m4
%else
ABSW m0, [%3+ 0], m3
ABSW m1, [%3+16], m4
%endif
packsswb m0, m1
pxor m2, m2
pcmpeqb m2, m0
pcmpgtb m0, %4
pmovmskb %1, m2
pmovmskb %2, m0
%else ; mmsize==8
%if HIGH_BIT_DEPTH
movq m0, [%3+ 0]
movq m1, [%3+16]
movq m2, [%3+32]
movq m3, [%3+48]
packssdw m0, [%3+ 8]
packssdw m1, [%3+24]
packssdw m2, [%3+40]
packssdw m3, [%3+56]
%else
movq m0, [%3+ 0]
movq m1, [%3+ 8]
movq m2, [%3+16]
movq m3, [%3+24]
%endif
ABSW2 m0, m1, m0, m1, m6, m7
ABSW2 m2, m3, m2, m3, m6, m7
packsswb m0, m1
packsswb m2, m3
pxor m4, m4
pxor m6, m6
pcmpeqb m4, m0
pcmpeqb m6, m2
pcmpgtb m0, %4
pcmpgtb m2, %4
pmovmskb %5, m4
pmovmskb %1, m6
shl %1, 8
or %1, %5
pmovmskb %5, m0
pmovmskb %2, m2
shl %2, 8
or %2, %5
%endif
%endmacro

cextern decimate_table4
cextern decimate_table8

%macro DECIMATE4x4 1

cglobal decimate_score%1, 1,3
%ifdef PIC
lea r4, [decimate_table4]
lea r5, [decimate_mask_table4]
%define table r4
%define mask_table r5
%else
%define table decimate_table4
%define mask_table decimate_mask_table4
%endif
DECIMATE_MASK edx, eax, r0, [pb_1], ecx
xor edx, 0xffff
je .ret
test eax, eax
jne .ret9
%if %1==15
shr edx, 1
%endif
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
je .ret
bsr ecx, ecx
shr edx, 1
shr edx, cl
tzcnt ecx, edx
shr edx, 1
shr edx, cl
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
.ret:
REP_RET
.ret9:
mov eax, 9
RET

%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
%endif
INIT_XMM sse2
DECIMATE4x4 15
DECIMATE4x4 16
INIT_XMM ssse3
DECIMATE4x4 15
DECIMATE4x4 16

; 2x gt1 output, 2x nz output, 1x mask
%macro DECIMATE_MASK64_AVX2 5
pabsw m0, [r0+ 0]
pabsw m2, [r0+32]
pabsw m1, [r0+64]
pabsw m3, [r0+96]
packsswb m0, m2
packsswb m1, m3
pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so
pcmpgtb m3, m1, %5 ; we can save latency by doing them here
pmovmskb %1, m2
pmovmskb %2, m3
or %1, %2
jne .ret9
vpermq m0, m0, q3120
vpermq m1, m1, q3120
pxor m4, m4
pcmpeqb m0, m4
pcmpeqb m1, m4
pmovmskb %3, m0
pmovmskb %4, m1
%endmacro

%macro DECIMATE8x8 0

%if ARCH_X86_64
cglobal decimate_score64, 1,5
%ifdef PIC
lea r4, [decimate_table8]
%define table r4
%else
%define table decimate_table8
%endif
mova m5, [pb_1]
%if mmsize==32
DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
shl r3, 32
or r1, r3
xor r1, -1
je .ret
%else
DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
test eax, eax
jne .ret9
DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
shl r2d, 16
or r1d, r2d
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
shl r2, 32
or eax, r3d
or r1, r2
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
shl r2, 48
or r1, r2
xor r1, -1
je .ret
add eax, r3d
jne .ret9
%endif
mov al, -6
.loop:
tzcnt rcx, r1
shr r1, cl
add al, byte [table + rcx]
jge .ret9
shr r1, 1
jne .loop
add al, 6
.ret:
REP_RET
.ret9:
mov eax, 9
RET

%else ; ARCH
%if mmsize == 8
cglobal decimate_score64, 1,6
%else
cglobal decimate_score64, 1,5
%endif
mova m5, [pb_1]
%if mmsize==32
DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
xor r3, -1
je .tryret
xor r4, -1
.cont:
%else
DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
test r2, r2
jne .ret9
DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
shl r4, 16
or r3, r4
DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
or r2, r1
DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
shl r1, 16
or r4, r1
xor r3, -1
je .tryret
xor r4, -1
.cont:
add r0, r2
jne .ret9
%endif
mov al, -6
.loop:
tzcnt ecx, r3
test r3, r3
je .largerun
shrd r3, r4, cl
shr r4, cl
add al, byte [decimate_table8 + ecx]
jge .ret9
shrd r3, r4, 1
shr r4, 1
test r3, r3
jne .loop
test r4, r4
jne .loop
add al, 6
.ret:
REP_RET
.tryret:
xor r4, -1
jne .cont
RET
.ret9:
mov eax, 9
RET
.largerun:
mov r3, r4
xor r4, r4
tzcnt ecx, r3
shr r3, cl
shr r3, 1
jne .loop
add al, 6
RET
%endif ; ARCH

%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE8x8
%endif
INIT_XMM sse2
DECIMATE8x8
INIT_XMM ssse3
DECIMATE8x8
INIT_YMM avx2
DECIMATE8x8

;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
;-----------------------------------------------------------------------------
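; (Sketch) coeff_last returns the index of the last (highest-index) nonzero
; coefficient. The implementations below build a byte mask of zero
; coefficients with pcmpeqb/pmovmskb, invert it, and locate the top set bit
; with bsr/lzcnt.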

%macro BSR 3
%if cpuflag(lzcnt)
lzcnt %1, %2
xor %1, %3
%else
bsr %1, %2
%endif
%endmacro

%macro LZCOUNT 3
%if cpuflag(lzcnt)
lzcnt %1, %2
%else
bsr %1, %2
xor %1, %3
%endif
%endmacro

%if HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
%if %1 == 4
movq mm0, [%3]
packssdw mm0, [%3+8]
packsswb mm0, mm0
pcmpeqb mm0, mm2
pmovmskb %2, mm0
%elif mmsize == 16
movdqa xmm0, [%3+ 0]
%if %1 == 8
packssdw xmm0, [%3+16]
packsswb xmm0, xmm0
%else
movdqa xmm1, [%3+32]
packssdw xmm0, [%3+16]
packssdw xmm1, [%3+48]
packsswb xmm0, xmm1
%endif
pcmpeqb xmm0, xmm2
pmovmskb %2, xmm0
%elif %1 == 8
movq mm0, [%3+ 0]
movq mm1, [%3+16]
packssdw mm0, [%3+ 8]
packssdw mm1, [%3+24]
packsswb mm0, mm1
pcmpeqb mm0, mm2
pmovmskb %2, mm0
%else
movq mm0, [%3+ 0]
movq mm1, [%3+16]
packssdw mm0, [%3+ 8]
packssdw mm1, [%3+24]
movq mm3, [%3+32]
movq mm4, [%3+48]
packssdw mm3, [%3+40]
packssdw mm4, [%3+56]
packsswb mm0, mm1
packsswb mm3, mm4
pcmpeqb mm0, mm2
pcmpeqb mm3, mm2
pmovmskb %2, mm0
pmovmskb %4, mm3
shl %4, 8
or %2, %4
%endif
%endmacro

%macro COEFF_LAST4 0
cglobal coeff_last4, 1,3
pxor mm2, mm2
LAST_MASK 4, r1d, r0
xor r1d, 0xff
shr r1d, 4
BSR eax, r1d, 0x1f
RET
%endmacro

INIT_MMX mmx2
COEFF_LAST4
INIT_MMX mmx2, lzcnt
COEFF_LAST4

%macro COEFF_LAST8 0
cglobal coeff_last8, 1,3
pxor m2, m2
LAST_MASK 8, r1d, r0
%if mmsize == 16
xor r1d, 0xffff
shr r1d, 8
%else
xor r1d, 0xff
%endif
BSR eax, r1d, 0x1f
RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST8
%endif
INIT_XMM sse2
COEFF_LAST8
INIT_XMM sse2, lzcnt
COEFF_LAST8

%else ; !HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
%if %1 <= 8
movq mm0, [%3+ 0]
%if %1 == 4
packsswb mm0, mm0
%else
packsswb mm0, [%3+ 8]
%endif
pcmpeqb mm0, mm2
pmovmskb %2, mm0
%elif mmsize == 16
movdqa xmm0, [%3+ 0]
packsswb xmm0, [%3+16]
pcmpeqb xmm0, xmm2
pmovmskb %2, xmm0
%else
movq mm0, [%3+ 0]
movq mm1, [%3+16]
packsswb mm0, [%3+ 8]
packsswb mm1, [%3+24]
pcmpeqb mm0, mm2
pcmpeqb mm1, mm2
pmovmskb %2, mm0
pmovmskb %4, mm1
shl %4, 8
or %2, %4
%endif
%endmacro

%macro COEFF_LAST48 0
%if ARCH_X86_64
cglobal coeff_last4, 1,1
BSR rax, [r0], 0x3f
shr eax, 4
RET
%else
cglobal coeff_last4, 0,3
mov edx, r0mp
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
BSR eax, eax, 0x1f
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif

cglobal coeff_last8, 1,3
pxor m2, m2
LAST_MASK 8, r1d, r0, r2d
xor r1d, 0xff
BSR eax, r1d, 0x1f
RET
%endmacro

INIT_MMX mmx2
COEFF_LAST48
INIT_MMX mmx2, lzcnt
COEFF_LAST48
%endif ; HIGH_BIT_DEPTH

%macro COEFF_LAST 0
cglobal coeff_last15, 1,3
pxor m2, m2
LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
xor r1d, 0xffff
BSR eax, r1d, 0x1f
dec eax
RET

cglobal coeff_last16, 1,3
pxor m2, m2
LAST_MASK 16, r1d, r0, r2d
xor r1d, 0xffff
BSR eax, r1d, 0x1f
RET

%if ARCH_X86_64 == 0
cglobal coeff_last64, 1, 4-mmsize/16
pxor m2, m2
LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d
LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d
shl r2d, 16
or r1d, r2d
xor r1d, -1
jne .secondhalf
LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d
LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d
shl r2d, 16
or r1d, r2d
not r1d
BSR eax, r1d, 0x1f
RET
.secondhalf:
BSR eax, r1d, 0x1f
add eax, 32
RET
%else
cglobal coeff_last64, 1,3
pxor m2, m2
LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
shl r2d, 16
or r1d, r2d
LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32
LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
shl r0d, 16
or r2d, r0d
shl r2, 32
or r1, r2
not r1
BSR rax, r1, 0x3f
RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST
%endif
INIT_XMM sse2
COEFF_LAST
INIT_XMM sse2, lzcnt
COEFF_LAST

%macro LAST_MASK_AVX2 2
%if HIGH_BIT_DEPTH
mova m0, [%2+ 0]
packssdw m0, [%2+32]
mova m1, [%2+64]
packssdw m1, [%2+96]
packsswb m0, m1
mova m1, [deinterleave_shufd]
vpermd m0, m1, m0
%else
mova m0, [%2+ 0]
packsswb m0, [%2+32]
vpermq m0, m0, q3120
%endif
pcmpeqb m0, m2
pmovmskb %1, m0
%endmacro

%if ARCH_X86_64 == 0
INIT_YMM avx2,lzcnt
cglobal coeff_last64, 1,2
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
xor r1d, -1
jne .secondhalf
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
not r1d
BSR eax, r1d, 0x1f
RET
.secondhalf:
BSR eax, r1d, 0x1f
add eax, 32
RET
%else
INIT_YMM avx2,lzcnt
cglobal coeff_last64, 1,3
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32
shl r2, 32
or r1, r2
not r1
BSR rax, r1, 0x3f
RET
%endif

;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
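; Sketch of the contract (based on how the struct below is filled in):
;   runlevel->last  = index of the last nonzero coefficient
;   runlevel->mask  = bitmask of the nonzero coefficient positions
;   runlevel->level = the nonzero values, from the last coefficient downwards
;   return value    = number of nonzero coefficients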

struc levelrun
.last: resd 1
.mask: resd 1
align 16, resb 1
.level: resw 16
endstruc

; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%if WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
DECLARE_REG_TMP 6,3,2,1,4,5,0
%endif

%macro COEFF_LEVELRUN 1
cglobal coeff_level_run%1,0,7
movifnidn t0, r0mp
movifnidn t1, r1mp
pxor m2, m2
xor t3d, t3d
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
%if %1==15
shr t5d, 1
%elif %1==8
and t5d, 0xff
%elif %1==4
and t5d, 0xf
%endif
xor t5d, (1<<%1)-1
mov [t1+levelrun.mask], t5d
shl t5d, 32-%1
mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
add t5d, t5d
sub t4d, t3d
shl t5d, t3b
mov [t1+levelrun.last], t4d
.loop:
LZCOUNT t3d, t5d, 0x1f
%if HIGH_BIT_DEPTH
mov t2d, [t0+t4*4]
%else
mov t2w, [t0+t4*2]
%endif
inc t3d
shl t5d, t3b
%if HIGH_BIT_DEPTH
mov [t1+t6*4+levelrun.level], t2d
%else
mov [t1+t6*2+levelrun.level], t2w
%endif
inc t6d
sub t4d, t3d
jge .loop
RET
%endmacro

INIT_MMX mmx2
%if ARCH_X86_64 == 0
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
INIT_XMM sse2
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8

; Similar to the one above, but saves the DCT
; coefficients in m0/m1 so we don't have to load
; them later.
%macro LAST_MASK_LUT 3
pxor xm5, xm5
%if %1 <= 8
mova m0, [%3]
packsswb m2, m0, m0
%else
mova xm0, [%3+ 0]
mova xm1, [%3+16]
packsswb xm2, xm0, xm1
%if mmsize==32
vinserti128 m0, m0, xm1, 1
%endif
%endif
pcmpeqb xm2, xm5
pmovmskb %2, xm2
%endmacro

%macro COEFF_LEVELRUN_LUT 1
cglobal coeff_level_run%1,2,4+(%1/9)
%ifdef PIC
lea r5, [$$]
%define GLOBAL +r5-$$
%else
%define GLOBAL
%endif
LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
%if %1==15
shr eax, 1
%elif %1==8
and eax, 0xff
%elif %1==4
and eax, 0xf
%endif
xor eax, (1<<%1)-1
mov [r1+levelrun.mask], eax
%if %1==15
add eax, eax
%endif
%if %1 > 8
%if ARCH_X86_64
mov r4d, eax
shr r4d, 8
%else
movzx r4d, ah ; first 8 bits
%endif
%endif
movzx r2d, al ; second 8 bits
shl eax, 32-%1-(%1&1)
LZCOUNT eax, eax, 0x1f
mov r3d, %1-1
sub r3d, eax
mov [r1+levelrun.last], r3d
; Here we abuse pshufb, combined with a lookup table, to do a gather
; operation based on a bitmask. For example:
;
; dct 15-8 (input): 0 0 4 0 0 -2 1 0
; dct 7-0 (input): 0 0 -1 0 0 0 0 15
; bitmask 1: 0 0 1 0 0 1 1 0
; bitmask 2: 0 0 1 0 0 0 0 1
; gather 15-8: 4 -2 1 __ __ __ __ __
; gather 7-0: -1 15 __ __ __ __ __ __
; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __
;
; The overlapping, dependent stores almost surely cause a mess of
; forwarding issues, but it's still enormously faster.
%if %1 > 8
movzx eax, byte [popcnt_table+r4 GLOBAL]
movzx r3d, byte [popcnt_table+r2 GLOBAL]
%if mmsize==16
movh m3, [dct_coef_shuffle+r4*8 GLOBAL]
movh m2, [dct_coef_shuffle+r2*8 GLOBAL]
mova m4, [pw_256]
; Storing 8 bytes of shuffle constant and converting it (unpack + or)
; is neutral to slightly faster in local speed measurements, but it
; cuts the table size in half, which is surely a big cache win.
punpcklbw m3, m3
punpcklbw m2, m2
por m3, m4
por m2, m4
pshufb m1, m3
pshufb m0, m2
mova [r1+levelrun.level], m1
; This obnoxious unaligned store messes with store forwarding and
; stalls the CPU to no end, but merging the two registers before
; storing requires a variable 128-bit shift. Emulating this does
; work, but requires a lot of ops and the gain is tiny and
; inconsistent, so we'll err on the side of fewer instructions.
movu [r1+rax*2+levelrun.level], m0
%else ; mmsize==32
movq xm2, [dct_coef_shuffle+r4*8 GLOBAL]
vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
punpcklbw m2, m2
por m2, [pw_256]
pshufb m0, m2
vextracti128 [r1+levelrun.level], m0, 1
movu [r1+rax*2+levelrun.level], xm0
%endif
add eax, r3d
%else
movzx eax, byte [popcnt_table+r2 GLOBAL]
movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
punpcklbw m1, m1
por m1, [pw_256]
pshufb m0, m1
mova [r1+levelrun.level], m0
%endif
RET
%endmacro

%if HIGH_BIT_DEPTH==0
INIT_MMX ssse3
COEFF_LEVELRUN_LUT 4
INIT_XMM ssse3
COEFF_LEVELRUN_LUT 8
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
INIT_MMX ssse3, lzcnt
COEFF_LEVELRUN_LUT 4
INIT_XMM ssse3, lzcnt
COEFF_LEVELRUN_LUT 8
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
INIT_XMM avx2, lzcnt
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
%endif