Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
52868 views
1
;*****************************************************************************
;* dct-a.asm: x86 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
29
30
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32
; Sign patterns and byte/word shuffle tables used by the transforms and
; the zigzag scan routines below.
pw_ppmmmmpp:    dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15

pb_scan8framet1: SHUFFLE_MASK_W 0,  1,  6,  7,  8,  9, 13, 14
pb_scan8framet2: SHUFFLE_MASK_W 2,  3,  4,  7,  9, 15, 10, 14
pb_scan8framet3: SHUFFLE_MASK_W 0,  1,  5,  6,  8, 11, 12, 13
pb_scan8framet4: SHUFFLE_MASK_W 0,  3,  4,  5,  8, 11, 12, 15
pb_scan8framet5: SHUFFLE_MASK_W 1,  2,  6,  7,  9, 10, 13, 14
pb_scan8framet6: SHUFFLE_MASK_W 0,  3,  4,  5, 10, 11, 12, 15
pb_scan8framet7: SHUFFLE_MASK_W 1,  2,  6,  7,  8,  9, 14, 15
pb_scan8framet8: SHUFFLE_MASK_W 0,  1,  2,  7,  8, 10, 11, 14
pb_scan8framet9: SHUFFLE_MASK_W 1,  4,  5,  7,  8, 13, 14, 15

pb_scan8frame1: SHUFFLE_MASK_W  0,  8,  1,  2,  9, 12,  4, 13
pb_scan8frame2: SHUFFLE_MASK_W  4,  0,  1,  5,  8, 10, 12, 14
pb_scan8frame3: SHUFFLE_MASK_W 12, 10,  8,  6,  2,  3,  7,  9
pb_scan8frame4: SHUFFLE_MASK_W  0,  1,  8, 12,  4, 13,  9,  2
pb_scan8frame5: SHUFFLE_MASK_W  5, 14, 10,  3, 11, 15,  6,  7
pb_scan8frame6: SHUFFLE_MASK_W  6,  8, 12, 13,  9,  7,  5,  3
pb_scan8frame7: SHUFFLE_MASK_W  1,  3,  5,  7, 10, 14, 15, 11
pb_scan8frame8: SHUFFLE_MASK_W 10,  3, 11, 14,  5,  6, 15,  7

; 0x80 entries zero the corresponding output word (pshufb convention).
pb_scan8field1 : SHUFFLE_MASK_W    0,   1,   2,   8,   9,   3,   4,  10
pb_scan8field2a: SHUFFLE_MASK_W 0x80,  11,   5,   6,   7,  12,0x80,0x80
pb_scan8field2b: SHUFFLE_MASK_W    0,0x80,0x80,0x80,0x80,0x80,   1,   8
pb_scan8field3a: SHUFFLE_MASK_W   10,   5,   6,   7,  11,0x80,0x80,0x80
pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80,   1,   8,   2
pb_scan8field4a: SHUFFLE_MASK_W    4,   5,   6,   7,  11,0x80,0x80,0x80
pb_scan8field6 : SHUFFLE_MASK_W    4,   5,   6,   7,  11,0x80,0x80,  12
pb_scan8field7 : SHUFFLE_MASK_W    5,   6,   7,  11,0x80,0x80,  12,  13
SECTION .text

; Constants and tables defined in other translation units (const-a.asm etc.).
cextern pw_32_0
cextern pw_32
cextern pw_512
cextern pw_8000
cextern pw_pixel_max
cextern hsub_mul
cextern pb_1
cextern pw_1
cextern pd_1
cextern pd_32
cextern pw_ppppmmmm
cextern pw_pmpmpmpm
cextern deinterleave_shufd
cextern pb_unpackbd1
cextern pb_unpackbd2
; 1D 4-point Walsh-Hadamard transform (butterflies only, no normalization).
; %1 = element size (w/d), %2-%5 = data registers, %6 = temp register
%macro WALSH4_1D 6
    SUMSUB_BADC %1, %5, %4, %3, %2, %6
    SUMSUB_BADC %1, %5, %3, %4, %2, %6
    SWAP %2, %5, %4
%endmacro
; Sum/difference of 16-bit values whose result needs 17 bits of precision:
; bias to unsigned via xor with 0x8000 so that pavgw computes (a+b+1)>>1
; without overflow, then un-bias.
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
    movq  m%3, m%4
    pxor  m%1, m%4
    psubw m%3, m%2
    pxor  m%2, m%4
    pavgw m%3, m%1
    pavgw m%2, m%1
    pxor  m%3, m%4
    pxor  m%2, m%4
    SWAP %1, %2, %3
%endmacro
; Sign-extend the packed words in %1 into dwords: low half -> %1, high -> %2.
; %3 is a temp register. Uses punpck + arithmetic shift as the extension.
%macro DCT_UNPACK 3
    punpcklwd %3, %1
    punpckhwd %2, %1
    psrad     %3, 16
    psrad     %2, 16
    SWAP      %1, %3
%endmacro
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
%macro DCT4x4_DC 0
cglobal dct4x4dc, 1,1,5
    mova   m0, [r0+ 0]
    mova   m1, [r0+16]
    mova   m2, [r0+32]
    mova   m3, [r0+48]
    WALSH4_1D  d, 0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    paddd  m0, [pd_1]             ; rounding bias for the >>1 below
    WALSH4_1D  d, 0,1,2,3,4
    psrad  m0, 1
    psrad  m1, 1
    psrad  m2, 1
    psrad  m3, 1
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; DCT4x4_DC

INIT_XMM sse2
DCT4x4_DC
INIT_XMM avx
DCT4x4_DC
%else

INIT_MMX mmx2
cglobal dct4x4dc, 1,1
    movq   m3, [r0+24]
    movq   m2, [r0+16]
    movq   m1, [r0+ 8]
    movq   m0, [r0+ 0]
    movq   m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
    WALSH4_1D  w, 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC w, 1, 0, 3, 2, 4
    SWAP 0, 1
    SWAP 2, 3
    SUMSUB_17BIT 0,2,4,7
    SUMSUB_17BIT 1,3,5,7
    movq  [r0+0], m0
    movq  [r0+8], m2
    movq [r0+16], m3
    movq [r0+24], m1
    RET
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
%macro IDCT4x4DC 0
cglobal idct4x4dc, 1,1
    mova   m3, [r0+48]
    mova   m2, [r0+32]
    mova   m1, [r0+16]
    mova   m0, [r0+ 0]
    WALSH4_1D  d,0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    WALSH4_1D  d,0,1,2,3,4
    mova  [r0+ 0], m0
    mova  [r0+16], m1
    mova  [r0+32], m2
    mova  [r0+48], m3
    RET
%endmacro ; IDCT4x4DC

INIT_XMM sse2
IDCT4x4DC
INIT_XMM avx
IDCT4x4DC
%else

;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal idct4x4dc, 1,1
    movq   m3, [r0+24]
    movq   m2, [r0+16]
    movq   m1, [r0+ 8]
    movq   m0, [r0+ 0]
    WALSH4_1D  w,0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D  w,0,1,2,3,4
    movq  [r0+ 0], m0
    movq  [r0+ 8], m1
    movq  [r0+16], m2
    movq  [r0+24], m3
    RET
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal sub4x4_dct, 3,3
.skip_prologue:
    LOAD_DIFF  m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF  m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF  m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF  m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4

    ; second 1D pass done manually so intermediates can be widened to dwords
    SUMSUB_BADC w, 3, 0, 2, 1
    SUMSUB_BA   w, 2, 3, 4
    DCT_UNPACK m2, m4, m5
    DCT_UNPACK m3, m6, m7
    mova [r0+ 0], m2 ; s03 + s12
    mova [r0+ 8], m4
    mova [r0+32], m3 ; s03 - s12
    mova [r0+40], m6

    DCT_UNPACK m0, m2, m4
    DCT_UNPACK m1, m3, m5
    SUMSUB2_AB  d, 0, 1, 4
    SUMSUB2_AB  d, 2, 3, 5
    mova [r0+16], m0 ; d03*2 + d12
    mova [r0+24], m2
    mova [r0+48], m4 ; d03 - 2*d12
    mova [r0+56], m5
    RET
%else

%macro SUB_DCT4 0
cglobal sub4x4_dct, 3,3
.skip_prologue:
%if cpuflag(ssse3)
    mova m5, [hsub_mul]
%endif
    LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    DCT4_1D 0,1,2,3,4
    movq [r0+ 0], m0
    movq [r0+ 8], m1
    movq [r0+16], m2
    movq [r0+24], m3
    RET
%endmacro

INIT_MMX mmx
SUB_DCT4
INIT_MMX ssse3
SUB_DCT4
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
; Round+shift two dword rows, add to two destination pixel rows, clip to
; pixel range and store back. %4 must be zero, %5/%6 are the row addresses.
%macro STORE_DIFFx2 6
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    movq      %3, %5
    movhps    %3, %6
    paddsw    %1, %3
    CLIPW     %1, %4, [pw_pixel_max]
    movq      %5, %1
    movhps    %6, %1
%endmacro

%macro ADD4x4_IDCT 0
cglobal add4x4_idct, 2,2,6
    add   r0, 2*FDEC_STRIDEB
.skip_prologue:
    mova  m1, [r1+16]
    mova  m3, [r1+48]
    mova  m2, [r1+32]
    mova  m0, [r1+ 0]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]            ; rounding bias for the >>6 in STORE_DIFFx2
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
    STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
    RET
%endmacro

INIT_XMM sse2
ADD4x4_IDCT
INIT_XMM avx
ADD4x4_IDCT
%else ; !HIGH_BIT_DEPTH

INIT_MMX mmx
cglobal add4x4_idct, 2,2
    pxor m7, m7
.skip_prologue:
    movq  m1, [r1+ 8]
    movq  m3, [r1+24]
    movq  m2, [r1+16]
    movq  m0, [r1+ 0]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE4x4W 0,1,2,3,4
    paddw m0, [pw_32]            ; rounding bias for the >>6 in STORE_DIFF
    IDCT4_1D w,0,1,2,3,4,5
    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
    RET
; add4x4_idct for sse4/avx: both 1D IDCT passes operate on two packed rows
; per register, avoiding a full transpose.
%macro ADD4x4 0
cglobal add4x4_idct, 2,2,6
    mova      m1, [r1+0x00]     ; row1/row0
    mova      m3, [r1+0x10]     ; row3/row2
    psraw     m0, m1, 1         ; row1>>1/...
    psraw     m2, m3, 1         ; row3>>1/...
    movsd     m0, m1            ; row1>>1/row0
    movsd     m2, m3            ; row3>>1/row2
    psubw     m0, m3            ; row1>>1-row3/row0-2
    paddw     m2, m1            ; row3>>1+row1/row0+2
    SBUTTERFLY2 wd, 0, 2, 1
    SUMSUB_BA w, 2, 0, 1
    pshuflw   m1, m2, q2301
    pshufhw   m2, m2, q2301
    punpckldq m1, m0
    punpckhdq m2, m0
    SWAP       0, 1

    mova      m1, [pw_32_0]
    paddw     m1, m0            ; row1/row0 corrected
    psraw     m0, 1             ; row1>>1/...
    psraw     m3, m2, 1         ; row3>>1/...
    movsd     m0, m1            ; row1>>1/row0
    movsd     m3, m2            ; row3>>1/row2
    psubw     m0, m2            ; row1>>1-row3/row0-2
    paddw     m3, m1            ; row3>>1+row1/row0+2
    SBUTTERFLY2 qdq, 0, 3, 1
    SUMSUB_BA w, 3, 0, 1

    movd      m4, [r0+FDEC_STRIDE*0]
    movd      m1, [r0+FDEC_STRIDE*1]
    movd      m2, [r0+FDEC_STRIDE*2]
    movd      m5, [r0+FDEC_STRIDE*3]
    punpckldq m1, m4            ; row0/row1
    pxor      m4, m4
    punpckldq m2, m5            ; row3/row2
    punpcklbw m1, m4
    psraw     m3, 6
    punpcklbw m2, m4
    psraw     m0, 6
    paddsw    m3, m1
    paddsw    m0, m2
    packuswb  m0, m3            ; row0/row1/row3/row2
    pextrd   [r0+FDEC_STRIDE*0], m0, 3
    pextrd   [r0+FDEC_STRIDE*1], m0, 2
    movd     [r0+FDEC_STRIDE*2], m0
    pextrd   [r0+FDEC_STRIDE*3], m0, 1
    RET
%endmacro ; ADD4x4

INIT_XMM sse4
ADD4x4
INIT_XMM avx
ADD4x4
; Round+shift four word rows (two per ymm), add to four destination rows and
; store back. %5-%8 = FDEC row offsets, %9 = zero register.
%macro STOREx2_AVX2 9
    movq      xm%3, [r0+%5*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
    movq      xm%4, [r0+%7*FDEC_STRIDE]
    vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
    punpcklbw  m%3, m%9
    punpcklbw  m%4, m%9
    psraw      m%1, 6
    psraw      m%2, 6
    paddsw     m%1, m%3
    paddsw     m%2, m%4
    packuswb   m%1, m%2
    vextracti128 xm%2, m%1, 1
    movq   [r0+%5*FDEC_STRIDE], xm%1
    movq   [r0+%6*FDEC_STRIDE], xm%2
    movhps [r0+%7*FDEC_STRIDE], xm%1
    movhps [r0+%8*FDEC_STRIDE], xm%2
%endmacro

INIT_YMM avx2
cglobal add8x8_idct, 2,3,8
    add r0, 4*FDEC_STRIDE
    pxor m7, m7
    TAIL_CALL .skip_prologue, 0
global current_function %+ .skip_prologue
.skip_prologue:
    ; TRANSPOSE4x4Q
    mova       xm0, [r1+ 0]
    mova       xm1, [r1+32]
    mova       xm2, [r1+16]
    mova       xm3, [r1+48]
    vinserti128 m0, m0, [r1+ 64], 1
    vinserti128 m1, m1, [r1+ 96], 1
    vinserti128 m2, m2, [r1+ 80], 1
    vinserti128 m3, m3, [r1+112], 1
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
    STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
    ret
; 2xdst, 2xtmp, 4xsrcrow, 1xzero
; Load four 8-pixel enc/dec row pairs (two rows per ymm), widen to words and
; compute the differences into %1/%2.
%macro LOAD_DIFF8x2_AVX2 9
    movq    xm%1, [r1+%5*FENC_STRIDE]
    movq    xm%2, [r1+%6*FENC_STRIDE]
    vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
    vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
    punpcklbw m%1, m%9
    punpcklbw m%2, m%9
    movq    xm%3, [r2+(%5-4)*FDEC_STRIDE]
    movq    xm%4, [r2+(%6-4)*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
    vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
    punpcklbw m%3, m%9
    punpcklbw m%4, m%9
    psubw m%1, m%3
    psubw m%2, m%4
%endmacro

; 4x src, 1x tmp
%macro STORE8_DCT_AVX2 5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+  0], xm%1
    mova [r0+ 16], xm%3
    mova [r0+ 32], xm%2
    mova [r0+ 48], xm%4
    vextracti128 [r0+ 64], m%1, 1
    vextracti128 [r0+ 80], m%3, 1
    vextracti128 [r0+ 96], m%2, 1
    vextracti128 [r0+112], m%4, 1
%endmacro

%macro STORE16_DCT_AVX2 5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+ 0-128], xm%1
    mova [r0+16-128], xm%3
    mova [r0+32-128], xm%2
    mova [r0+48-128], xm%4
    vextracti128 [r0+ 0], m%1, 1
    vextracti128 [r0+16], m%3, 1
    vextracti128 [r0+32], m%2, 1
    vextracti128 [r0+48], m%4, 1
%endmacro
INIT_YMM avx2
cglobal sub8x8_dct, 3,3,7
    pxor m6, m6
    add r2, 4*FDEC_STRIDE
    LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
    LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE8_DCT_AVX2 0, 1, 2, 3, 4
    RET

INIT_YMM avx2
cglobal sub16x16_dct, 3,3,6
    add r0, 128
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 256-64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    RET
.sub16x4_dct:
    LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE16_DCT_AVX2 0, 1, 2, 3, 4
    ret
%endif ; HIGH_BIT_DEPTH
INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
; Build an NxN dct by calling a smaller dct (%2) four times.
; %1 = name, %3 = dct stride, %4/%5/%6 = pixel offset arithmetic, %7 = xmm count
%macro SUB_NxN_DCT 7
cglobal %1, 3,3,%7
%if HIGH_BIT_DEPTH == 0
%if mmsize == 8
    pxor m7, m7
%else
    add r2, 4*FDEC_STRIDE
    mova m7, [hsub_mul]
%endif
%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
    call %2.skip_prologue
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    call %2.skip_prologue
    add  r0, %3
    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
    call %2.skip_prologue
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    TAIL_CALL %2.skip_prologue, 1
%endmacro
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
; Build an NxN idct by calling a smaller idct (%2) four times.
%macro ADD_NxN_IDCT 6-7
%if HIGH_BIT_DEPTH
cglobal %1, 2,2,%7
%if %3==256
    add r1, 128
%endif
%else
cglobal %1, 2,2,11
    pxor m7, m7
%endif
%if mmsize>=16 && %3!=256
    add  r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
    call %2.skip_prologue
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    call %2.skip_prologue
    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add  r1, %3
    call %2.skip_prologue
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    TAIL_CALL %2.skip_prologue, 1
%endmacro
%if HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,   64,  8, 0, 0, 0
SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,   64, 16, 8, 8, 0
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2,   add4x4_idct_sse2, 64,  8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
ADD_NxN_IDCT add8x8_idct_avx,    add4x4_idct_avx,  64,  8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  64, 16, 8, 8, 6
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx,  256, 16, 0, 0, 16
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_sse4.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
SUB_NxN_DCT  sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
SUB_NxN_DCT  sub16x16_dct8_avx,  sub8x8_dct8_avx,  256, 16, 0, 0, 14
%else ; !HIGH_BIT_DEPTH
%if ARCH_X86_64 == 0
INIT_MMX
SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,  32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx,    add4x4_idct_mmx, 32, 4, 0, 0
SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,  32, 8, 4, 4, 0
ADD_NxN_IDCT add16x16_idct_mmx,  add8x8_idct_mmx, 32, 8, 4, 4

cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_mmx,  sub8x8_dct8_mmx,  128, 8, 0, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif

INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
cextern sub8x8_dct_xop.skip_prologue
SUB_NxN_DCT  sub16x16_dct_sse2,  sub8x8_dct_sse2,  128, 8, 0, 0, 10
SUB_NxN_DCT  sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
SUB_NxN_DCT  sub16x16_dct_avx,   sub8x8_dct_avx,   128, 8, 0, 0, 10
SUB_NxN_DCT  sub16x16_dct_xop,   sub8x8_dct_xop,   128, 8, 0, 0, 10

cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  128, 8, 0, 0

cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx,  128, 8, 0, 0

cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_sse2,  sub8x8_dct8_sse2,  128, 8, 0, 0, 11
SUB_NxN_DCT  sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
SUB_NxN_DCT  sub16x16_dct8_avx,   sub8x8_dct8_avx,   128, 8, 0, 0, 11

INIT_YMM
ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
; Add a broadcast DC value (%2) to 4 rows of 8 pixels at %1, clipping to
; [m5, m6] (zero .. pw_pixel_max).
%macro ADD_DC 2
    mova    m0, [%1+FDEC_STRIDEB*0] ; 8pixels
    mova    m1, [%1+FDEC_STRIDEB*1]
    mova    m2, [%1+FDEC_STRIDEB*2]
    paddsw  m0, %2
    paddsw  m1, %2
    paddsw  m2, %2
    paddsw  %2, [%1+FDEC_STRIDEB*3]
    CLIPW   m0, m5, m6
    CLIPW   m1, m5, m6
    CLIPW   m2, m5, m6
    CLIPW   %2, m5, m6
    mova [%1+FDEC_STRIDEB*0], m0
    mova [%1+FDEC_STRIDEB*1], m1
    mova [%1+FDEC_STRIDEB*2], m2
    mova [%1+FDEC_STRIDEB*3], %2
%endmacro

%macro ADD_IDCT_DC 0
cglobal add8x8_idct_dc, 2,2,7
    mova    m6, [pw_pixel_max]
    pxor    m5, m5
    mova    m3, [r1]
    paddd   m3, [pd_32]
    psrad   m3, 6             ; dc0   0 dc1   0 dc2   0 dc3   0
    pshuflw m4, m3, q2200     ; dc0 dc0 dc1 dc1   _   _   _   _
    pshufhw m3, m3, q2200     ;   _   _   _   _ dc2 dc2 dc3 dc3
    pshufd  m4, m4, q1100     ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd  m3, m3, q3322     ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+FDEC_STRIDEB*4, m3
    RET

cglobal add16x16_idct_dc, 2,3,8
    mov     r2, 4
    mova    m6, [pw_pixel_max]
    mova    m7, [pd_32]
    pxor    m5, m5
.loop:
    mova    m3, [r1]
    paddd   m3, m7
    psrad   m3, 6             ; dc0   0 dc1   0 dc2   0 dc3   0
    pshuflw m4, m3, q2200     ; dc0 dc0 dc1 dc1   _   _   _   _
    pshufhw m3, m3, q2200     ;   _   _   _   _ dc2 dc2 dc3 dc3
    pshufd  m4, m4, q1100     ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd  m3, m3, q3322     ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+SIZEOF_PIXEL*8, m3
    add     r1, 16
    add     r0, 4*FDEC_STRIDEB
    dec     r2
    jg .loop
    RET
%endmacro ; ADD_IDCT_DC

INIT_XMM sse2
ADD_IDCT_DC
INIT_XMM avx
ADD_IDCT_DC
%else ;!HIGH_BIT_DEPTH
; Add positive DC (%1) and subtract negative DC (%2) from 4 rows of pixels
; at %3, using unsigned saturation for implicit clipping.
%macro ADD_DC 3
    mova    m4, [%3+FDEC_STRIDE*0]
    mova    m5, [%3+FDEC_STRIDE*1]
    mova    m6, [%3+FDEC_STRIDE*2]
    paddusb m4, %1
    paddusb m5, %1
    paddusb m6, %1
    paddusb %1, [%3+FDEC_STRIDE*3]
    psubusb m4, %2
    psubusb m5, %2
    psubusb m6, %2
    psubusb %1, %2
    mova [%3+FDEC_STRIDE*0], m4
    mova [%3+FDEC_STRIDE*1], m5
    mova [%3+FDEC_STRIDE*2], m6
    mova [%3+FDEC_STRIDE*3], %1
%endmacro
INIT_MMX mmx2
cglobal add8x8_idct_dc, 2,2
    mova      m0, [r1]
    pxor      m1, m1
    add       r0, FDEC_STRIDE*4
    paddw     m0, [pw_32]
    psraw     m0, 6
    psubw     m1, m0            ; m1 = -dc (applied via psubusb)
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw    m2, m0, q3322
    pshufw    m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC    m0, m1, r0-FDEC_STRIDE*4
    ADD_DC    m2, m3, r0
    RET
INIT_XMM ssse3
cglobal add8x8_idct_dc, 2,2
    movh     m0, [r1]
    pxor     m1, m1
    add      r0, FDEC_STRIDE*4
    pmulhrsw m0, [pw_512]       ; = (dc*512 + 0x4000) >> 15 == (dc+32)>>6
    psubw    m1, m0
    mova     m5, [pb_unpackbd1]
    packuswb m0, m0
    packuswb m1, m1
    pshufb   m0, m5
    pshufb   m1, m5
    movh     m2, [r0+FDEC_STRIDE*-4]
    movh     m3, [r0+FDEC_STRIDE*-3]
    movh     m4, [r0+FDEC_STRIDE*-2]
    movh     m5, [r0+FDEC_STRIDE*-1]
    movhps   m2, [r0+FDEC_STRIDE* 0]
    movhps   m3, [r0+FDEC_STRIDE* 1]
    movhps   m4, [r0+FDEC_STRIDE* 2]
    movhps   m5, [r0+FDEC_STRIDE* 3]
    paddusb  m2, m0
    paddusb  m3, m0
    paddusb  m4, m0
    paddusb  m5, m0
    psubusb  m2, m1
    psubusb  m3, m1
    psubusb  m4, m1
    psubusb  m5, m1
    movh   [r0+FDEC_STRIDE*-4], m2
    movh   [r0+FDEC_STRIDE*-3], m3
    movh   [r0+FDEC_STRIDE*-2], m4
    movh   [r0+FDEC_STRIDE*-1], m5
    movhps [r0+FDEC_STRIDE* 0], m2
    movhps [r0+FDEC_STRIDE* 1], m3
    movhps [r0+FDEC_STRIDE* 2], m4
    movhps [r0+FDEC_STRIDE* 3], m5
    RET
INIT_MMX mmx2
cglobal add16x16_idct_dc, 2,3
    mov       r2, 4             ; 4 iterations of 4 rows each
.loop:
    mova      m0, [r1]
    pxor      m1, m1
    paddw     m0, [pw_32]
    psraw     m0, 6
    psubw     m1, m0            ; m1 = -dc (applied via psubusb)
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw    m2, m0, q3322
    pshufw    m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC    m0, m1, r0
    ADD_DC    m2, m3, r0+8
    add       r1, 8
    add       r0, FDEC_STRIDE*4
    dec       r2
    jg .loop
    RET
INIT_XMM sse2
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add       r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add       r0, FDEC_STRIDE*4
    movq      m0, [r1+0]
    movq      m2, [r1+8]
    add       r1, 16
    punpcklwd m0, m0
    punpcklwd m2, m2
    pxor      m3, m3
    paddw     m0, [pw_32]
    paddw     m2, [pw_32]
    psraw     m0, 6
    psraw     m2, 6
    psubw     m1, m3, m0        ; m1 = -dc (applied via psubusb)
    packuswb  m0, m1
    psubw     m3, m2
    punpckhbw m1, m0, m0
    packuswb  m2, m3
    punpckhbw m3, m2, m2
    punpcklbw m0, m0
    punpcklbw m2, m2
    ADD_DC    m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC    m2, m3, r0
    ret
%macro ADD16x16 0
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add      r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add      r0, FDEC_STRIDE*4
    mova     m0, [r1]
    add      r1, 16
    pxor     m1, m1
    pmulhrsw m0, [pw_512]       ; = (dc+32)>>6 with rounding
    psubw    m1, m0
    mova     m5, [pb_unpackbd1]
    mova     m6, [pb_unpackbd2]
    packuswb m0, m0
    packuswb m1, m1
    pshufb   m2, m0, m6
    pshufb   m0, m5
    pshufb   m3, m1, m6
    pshufb   m1, m5
    ADD_DC   m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC   m2, m3, r0
    ret
%endmacro ; ADD16x16

INIT_XMM ssse3
ADD16x16
INIT_XMM avx
ADD16x16
; Add DC %1 / subtract DC %2 to two row pairs (r0 half in the low lane,
; r2 half in the high lane) at column offset %3.
%macro ADD_DC_AVX2 3
    mova   xm4, [r0+FDEC_STRIDE*0+%3]
    mova   xm5, [r0+FDEC_STRIDE*1+%3]
    vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1
    vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1
    paddusb m4, %1
    paddusb m5, %1
    psubusb m4, %2
    psubusb m5, %2
    mova [r0+FDEC_STRIDE*0+%3], xm4
    mova [r0+FDEC_STRIDE*1+%3], xm5
    vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1
    vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1
%endmacro

INIT_YMM avx2
cglobal add16x16_idct_dc, 2,3,6
    add      r0, FDEC_STRIDE*4
    mova     m0, [r1]
    pxor     m1, m1
    pmulhrsw m0, [pw_512]       ; = (dc+32)>>6 with rounding
    psubw    m1, m0
    mova     m4, [pb_unpackbd1]
    mova     m5, [pb_unpackbd2]
    packuswb m0, m0
    packuswb m1, m1
    pshufb   m2, m0, m4         ; row0, row2
    pshufb   m3, m1, m4         ; row0, row2
    pshufb   m0, m5             ; row1, row3
    pshufb   m1, m5             ; row1, row3
    lea      r2, [r0+FDEC_STRIDE*8]
    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4
    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2
    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0
    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2
    RET

%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------

; Sum two enc rows minus two dec rows per 4x2 half-block, using psadbw
; against zero for the horizontal byte sums. %3/%4 = enc/dec row offsets.
%macro DCTDC_2ROW_MMX 4
    mova      %1, [r1+FENC_STRIDE*(0+%3)]
    mova      m1, [r1+FENC_STRIDE*(1+%3)]
    mova      m2, [r2+FDEC_STRIDE*(0+%4)]
    mova      m3, [r2+FDEC_STRIDE*(1+%4)]
    mova      %2, %1
    punpckldq %1, m1
    punpckhdq %2, m1
    mova      m1, m2
    punpckldq m2, m3
    punpckhdq m1, m3
    pxor      m3, m3
    psadbw    %1, m3
    psadbw    %2, m3
    psadbw    m2, m3
    psadbw    m1, m3
    psubw     %1, m2
    psubw     %2, m1
%endmacro

; 2x2 Hadamard of the four DC sums.
%macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
    PSHUFLW   m1, %1, q2200   ;  s1  s1  s0  s0
    PSHUFLW   m0, %2, q2301   ;  s3  __  s2  __
    paddw     m1, %2          ;  s1 s13  s0 s02
    psubw     m1, m0          ; d13 s13 d02 s02
    PSHUFLW   m0, m1, q1010   ; d02 s02 d02 s02
    psrlq     m1, 32          ;  __  __ d13 s13
    paddw     m0, m1          ; d02 s02 d02+d13 s02+s13
    psllq     m1, 32          ; d13 s13
    psubw     m0, m1          ; d02-d13 s02-s13 d02+d13 s02+s13
%endmacro
%if HIGH_BIT_DEPTH == 0
INIT_MMX mmx2
cglobal sub8x8_dct_dc, 3,3
    DCTDC_2ROW_MMX m0, m4, 0, 0
    DCTDC_2ROW_MMX m5, m6, 2, 2
    paddw     m0, m5
    paddw     m4, m6
    punpckldq m0, m4
    add       r2, FDEC_STRIDE*4
    DCTDC_2ROW_MMX m7, m4, 4, 0
    DCTDC_2ROW_MMX m5, m6, 6, 2
    paddw     m7, m5
    paddw     m4, m6
    punpckldq m7, m4
    DCT2x2    m0, m7
    mova    [r0], m0
    ret
; Accumulate enc-minus-dec sums for two rows into m%4 (m0 must be zero).
; %1/%2 = enc/dec row offsets, %3 = first accumulation flag for ACCUM.
%macro DCTDC_2ROW_SSE2 4
    movh      m1, [r1+FENC_STRIDE*(0+%1)]
    movh      m2, [r1+FENC_STRIDE*(1+%1)]
    punpckldq m1, m2
    movh      m2, [r2+FDEC_STRIDE*(0+%2)]
    punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
    psadbw    m1, m0
    psadbw    m2, m0
    ACCUM  paddd, %4, 1, %3
    psubd    m%4, m2
%endmacro

INIT_XMM sse2
cglobal sub8x8_dct_dc, 3,3
    pxor     m0, m0
    DCTDC_2ROW_SSE2 0, 0, 0, 3
    DCTDC_2ROW_SSE2 2, 2, 1, 3
    add      r2, FDEC_STRIDE*4
    DCTDC_2ROW_SSE2 4, 0, 0, 4
    DCTDC_2ROW_SSE2 6, 2, 1, 4
    packssdw m3, m3
    packssdw m4, m4
    DCT2x2   m3, m4
    movq   [r0], m0
    RET
%macro SUB8x16_DCT_DC 0
cglobal sub8x16_dct_dc, 3,3
    pxor       m0, m0
    DCTDC_2ROW_SSE2 0, 0, 0, 3
    DCTDC_2ROW_SSE2 2, 2, 1, 3
    add        r1, FENC_STRIDE*8
    add        r2, FDEC_STRIDE*8
    DCTDC_2ROW_SSE2 -4, -4, 0, 4
    DCTDC_2ROW_SSE2 -2, -2, 1, 4
    shufps     m3, m4, q2020
    DCTDC_2ROW_SSE2 0, 0, 0, 5
    DCTDC_2ROW_SSE2 2, 2, 1, 5
    add        r2, FDEC_STRIDE*4
    DCTDC_2ROW_SSE2 4, 0, 0, 4
    DCTDC_2ROW_SSE2 6, 2, 1, 4
    shufps     m5, m4, q2020
%if cpuflag(ssse3)
    %define %%sign psignw
%else
    %define %%sign pmullw          ; pre-SSSE3 fallback: multiply by +/-1
%endif
    SUMSUB_BA d, 5, 3, 0
    packssdw   m5, m3
    pshuflw    m0, m5, q2301
    pshufhw    m0, m0, q2301
    %%sign     m5, [pw_pmpmpmpm]
    paddw      m0, m5
    pshufd     m1, m0, q1320
    pshufd     m0, m0, q0231
    %%sign     m1, [pw_ppppmmmm]
    paddw      m0, m1
    mova     [r0], m0
    RET
%endmacro ; SUB8x16_DCT_DC

INIT_XMM sse2
SUB8x16_DCT_DC
INIT_XMM ssse3
SUB8x16_DCT_DC

%endif ; !HIGH_BIT_DEPTH
; Sum 4 enc rows minus 4 dec rows (16-bit pixels) starting at row %2 into %1,
; then fold neighbouring words so each dword holds a 2-pixel-wide sum.
%macro DCTDC_4ROW_SSE2 2
    mova       %1, [r1+FENC_STRIDEB*%2]
    mova       m0, [r2+FDEC_STRIDEB*%2]
%assign Y (%2+1)
%rep 3
    paddw      %1, [r1+FENC_STRIDEB*Y]
    paddw      m0, [r2+FDEC_STRIDEB*Y]
%assign Y (Y+1)
%endrep
    psubw      %1, m0
    pshufd     m0, %1, q2301
    paddw      %1, m0
%endmacro
%if HIGH_BIT_DEPTH
%macro SUB8x8_DCT_DC_10 0
cglobal sub8x8_dct_dc, 3,3,3
    DCTDC_4ROW_SSE2 m1, 0
    DCTDC_4ROW_SSE2 m2, 4
    mova       m0, [pw_ppmmmmpp]
    pmaddwd    m1, m0
    pmaddwd    m2, m0
    pshufd     m0, m1, q2200      ; -1 -1 +0 +0
    pshufd     m1, m1, q0033      ; +0 +0 +1 +1
    paddd      m1, m0
    pshufd     m0, m2, q1023      ; -2 +2 -3 +3
    paddd      m1, m2
    paddd      m1, m0
    mova     [r0], m1
    RET
%endmacro
INIT_XMM sse2
SUB8x8_DCT_DC_10

%macro SUB8x16_DCT_DC_10 0
cglobal sub8x16_dct_dc, 3,3,6
    DCTDC_4ROW_SSE2 m1, 0
    DCTDC_4ROW_SSE2 m2, 4
    DCTDC_4ROW_SSE2 m3, 8
    DCTDC_4ROW_SSE2 m4, 12
    mova       m0, [pw_ppmmmmpp]
    pmaddwd    m1, m0
    pmaddwd    m2, m0
    pshufd     m5, m1, q2200      ; -1 -1 +0 +0
    pshufd     m1, m1, q0033      ; +0 +0 +1 +1
    paddd      m1, m5
    pshufd     m5, m2, q1023      ; -2 +2 -3 +3
    paddd      m1, m2
    paddd      m1, m5             ; a6 a2 a4 a0
    pmaddwd    m3, m0
    pmaddwd    m4, m0
    pshufd     m5, m3, q2200
    pshufd     m3, m3, q0033
    paddd      m3, m5
    pshufd     m5, m4, q1023
    paddd      m3, m4
    paddd      m3, m5             ; a7 a3 a5 a1
    paddd      m0, m1, m3
    psubd      m1, m3
    pshufd     m0, m0, q3120
    pshufd     m1, m1, q3120
    punpcklqdq m2, m0, m1
    punpckhqdq m1, m0
    mova  [r0+ 0], m2
    mova  [r0+16], m1
    RET
%endmacro
INIT_XMM sse2
SUB8x16_DCT_DC_10
INIT_XMM avx
SUB8x16_DCT_DC_10
%endif
;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Reorders 64 int16 coefficients from raster order (r1) into zigzag frame
; order (r0), using a mix of SSE (xmm) and MMX (mm) registers.
; r0 = level (output), r1 = dct (input).  8-bit-depth builds only.
; NOTE(review): mixes MMX and SSE registers; emms is presumably handled by
; the caller / RET convention — confirm against x86inc.asm.
1096
1097
1098
%macro SCAN_8x8 0
1099
cglobal zigzag_scan_8x8_frame, 2,2,8
1100
movdqa xmm0, [r1]
1101
movdqa xmm1, [r1+16]
1102
movdq2q mm0, xmm0
1103
; rotate rows 1..3 by 14/12/10 bytes so the words needed for the low
; half of the scan land in the low quadword before the movdq2q split
PALIGNR xmm1, xmm1, 14, xmm2
1104
movdq2q mm1, xmm1
1105
1106
movdqa xmm2, [r1+32]
1107
movdqa xmm3, [r1+48]
1108
PALIGNR xmm2, xmm2, 12, xmm4
1109
movdq2q mm2, xmm2
1110
PALIGNR xmm3, xmm3, 10, xmm4
1111
movdq2q mm3, xmm3
1112
1113
punpckhwd xmm0, xmm1
1114
punpckhwd xmm2, xmm3
1115
1116
; interleave the low halves of rows 0..3 in mm registers
movq mm4, mm1
1117
movq mm5, mm1
1118
movq mm6, mm2
1119
movq mm7, mm3
1120
punpckhwd mm1, mm0
1121
psllq mm0, 16
1122
psrlq mm3, 16
1123
punpckhdq mm1, mm1
1124
punpckhdq mm2, mm0
1125
punpcklwd mm0, mm4
1126
punpckhwd mm4, mm3
1127
punpcklwd mm4, mm2
1128
punpckhdq mm0, mm2
1129
punpcklwd mm6, mm3
1130
punpcklwd mm5, mm7
1131
punpcklwd mm5, mm6
1132
1133
movdqa xmm4, [r1+64]
1134
movdqa xmm5, [r1+80]
1135
movdqa xmm6, [r1+96]
1136
movdqa xmm7, [r1+112]
1137
1138
; store the first batch of scan positions (offsets are coeff indices *2 bytes)
movq [r0+2*00], mm0
1139
movq [r0+2*04], mm4
1140
movd [r0+2*08], mm1
1141
movq [r0+2*36], mm5
1142
movq [r0+2*46], mm6
1143
1144
; same rotate-and-split treatment for rows 4..7
PALIGNR xmm4, xmm4, 14, xmm3
1145
movdq2q mm4, xmm4
1146
PALIGNR xmm5, xmm5, 12, xmm3
1147
movdq2q mm5, xmm5
1148
PALIGNR xmm6, xmm6, 10, xmm3
1149
movdq2q mm6, xmm6
1150
%if cpuflag(ssse3)
1151
PALIGNR xmm7, xmm7, 8, xmm3
1152
movdq2q mm7, xmm7
1153
%else
1154
; sse2 path: an 8-byte rotate is just a halves swap, done with
; movhlps/punpcklqdq instead of the (emulated) PALIGNR
movhlps xmm3, xmm7
1155
punpcklqdq xmm7, xmm7
1156
movdq2q mm7, xmm3
1157
%endif
1158
1159
punpckhwd xmm4, xmm5
1160
punpckhwd xmm6, xmm7
1161
1162
movq mm0, mm4
1163
movq mm1, mm5
1164
movq mm3, mm7
1165
punpcklwd mm7, mm6
1166
psrlq mm6, 16
1167
punpcklwd mm4, mm6
1168
punpcklwd mm5, mm4
1169
punpckhdq mm4, mm3
1170
punpcklwd mm3, mm6
1171
punpckhwd mm3, mm4
1172
punpckhwd mm0, mm1
1173
punpckldq mm4, mm0
1174
punpckhdq mm0, mm6
1175
pshufw mm4, mm4, q1230
1176
1177
movq [r0+2*14], mm4
1178
movq [r0+2*25], mm0
1179
movd [r0+2*54], mm7
1180
movq [r0+2*56], mm5
1181
movq [r0+2*60], mm3
1182
1183
; the high halves (kept in xmm regs) are shuffled and stored with
; reversed word order within each half (q0123)
punpckhdq xmm3, xmm0, xmm2
1184
punpckldq xmm0, xmm2
1185
punpckhdq xmm7, xmm4, xmm6
1186
punpckldq xmm4, xmm6
1187
pshufhw xmm0, xmm0, q0123
1188
pshuflw xmm4, xmm4, q0123
1189
pshufhw xmm3, xmm3, q0123
1190
pshuflw xmm7, xmm7, q0123
1191
1192
movlps [r0+2*10], xmm0
1193
movhps [r0+2*17], xmm0
1194
movlps [r0+2*21], xmm3
1195
movlps [r0+2*28], xmm4
1196
movhps [r0+2*32], xmm3
1197
movhps [r0+2*39], xmm4
1198
movlps [r0+2*43], xmm7
1199
movhps [r0+2*50], xmm7
1200
1201
RET
1202
%endmacro
1203
1204
; int16 frame scan exists only for 8-bit depth; instantiate sse2 and ssse3
; variants (they differ only in the PALIGNR path inside the macro)
%if HIGH_BIT_DEPTH == 0
1205
INIT_XMM sse2
1206
SCAN_8x8
1207
INIT_XMM ssse3
1208
SCAN_8x8
1209
%endif
1210
1211
;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
; 0 8 1 2 9 16 24 17
; 10 3 4 11 18 25 32 40
; 33 26 19 12 5 6 13 20
; 27 34 41 48 56 49 42 35
; 28 21 14 7 15 22 29 36
; 43 50 57 58 51 44 37 30
; 23 31 38 45 52 59 60 53
; 46 39 47 54 61 62 55 63
;
; Parameterized over coefficient width (see instantiations below):
;   %1 = shift amount in bits for one coefficient (4*... / 16)
;   %2 = shift-unit suffix (dq/q),  %3 = wide unpack suffix (qdq/dq)
;   %4 = narrow unpack suffix (dq/wd),  %5 = pshuf suffix (d/w)
; HIGH_BIT_DEPTH uses int32 coeffs in xmm; 8-bit uses int16 in mmx.
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
%macro SCAN_8x8_FRAME 5
1224
cglobal zigzag_scan_8x8_frame, 2,2,8
1225
; r0 = level (out), r1 = dct (in); offsets are coefficient indices
mova m0, [r1]
1226
mova m1, [r1+ 8*SIZEOF_DCTCOEF]
1227
movu m2, [r1+14*SIZEOF_DCTCOEF]
1228
movu m3, [r1+21*SIZEOF_DCTCOEF]
1229
mova m4, [r1+28*SIZEOF_DCTCOEF]
1230
punpckl%4 m5, m0, m1
1231
psrl%2 m0, %1
1232
punpckh%4 m6, m1, m0
1233
punpckl%3 m5, m0
1234
punpckl%3 m1, m1
1235
punpckh%4 m1, m3
1236
mova m7, [r1+52*SIZEOF_DCTCOEF]
1237
mova m0, [r1+60*SIZEOF_DCTCOEF]
1238
punpckh%4 m1, m2
1239
punpckl%4 m2, m4
1240
punpckh%4 m4, m3
1241
punpckl%3 m3, m3
1242
punpckh%4 m3, m2
1243
mova [r0], m5
1244
mova [r0+ 4*SIZEOF_DCTCOEF], m1
1245
mova [r0+ 8*SIZEOF_DCTCOEF], m6
1246
punpckl%4 m6, m0
1247
punpckl%4 m6, m7
1248
mova m1, [r1+32*SIZEOF_DCTCOEF]
1249
movu m5, [r1+39*SIZEOF_DCTCOEF]
1250
movu m2, [r1+46*SIZEOF_DCTCOEF]
1251
movu [r0+35*SIZEOF_DCTCOEF], m3
1252
movu [r0+47*SIZEOF_DCTCOEF], m4
1253
punpckh%4 m7, m0
1254
psll%2 m0, %1
1255
punpckh%3 m3, m5, m5
1256
punpckl%4 m5, m1
1257
punpckh%4 m1, m2
1258
mova [r0+52*SIZEOF_DCTCOEF], m6
1259
movu [r0+13*SIZEOF_DCTCOEF], m5
1260
movu m4, [r1+11*SIZEOF_DCTCOEF]
1261
movu m6, [r1+25*SIZEOF_DCTCOEF]
1262
punpckl%4 m5, m7
1263
punpckl%4 m1, m3
1264
punpckh%3 m0, m7
1265
mova m3, [r1+ 4*SIZEOF_DCTCOEF]
1266
movu m7, [r1+18*SIZEOF_DCTCOEF]
1267
punpckl%4 m2, m5
1268
movu [r0+25*SIZEOF_DCTCOEF], m1
1269
mova m1, m4
1270
mova m5, m6
1271
punpckl%4 m4, m3
1272
punpckl%4 m6, m7
1273
punpckh%4 m1, m3
1274
punpckh%4 m5, m7
1275
punpckh%3 m3, m6, m4
1276
punpckh%3 m7, m5, m1
1277
punpckl%3 m6, m4
1278
punpckl%3 m5, m1
1279
movu m4, [r1+35*SIZEOF_DCTCOEF]
1280
movu m1, [r1+49*SIZEOF_DCTCOEF]
1281
; q0123 reverses element order within each half for the anti-diagonals
pshuf%5 m6, m6, q0123
1282
pshuf%5 m5, m5, q0123
1283
mova [r0+60*SIZEOF_DCTCOEF], m0
1284
mova [r0+56*SIZEOF_DCTCOEF], m2
1285
movu m0, [r1+42*SIZEOF_DCTCOEF]
1286
mova m2, [r1+56*SIZEOF_DCTCOEF]
1287
movu [r0+17*SIZEOF_DCTCOEF], m3
1288
mova [r0+32*SIZEOF_DCTCOEF], m7
1289
movu [r0+10*SIZEOF_DCTCOEF], m6
1290
movu [r0+21*SIZEOF_DCTCOEF], m5
1291
punpckh%4 m3, m0, m4
1292
punpckh%4 m7, m2, m1
1293
punpckl%4 m0, m4
1294
punpckl%4 m2, m1
1295
punpckl%3 m4, m2, m0
1296
punpckl%3 m1, m7, m3
1297
punpckh%3 m2, m0
1298
punpckh%3 m7, m3
1299
pshuf%5 m2, m2, q0123
1300
pshuf%5 m7, m7, q0123
1301
mova [r0+28*SIZEOF_DCTCOEF], m4
1302
movu [r0+43*SIZEOF_DCTCOEF], m1
1303
movu [r0+39*SIZEOF_DCTCOEF], m2
1304
movu [r0+50*SIZEOF_DCTCOEF], m7
1305
RET
1306
%endmacro
1307
1308
; HIGH_BIT_DEPTH: int32 coefficients, xmm registers (shift by 4 dwords/dq)
; otherwise:      int16 coefficients, mmx registers (shift by 16 bits/q)
%if HIGH_BIT_DEPTH
1309
INIT_XMM sse2
1310
SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1311
INIT_XMM avx
1312
SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1313
%else
1314
INIT_MMX mmx2
1315
SCAN_8x8_FRAME 16, q , dq , wd, w
1316
%endif
1317
1318
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
; 4x4 frame zigzag. Same parameter scheme as SCAN_8x8_FRAME:
;   %1 = per-coefficient shift amount, %2 = shift suffix,
;   %3 = wide unpack suffix, %4 = narrow unpack suffix
; (instantiated below as 4,dq,qdq,dq for int32 and 16,q,dq,wd for int16)
1319
1320
1321
%macro SCAN_4x4 4
1322
cglobal zigzag_scan_4x4_frame, 2,2,6
1323
; r0 = level (out), r1 = dct (in); one register per input row
mova m0, [r1+ 0*SIZEOF_DCTCOEF]
1324
mova m1, [r1+ 4*SIZEOF_DCTCOEF]
1325
mova m2, [r1+ 8*SIZEOF_DCTCOEF]
1326
mova m3, [r1+12*SIZEOF_DCTCOEF]
1327
punpckl%4 m4, m0, m1
1328
psrl%2 m0, %1
1329
punpckl%3 m4, m0
1330
mova [r0+ 0*SIZEOF_DCTCOEF], m4
1331
punpckh%4 m0, m2
1332
punpckh%4 m4, m2, m3
1333
psll%2 m3, %1
1334
punpckl%3 m2, m2
1335
punpckl%4 m5, m1, m3
1336
punpckh%3 m1, m1
1337
punpckh%4 m5, m2
1338
punpckl%4 m1, m0
1339
punpckh%3 m3, m4
1340
mova [r0+ 4*SIZEOF_DCTCOEF], m5
1341
mova [r0+ 8*SIZEOF_DCTCOEF], m1
1342
mova [r0+12*SIZEOF_DCTCOEF], m3
1343
RET
1344
%endmacro
1345
1346
; 4x4 frame scan instantiations, plus faster shuffle-based int16 variants
%if HIGH_BIT_DEPTH
1347
INIT_XMM sse2
1348
SCAN_4x4 4, dq, qdq, dq
1349
INIT_XMM avx
1350
SCAN_4x4 4, dq, qdq, dq
1351
%else
1352
INIT_MMX mmx
1353
SCAN_4x4 16, q , dq , wd
1354
1355
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; pshufb-based version: permute each 8-word half with the pb_scan4frame*
; masks, then stitch the halves together with byte shifts + palignr
1356
1357
1358
%macro SCAN_4x4_FRAME 0
1359
cglobal zigzag_scan_4x4_frame, 2,2
1360
mova m1, [r1+16]
1361
mova m0, [r1+ 0]
1362
pshufb m1, [pb_scan4frameb]
1363
pshufb m0, [pb_scan4framea]
1364
psrldq m2, m1, 6
1365
palignr m1, m0, 6
1366
pslldq m0, 10
1367
palignr m2, m0, 10
1368
mova [r0+ 0], m1
1369
mova [r0+16], m2
1370
RET
1371
%endmacro
1372
1373
INIT_XMM ssse3
1374
SCAN_4x4_FRAME
1375
INIT_XMM avx
1376
SCAN_4x4_FRAME
1377
1378
; XOP: vpperm selects bytes from two source registers in one op,
; so the whole scan is just two permutes
INIT_XMM xop
1379
cglobal zigzag_scan_4x4_frame, 2,2
1380
mova m0, [r1+ 0]
1381
mova m1, [r1+16]
1382
vpperm m2, m0, m1, [pb_scan4frame2a]
1383
vpperm m1, m0, m1, [pb_scan4frame2b]
1384
mova [r0+ 0], m2
1385
mova [r0+16], m1
1386
RET
1387
%endif ; !HIGH_BIT_DEPTH
1388
1389
%if HIGH_BIT_DEPTH
1390
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
; Field scan differs from raster only in coefficients 2..5: swap the middle
; pair (q3102) of dwords 2..5; everything else is a straight copy.
1391
1392
1393
INIT_XMM sse2
1394
cglobal zigzag_scan_4x4_field, 2,3
1395
movu m4, [r1+ 8]
1396
pshufd m0, m4, q3102
1397
mova m1, [r1+32]
1398
mova m2, [r1+48]
1399
movu [r0+ 8], m0
1400
mova [r0+32], m1
1401
mova [r0+48], m2
1402
; copy coeffs 0-1 and 6-7 unchanged (mmx used for the 8-byte tails)
movq mm0, [r1]
1403
movq [r0], mm0
1404
movq mm0, [r1+24]
1405
movq [r0+24], mm0
1406
RET
1407
%else
1408
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
1409
1410
1411
1412
INIT_MMX mmx2
1413
cglobal zigzag_scan_4x4_field, 2,3
1414
; words 2..5 swapped with q3102; head (coeffs 0-1) and tail (6-7)
; copied through r2d as plain 32-bit loads/stores
pshufw m0, [r1+4], q3102
1415
mova m1, [r1+16]
1416
mova m2, [r1+24]
1417
movu [r0+4], m0
1418
mova [r0+16], m1
1419
mova [r0+24], m2
1420
mov r2d, [r1]
1421
mov [r0], r2d
1422
mov r2d, [r1+12]
1423
mov [r0+12], r2d
1424
RET
1425
%endif ; HIGH_BIT_DEPTH
1426
1427
;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
; 0 1 2 8 9 3 4 10
; 16 11 5 6 7 12 17 24
; 18 13 14 15 19 25 32 26
; 20 21 22 23 27 33 40 34
; 28 29 30 31 35 41 48 42
; 36 37 38 39 43 49 50 44
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
; Parameters (see instantiations below):
;   %1 = element suffix for pshuf/pinsr/pextr (d or w)
;   %2 = unpack suffix, %3 = wide unpack suffix, %4 = shift suffix,
;   %5 = per-element shift amount in bits (4 for dwords, 16 for words)
; Inline comments show register contents as "hi .. lo" coefficient indices.
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
%undef SCAN_8x8
1440
%macro SCAN_8x8 5
1441
cglobal zigzag_scan_8x8_field, 2,3,8
1442
mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
1443
mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
1444
mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
1445
pshuf%1 m3, m0, q3333 ; 03 03 03 03
1446
movd r2d, m2 ; 09 08
1447
pshuf%1 m2, m2, q0321 ; 08 11 10 09
1448
punpckl%2 m3, m1 ; 05 03 04 03
1449
pinsr%1 m0, r2d, 3 ; 08 02 01 00
1450
punpckl%2 m4, m2, m3 ; 04 10 03 09
1451
pshuf%1 m4, m4, q2310 ; 10 04 03 09
1452
mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
1453
mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
1454
mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
1455
mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
1456
punpckl%3 m6, m5 ; 17 16 XX XX
1457
psrl%4 m1, %5 ; XX 07 06 05
1458
punpckh%2 m6, m2 ; 08 17 11 16
1459
punpckl%3 m6, m1 ; 06 05 11 16
1460
mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
1461
psrl%4 m1, %5 ; XX XX 07 06
1462
punpckl%2 m1, m5 ; 17 07 16 06
1463
mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
1464
mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
1465
punpckh%3 m1, m1 ; 17 07 17 07
1466
punpckl%2 m6, m3, m2 ; 25 13 24 12
1467
pextr%1 r2d, m5, 2 ; r2d = coeff 18
1468
mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
1469
punpckl%2 m1, m6 ; 24 17 12 07
1470
mova [r0+12*SIZEOF_DCTCOEF], m1
1471
pinsr%1 m3, r2d, 0 ; 15 14 13 18
1472
mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
1473
; rows 28..43 pass through unchanged except for the wrap-around picks
mova m7, [r1+28*SIZEOF_DCTCOEF]
1474
mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
1475
psrl%4 m5, %5*3 ; XX XX XX 19
1476
pshuf%1 m1, m2, q3321 ; 27 27 26 25
1477
punpckl%2 m5, m0 ; 33 XX 32 19
1478
psrl%4 m2, %5*3 ; XX XX XX 27
1479
punpckl%2 m5, m1 ; 26 32 25 19
1480
mova [r0+32*SIZEOF_DCTCOEF], m7
1481
mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
1482
mova m7, [r1+36*SIZEOF_DCTCOEF]
1483
mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
1484
pshuf%1 m3, m0, q3321 ; 35 35 34 33
1485
punpckl%2 m2, m1 ; 41 XX 40 27
1486
mova [r0+40*SIZEOF_DCTCOEF], m7
1487
punpckl%2 m2, m3 ; 34 40 33 27
1488
mova [r0+28*SIZEOF_DCTCOEF], m2
1489
mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
1490
mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
1491
psrl%4 m0, %5*3 ; XX XX XX 35
1492
punpckl%2 m0, m2 ; 49 XX 48 35
1493
pshuf%1 m3, m1, q3321 ; 43 43 42 41
1494
punpckl%2 m0, m3 ; 42 48 41 35
1495
mova [r0+36*SIZEOF_DCTCOEF], m0
1496
pextr%1 r2d, m2, 3 ; 51
1497
psrl%4 m1, %5*3 ; XX XX XX 43
1498
punpckl%2 m1, m7 ; 45 XX 44 43
1499
psrl%4 m2, %5 ; XX 51 50 49
1500
punpckl%2 m1, m2 ; 50 44 49 43
1501
pshuf%1 m1, m1, q2310 ; 44 50 49 43
1502
mova [r0+44*SIZEOF_DCTCOEF], m1
1503
psrl%4 m7, %5 ; XX 47 46 45
1504
pinsr%1 m7, r2d, 3 ; 51 47 46 45
1505
mova [r0+48*SIZEOF_DCTCOEF], m7
1506
; last two output rows: swap the quadword halves of rows 52-59
mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
1507
mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
1508
mova m7, [r1+60*SIZEOF_DCTCOEF]
1509
punpckl%3 m2, m0, m1 ; 53 52 57 56
1510
punpckh%3 m1, m0 ; 59 58 55 54
1511
mova [r0+52*SIZEOF_DCTCOEF], m2
1512
mova [r0+56*SIZEOF_DCTCOEF], m1
1513
mova [r0+60*SIZEOF_DCTCOEF], m7
1514
RET
1515
%endmacro
1516
; int32 coeffs need sse4 (pextrd/pinsrd); int16 path uses mmx2 pextrw/pinsrw
%if HIGH_BIT_DEPTH
1517
INIT_XMM sse4
1518
SCAN_8x8 d, dq, qdq, dq, 4
1519
INIT_XMM avx
1520
SCAN_8x8 d, dq, qdq, dq, 4
1521
%else
1522
INIT_MMX mmx2
1523
SCAN_8x8 w, wd, dq , q , 16
1524
%endif
1525
1526
;-----------------------------------------------------------------------------
; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
; Fused subtract + zigzag: level = zigzag(src - dst), and dst is overwritten
; with src (reconstruction copy).  %1 = "" or "ac" (ac extracts the DC coeff
; to a 4th pointer arg and zeroes it in the output), %2 = frame or field
; (selects the pb_sub4frame/pb_sub4field shuffle mask).
; Returns (in eax) 1 if any output coefficient is nonzero, else 0.
1527
1528
1529
%macro ZIGZAG_SUB_4x4 2
1530
%ifidn %1, ac
1531
cglobal zigzag_sub_4x4%1_%2, 4,4,8
1532
%else
1533
cglobal zigzag_sub_4x4%1_%2, 3,3,8
1534
%endif
1535
; gather the 4x4 source and reconstruction blocks, one row per movd
movd m0, [r1+0*FENC_STRIDE]
1536
movd m1, [r1+1*FENC_STRIDE]
1537
movd m2, [r1+2*FENC_STRIDE]
1538
movd m3, [r1+3*FENC_STRIDE]
1539
movd m4, [r2+0*FDEC_STRIDE]
1540
movd m5, [r2+1*FDEC_STRIDE]
1541
movd m6, [r2+2*FDEC_STRIDE]
1542
movd m7, [r2+3*FDEC_STRIDE]
1543
; write src back into dst (in-place reconstruction update)
movd [r2+0*FDEC_STRIDE], m0
1544
movd [r2+1*FDEC_STRIDE], m1
1545
movd [r2+2*FDEC_STRIDE], m2
1546
movd [r2+3*FDEC_STRIDE], m3
1547
punpckldq m0, m1
1548
punpckldq m2, m3
1549
punpckldq m4, m5
1550
punpckldq m6, m7
1551
punpcklqdq m0, m2
1552
punpcklqdq m4, m6
1553
; zigzag both blocks as bytes, then subtract via pmaddubsw with the
; +1/-1 pattern in hsub_mul
mova m7, [pb_sub4%2]
1554
pshufb m0, m7
1555
pshufb m4, m7
1556
mova m7, [hsub_mul]
1557
punpckhbw m1, m0, m4
1558
punpcklbw m0, m4
1559
pmaddubsw m1, m7
1560
pmaddubsw m0, m7
1561
%ifidn %1, ac
1562
; save DC (first word) in r2d, then zero it in the coefficient block
movd r2d, m0
1563
pand m0, [pb_subacmask]
1564
%endif
1565
mova [r0+ 0], m0
1566
por m0, m1
1567
pxor m2, m2
1568
mova [r0+16], m1
1569
; per-byte zero test over all 16 coefficients
pcmpeqb m0, m2
1570
pmovmskb eax, m0
1571
%ifidn %1, ac
1572
mov [r3], r2w ; *dc = saved DC coefficient
1573
%endif
1574
; eax == 0xffff iff everything was zero -> return 0; otherwise 1
sub eax, 0xffff
1575
shr eax, 31
1576
RET
1577
%endmacro
1578
1579
; pmaddubsw/pshufb require ssse3+; 8-bit depth only
%if HIGH_BIT_DEPTH == 0
1580
INIT_XMM ssse3
1581
ZIGZAG_SUB_4x4 , frame
1582
ZIGZAG_SUB_4x4 ac, frame
1583
ZIGZAG_SUB_4x4 , field
1584
ZIGZAG_SUB_4x4 ac, field
1585
INIT_XMM avx
1586
ZIGZAG_SUB_4x4 , frame
1587
ZIGZAG_SUB_4x4 ac, frame
1588
ZIGZAG_SUB_4x4 , field
1589
ZIGZAG_SUB_4x4 ac, field
1590
%endif ; !HIGH_BIT_DEPTH
1591
1592
; XOP (vpperm) implementations of the int16 8x8 field and frame scans.
; vpperm picks bytes from two source registers under a control mask, so the
; scan becomes a short sequence of two-source permutes OR-ed together.
%if HIGH_BIT_DEPTH == 0
1593
INIT_XMM xop
1594
cglobal zigzag_scan_8x8_field, 2,3,7
1595
; r2 anchors the mask table so each mask is one RIP-independent load;
; off(m) rebuilds the address of mask m relative to pb_scan8field1
lea r2, [pb_scan8field1]
1596
%define off(m) (r2+m-pb_scan8field1)
1597
mova m0, [r1+ 0]
1598
mova m1, [r1+ 16]
1599
vpperm m5, m0, m1, [off(pb_scan8field1)]
1600
mova [r0+ 0], m5
1601
; each 16-byte output row can straddle >2 input rows: build it as two
; vpperm halves (mask selects zeros elsewhere) and por them together
vpperm m0, m0, m1, [off(pb_scan8field2a)]
1602
mova m2, [r1+ 32]
1603
mova m3, [r1+ 48]
1604
vpperm m5, m2, m3, [off(pb_scan8field2b)]
1605
por m5, m0
1606
mova [r0+ 16], m5
1607
mova m4, [off(pb_scan8field3b)]
1608
vpperm m1, m1, m2, [off(pb_scan8field3a)]
1609
mova m0, [r1+ 64]
1610
vpperm m5, m3, m0, m4
1611
por m5, m1
1612
mova [r0+ 32], m5
1613
; 4b, 5b are the same as pb_scan8field3b.
; 5a is the same as pb_scan8field4a.
1614
1615
mova m5, [off(pb_scan8field4a)]
1616
vpperm m2, m2, m3, m5
1617
mova m1, [r1+ 80]
1618
vpperm m6, m0, m1, m4
1619
por m6, m2
1620
mova [r0+ 48], m6
1621
vpperm m3, m3, m0, m5
1622
mova m2, [r1+ 96]
1623
vpperm m5, m1, m2, m4
1624
por m5, m3
1625
mova [r0+ 64], m5
1626
vpperm m5, m0, m1, [off(pb_scan8field6)]
1627
mova [r0+ 80], m5
1628
vpperm m5, m1, m2, [off(pb_scan8field7)]
1629
; scalar fix-ups for the few coefficients not covered by the masks
mov r2d, [r1+ 98]
1630
mov [r0+ 90], r2d
1631
mova [r0+ 96], m5
1632
mova m3, [r1+112]
1633
movd [r0+104], m3
1634
mov r2d, [r1+108]
1635
; store the final row, then patch its first dword with coeffs 54-55
mova [r0+112], m3
1636
mov [r0+112], r2d
1637
%undef off
1638
RET
1639
1640
cglobal zigzag_scan_8x8_frame, 2,3,8
1641
lea r2, [pb_scan8frame1]
1642
%define off(m) (r2+m-pb_scan8frame1)
1643
; pb_scan8framet* masks build intermediate coefficient groupings,
; pb_scan8frameN masks produce the final output rows (see comments)
mova m7, [r1+ 16]
1644
mova m3, [r1+ 32]
1645
vpperm m7, m7, m3, [off(pb_scan8framet1)] ; 8 9 14 15 16 17 21 22
1646
mova m2, [r1+ 48]
1647
vpperm m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30
1648
mova m1, [r1+ 80]
1649
mova m4, [r1+ 64]
1650
vpperm m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45
1651
vpperm m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45
1652
vpperm m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44
1653
vpperm m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39
1654
mova m4, [r1+ 96]
1655
vpperm m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55
1656
mova m1, [r1+ 0]
1657
vpperm m2, m1, m3, [off(pb_scan8framet8)] ; 0 1 2 7 24 28 29 36
1658
vpperm m1, m2, m7, [off(pb_scan8frame1)] ; 0 8 1 2 9 16 24 17
1659
mova [r0+ 0], m1
1660
; unaligned 8-byte loads grab coefficient runs 3-6 and 10-13 directly
movh m0, [r1+ 6]
1661
movhps m0, [r1+ 20] ; 3 4 5 6 10 11 12 13
1662
vpperm m1, m0, m6, [off(pb_scan8frame2)] ; 10 3 4 11 18 25 32 40
1663
mova [r0+ 16], m1
1664
vpperm m1, m0, m5, [off(pb_scan8frame3)] ; 33 26 19 12 5 6 13 20
1665
mova [r0+ 32], m1
1666
vpperm m1, m2, m7, [off(pb_scan8frame5)] ; 28 21 14 7 15 22 29 36
1667
mova [r0+ 64], m1
1668
movh m0, [r1+100]
1669
movhps m0, [r1+114] ; 50 51 52 53 57 58 59 60
1670
vpperm m1, m5, m0, [off(pb_scan8frame6)] ; 43 50 57 58 51 44 37 30
1671
mova [r0+ 80], m1
1672
vpperm m1, m6, m0, [off(pb_scan8frame7)] ; 23 31 38 45 52 59 60 53
1673
mova [r0+ 96], m1
1674
mova m1, [r1+112]
1675
vpperm m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63
1676
vpperm m1, m0, m4, [off(pb_scan8frame4)] ; 27 34 41 48 56 49 42 35
1677
mova [r0+ 48], m1
1678
vpperm m1, m0, m4, [off(pb_scan8frame8)] ; 46 39 47 54 61 62 55 63
1679
mova [r0+112], m1
1680
%undef off
1681
RET
1682
%endif
1683
1684
;-----------------------------------------------------------------------------
; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
; One transpose step of the CAVLC 8x8 interleave: loads 4 rows at offset %1,
; 4x4-transposes them (%2 = W or D element width) and scatters them to the
; four destination quadrants.  Also folds the coefficients into m5/m6/m7 for
; the caller's nonzero check (ACCUM initializes on the first call, %1==0,
; and por-accumulates after — see ACCUM in x86util.asm).
1685
1686
1687
%macro INTERLEAVE 2
1688
mova m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
1689
mova m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
1690
mova m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
1691
mova m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
1692
TRANSPOSE4x4%2 0,1,2,3,4
1693
mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0
1694
mova [r0+(%1+32)*SIZEOF_PIXEL], m1
1695
mova [r0+(%1+64)*SIZEOF_PIXEL], m2
1696
mova [r0+(%1+96)*SIZEOF_PIXEL], m3
1697
packsswb m0, m1
1698
ACCUM por, 6, 2, %1
1699
ACCUM por, 7, 3, %1
1700
ACCUM por, 5, 0, %1
1701
%endmacro
1702
1703
; Interleave all 64 coefficients (4 INTERLEAVE steps), then derive the two
; 2-byte nnz flags: pack the accumulated coefficient ORs down to bytes,
; compare against zero and add 1, so each output byte is 1 if its 4x4
; sub-block has any nonzero coefficient, else 0.  Flags are written to
; nnz[0..1] and nnz[8..9].  %1 = transpose width (W for int16, D for int32).
%macro ZIGZAG_8x8_CAVLC 1
1704
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
1705
INTERLEAVE 0, %1
1706
INTERLEAVE 8, %1
1707
INTERLEAVE 16, %1
1708
INTERLEAVE 24, %1
1709
packsswb m6, m7
1710
packsswb m5, m6
1711
packsswb m5, m5
1712
pxor m0, m0
1713
%if HIGH_BIT_DEPTH
1714
; int32 coeffs need one extra narrowing pack
packsswb m5, m5
1715
%endif
1716
pcmpeqb m5, m0
1717
; pcmpeqb gives 0xFF for all-zero groups; +1 wraps that to 0 and maps
; "some nonzero" (0x00) to 1
paddb m5, [pb_1]
1718
movd r0d, m5
1719
mov [r2+0], r0w
1720
shr r0d, 16
1721
mov [r2+8], r0w
1722
RET
1723
%endmacro
1724
1725
%if HIGH_BIT_DEPTH
1726
INIT_XMM sse2
1727
ZIGZAG_8x8_CAVLC D
1728
INIT_XMM avx
1729
ZIGZAG_8x8_CAVLC D
1730
%else
1731
INIT_MMX mmx
1732
ZIGZAG_8x8_CAVLC W
1733
%endif
1734
1735
; SSE version of the interleave step: two SBUTTERFLY passes per register
; pair perform the word-transpose, then halves are scattered via movh/movhps.
; m2/m3 accumulate ORs of the coefficients for the caller's nonzero test
; (ACCUM initializes them on the first invocation, %1==0).
%macro INTERLEAVE_XMM 1
1736
mova m0, [r1+%1*4+ 0]
1737
mova m1, [r1+%1*4+16]
1738
mova m4, [r1+%1*4+32]
1739
mova m5, [r1+%1*4+48]
1740
SBUTTERFLY wd, 0, 1, 6
1741
SBUTTERFLY wd, 4, 5, 7
1742
SBUTTERFLY wd, 0, 1, 6
1743
SBUTTERFLY wd, 4, 5, 7
1744
movh [r0+%1+ 0], m0
1745
movhps [r0+%1+ 32], m0
1746
movh [r0+%1+ 64], m1
1747
movhps [r0+%1+ 96], m1
1748
movh [r0+%1+ 8], m4
1749
movhps [r0+%1+ 40], m4
1750
movh [r0+%1+ 72], m5
1751
movhps [r0+%1+104], m5
1752
ACCUM por, 2, 0, %1
1753
ACCUM por, 3, 1, %1
1754
por m2, m4
1755
por m3, m5
1756
%endmacro
1757
1758
; 8-bit-depth interleave: xmm version built on INTERLEAVE_XMM, plus a
; single-pass avx2 version that keeps all 64 int16 coeffs in 4 ymm regs.
%if HIGH_BIT_DEPTH == 0
1759
%macro ZIGZAG_8x8_CAVLC 0
1760
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
1761
INTERLEAVE_XMM 0
1762
INTERLEAVE_XMM 16
1763
; pack the OR-accumulators down and convert to per-sub-block 0/1 flags
; (pcmpeqb -> 0xFF for zero groups, +1 wraps to 0; nonzero -> 1)
packsswb m2, m3
1764
pxor m5, m5
1765
packsswb m2, m2
1766
packsswb m2, m2
1767
pcmpeqb m5, m2
1768
paddb m5, [pb_1]
1769
movd r0d, m5
1770
mov [r2+0], r0w
1771
shr r0d, 16
1772
mov [r2+8], r0w
1773
RET
1774
%endmacro
1775
1776
INIT_XMM sse2
1777
ZIGZAG_8x8_CAVLC
1778
INIT_XMM avx
1779
ZIGZAG_8x8_CAVLC
1780
1781
INIT_YMM avx2
1782
cglobal zigzag_interleave_8x8_cavlc, 3,3,6
1783
mova m0, [r1+ 0]
1784
mova m1, [r1+32]
1785
mova m2, [r1+64]
1786
mova m3, [r1+96]
1787
mova m5, [deinterleave_shufd]
1788
SBUTTERFLY wd, 0, 1, 4
1789
SBUTTERFLY wd, 2, 3, 4
1790
SBUTTERFLY wd, 0, 1, 4
1791
SBUTTERFLY wd, 2, 3, 4
1792
; vpermd fixes up the lane-crossing order the in-lane SBUTTERFLYs leave
vpermd m0, m5, m0
1793
vpermd m1, m5, m1
1794
vpermd m2, m5, m2
1795
vpermd m3, m5, m3
1796
mova [r0+ 0], xm0
1797
mova [r0+ 16], xm2
1798
vextracti128 [r0+ 32], m0, 1
1799
vextracti128 [r0+ 48], m2, 1
1800
mova [r0+ 64], xm1
1801
mova [r0+ 80], xm3
1802
vextracti128 [r0+ 96], m1, 1
1803
vextracti128 [r0+112], m3, 1
1804
1805
; nnz flags: OR-free variant — pack everything to bytes, test each
; 8-byte group for all-zero with pcmpeqq, then invert the mask
packsswb m0, m2 ; nnz0, nnz1
1806
packsswb m1, m3 ; nnz2, nnz3
1807
packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3}
1808
vpermq m0, m0, q3120 ; {nnz0,nnz1}, {nnz2,nnz3}
1809
pxor m5, m5
1810
pcmpeqq m0, m5
1811
pmovmskb r0d, m0
1812
not r0d
1813
; keep one flag bit per 8-byte group -> 0/1 byte per 4x4 sub-block
and r0d, 0x01010101
1814
mov [r2+0], r0w
1815
shr r0d, 16
1816
mov [r2+8], r0w
1817
RET
1818
%endif ; !HIGH_BIT_DEPTH
1819
1820