; NOTE(review): web-scrape page chrome ("Book a Demo!", navigation links,
; view counter) removed — it is not part of the original source file.
1
;*****************************************************************************
2
;* predict-a.asm: x86 intra prediction
3
;*****************************************************************************
4
;* Copyright (C) 2005-2016 x264 project
5
;*
6
;* Authors: Loren Merritt <lorenm@u.washington.edu>
7
;* Holger Lubitz <holger@lubitz.org>
8
;* Fiona Glaser <fiona@x264.com>
9
;* Henrik Gramner <henrik@gramner.com>
10
;*
11
;* This program is free software; you can redistribute it and/or modify
12
;* it under the terms of the GNU General Public License as published by
13
;* the Free Software Foundation; either version 2 of the License, or
14
;* (at your option) any later version.
15
;*
16
;* This program is distributed in the hope that it will be useful,
17
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
;* GNU General Public License for more details.
20
;*
21
;* You should have received a copy of the GNU General Public License
22
;* along with this program; if not, write to the Free Software
23
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24
;*
25
;* This program is also available under a commercial proprietary license.
26
;* For more information, contact us at licensing@x264.com.
27
;*****************************************************************************
28
29
%include "x86inc.asm"
30
%include "x86util.asm"
31
32
SECTION_RODATA 32
33
34
pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
35
pw_m3: times 16 dw -3
36
pw_m7: times 16 dw -7
37
pb_00s_ff: times 8 db 0
38
pb_0s_ff: times 7 db 0
39
db 0xff
40
shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
41
shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
42
shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
43
shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
44
pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
45
46
SECTION .text
47
48
cextern pb_0
49
cextern pb_1
50
cextern pb_3
51
cextern pw_1
52
cextern pw_2
53
cextern pw_4
54
cextern pw_8
55
cextern pw_16
56
cextern pw_00ff
57
cextern pw_pixel_max
58
cextern pw_0to15
59
60
%macro STORE8 1
61
mova [r0+0*FDEC_STRIDEB], %1
62
mova [r0+1*FDEC_STRIDEB], %1
63
add r0, 4*FDEC_STRIDEB
64
mova [r0-2*FDEC_STRIDEB], %1
65
mova [r0-1*FDEC_STRIDEB], %1
66
mova [r0+0*FDEC_STRIDEB], %1
67
mova [r0+1*FDEC_STRIDEB], %1
68
mova [r0+2*FDEC_STRIDEB], %1
69
mova [r0+3*FDEC_STRIDEB], %1
70
%endmacro
71
72
%macro STORE16 1-4
73
%if %0 > 1
74
mov r1d, 2*%0
75
.loop:
76
mova [r0+0*FDEC_STRIDEB+0*mmsize], %1
77
mova [r0+0*FDEC_STRIDEB+1*mmsize], %2
78
mova [r0+1*FDEC_STRIDEB+0*mmsize], %1
79
mova [r0+1*FDEC_STRIDEB+1*mmsize], %2
80
%ifidn %0, 4
81
mova [r0+0*FDEC_STRIDEB+2*mmsize], %3
82
mova [r0+0*FDEC_STRIDEB+3*mmsize], %4
83
mova [r0+1*FDEC_STRIDEB+2*mmsize], %3
84
mova [r0+1*FDEC_STRIDEB+3*mmsize], %4
85
add r0, 2*FDEC_STRIDEB
86
%else ; %0 == 2
87
add r0, 4*FDEC_STRIDEB
88
mova [r0-2*FDEC_STRIDEB+0*mmsize], %1
89
mova [r0-2*FDEC_STRIDEB+1*mmsize], %2
90
mova [r0-1*FDEC_STRIDEB+0*mmsize], %1
91
mova [r0-1*FDEC_STRIDEB+1*mmsize], %2
92
%endif
93
dec r1d
94
jg .loop
95
%else ; %0 == 1
96
STORE8 %1
97
%if HIGH_BIT_DEPTH ; Different code paths to reduce code size
98
add r0, 6*FDEC_STRIDEB
99
mova [r0-2*FDEC_STRIDEB], %1
100
mova [r0-1*FDEC_STRIDEB], %1
101
mova [r0+0*FDEC_STRIDEB], %1
102
mova [r0+1*FDEC_STRIDEB], %1
103
add r0, 4*FDEC_STRIDEB
104
mova [r0-2*FDEC_STRIDEB], %1
105
mova [r0-1*FDEC_STRIDEB], %1
106
mova [r0+0*FDEC_STRIDEB], %1
107
mova [r0+1*FDEC_STRIDEB], %1
108
%else
109
add r0, 8*FDEC_STRIDE
110
mova [r0-4*FDEC_STRIDE], %1
111
mova [r0-3*FDEC_STRIDE], %1
112
mova [r0-2*FDEC_STRIDE], %1
113
mova [r0-1*FDEC_STRIDE], %1
114
mova [r0+0*FDEC_STRIDE], %1
115
mova [r0+1*FDEC_STRIDE], %1
116
mova [r0+2*FDEC_STRIDE], %1
117
mova [r0+3*FDEC_STRIDE], %1
118
%endif ; HIGH_BIT_DEPTH
119
%endif
120
%endmacro
121
122
%macro PRED_H_LOAD 2 ; reg, offset
123
%if cpuflag(avx2)
124
vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL]
125
%elif HIGH_BIT_DEPTH
126
movd %1, [r0+(%2)*FDEC_STRIDEB-4]
127
SPLATW %1, %1, 1
128
%else
129
SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2
130
%endif
131
%endmacro
132
133
%macro PRED_H_STORE 3 ; reg, offset, width
134
%assign %%w %3*SIZEOF_PIXEL
135
%if %%w == 8
136
movq [r0+(%2)*FDEC_STRIDEB], %1
137
%else
138
%assign %%i 0
139
%rep %%w/mmsize
140
mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
141
%assign %%i %%i+mmsize
142
%endrep
143
%endif
144
%endmacro
145
146
%macro PRED_H_4ROWS 2 ; width, inc_ptr
147
PRED_H_LOAD m0, 0
148
PRED_H_LOAD m1, 1
149
PRED_H_STORE m0, 0, %1
150
PRED_H_STORE m1, 1, %1
151
PRED_H_LOAD m0, 2
152
%if %2
153
add r0, 4*FDEC_STRIDEB
154
%endif
155
PRED_H_LOAD m1, 3-4*%2
156
PRED_H_STORE m0, 2-4*%2, %1
157
PRED_H_STORE m1, 3-4*%2, %1
158
%endmacro
159
160
; dest, left, right, src, tmp
161
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
162
%macro PRED8x8_LOWPASS 4-5
163
%if HIGH_BIT_DEPTH
164
paddw %2, %3
165
psrlw %2, 1
166
pavgw %1, %4, %2
167
%else
168
mova %5, %2
169
pavgb %2, %3
170
pxor %3, %5
171
pand %3, [pb_1]
172
psubusb %2, %3
173
pavgb %1, %4, %2
174
%endif
175
%endmacro
176
177
;-----------------------------------------------------------------------------
178
; void predict_4x4_h( pixel *src )
179
;-----------------------------------------------------------------------------
180
%if HIGH_BIT_DEPTH
181
INIT_XMM avx2
182
cglobal predict_4x4_h, 1,1
183
PRED_H_4ROWS 4, 0
184
RET
185
%endif
186
187
;-----------------------------------------------------------------------------
188
; void predict_4x4_ddl( pixel *src )
189
;-----------------------------------------------------------------------------
190
%macro PREDICT_4x4_DDL 0
191
cglobal predict_4x4_ddl, 1,1
192
movu m1, [r0-FDEC_STRIDEB]
193
PSLLPIX m2, m1, 1
194
mova m0, m1
195
%if HIGH_BIT_DEPTH
196
PSRLPIX m1, m1, 1
197
pshufhw m1, m1, q2210
198
%else
199
pxor m1, m2
200
PSRLPIX m1, m1, 1
201
pxor m1, m0
202
%endif
203
PRED8x8_LOWPASS m0, m2, m1, m0, m3
204
205
%assign Y 0
206
%rep 4
207
PSRLPIX m0, m0, 1
208
movh [r0+Y*FDEC_STRIDEB], m0
209
%assign Y (Y+1)
210
%endrep
211
212
RET
213
%endmacro
214
215
%if HIGH_BIT_DEPTH
216
INIT_XMM sse2
217
PREDICT_4x4_DDL
218
INIT_XMM avx
219
PREDICT_4x4_DDL
220
INIT_MMX mmx2
221
cglobal predict_4x4_ddl, 1,2
222
movu m1, [r0-FDEC_STRIDEB+4]
223
PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
224
mova m3, [r0-FDEC_STRIDEB+8]
225
mova [r0+0*FDEC_STRIDEB], m0
226
pshufw m4, m3, q3321
227
PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
228
mova [r0+3*FDEC_STRIDEB], m2
229
pshufw m1, m0, q0021
230
punpckldq m1, m2
231
mova [r0+1*FDEC_STRIDEB], m1
232
psllq m0, 16
233
PALIGNR m2, m0, 6, m0
234
mova [r0+2*FDEC_STRIDEB], m2
235
RET
236
%else ; !HIGH_BIT_DEPTH
237
INIT_MMX mmx2
238
PREDICT_4x4_DDL
239
%endif
240
241
;-----------------------------------------------------------------------------
242
; void predict_4x4_vr( pixel *src )
243
;-----------------------------------------------------------------------------
244
%if HIGH_BIT_DEPTH == 0
245
INIT_MMX ssse3
246
cglobal predict_4x4_vr, 1,1
247
movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
248
mova m4, m1
249
palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt
250
pavgb m4, m1
251
palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0
252
mova m0, m1
253
palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1
254
mova m2, m1
255
palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2
256
PRED8x8_LOWPASS m2, m0, m1, m2, m3
257
pshufw m0, m2, 0
258
psrlq m2, 16
259
movd [r0+0*FDEC_STRIDEB], m4
260
palignr m4, m0, 7
261
movd [r0+1*FDEC_STRIDEB], m2
262
psllq m0, 8
263
movd [r0+2*FDEC_STRIDEB], m4
264
palignr m2, m0, 7
265
movd [r0+3*FDEC_STRIDEB], m2
266
RET
267
%endif ; !HIGH_BIT_DEPTH
268
269
;-----------------------------------------------------------------------------
270
; void predict_4x4_ddr( pixel *src )
271
;-----------------------------------------------------------------------------
272
%macro PREDICT_4x4 4
273
cglobal predict_4x4_ddr, 1,1
274
%if HIGH_BIT_DEPTH
275
movu m2, [r0-1*FDEC_STRIDEB-8]
276
pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2
277
pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1
278
pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0
279
movhps m3, [r0+3*FDEC_STRIDEB-8]
280
%else ; !HIGH_BIT_DEPTH
281
movd m0, [r0+2*FDEC_STRIDEB-4]
282
movd m1, [r0+0*FDEC_STRIDEB-4]
283
punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
284
punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
285
punpckhwd m0, m1
286
movd m2, [r0-1*FDEC_STRIDEB]
287
%if cpuflag(ssse3)
288
palignr m2, m0, 4
289
%else
290
psllq m2, 32
291
punpckhdq m0, m2
292
SWAP 2, 0
293
%endif
294
movd m3, [r0+3*FDEC_STRIDEB-4]
295
psllq m3, 32
296
%endif ; !HIGH_BIT_DEPTH
297
298
PSRLPIX m1, m2, 1
299
mova m0, m2
300
PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
301
PRED8x8_LOWPASS m0, m2, m1, m0, m3
302
%assign Y 3
303
movh [r0+Y*FDEC_STRIDEB], m0
304
%rep 3
305
%assign Y (Y-1)
306
PSRLPIX m0, m0, 1
307
movh [r0+Y*FDEC_STRIDEB], m0
308
%endrep
309
RET
310
311
;-----------------------------------------------------------------------------
312
; void predict_4x4_vr( pixel *src )
313
;-----------------------------------------------------------------------------
314
cglobal predict_4x4_vr, 1,1
315
%if HIGH_BIT_DEPTH
316
movu m1, [r0-1*FDEC_STRIDEB-8]
317
pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2
318
pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1
319
pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0
320
%else ; !HIGH_BIT_DEPTH
321
movd m0, [r0+2*FDEC_STRIDEB-4]
322
movd m1, [r0+0*FDEC_STRIDEB-4]
323
punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
324
punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
325
punpckhwd m0, m1
326
movd m1, [r0-1*FDEC_STRIDEB]
327
%if cpuflag(ssse3)
328
palignr m1, m0, 4
329
%else
330
psllq m1, 32
331
punpckhdq m0, m1
332
SWAP 1, 0
333
%endif
334
%endif ; !HIGH_BIT_DEPTH
335
PSRLPIX m2, m1, 1
336
PSRLPIX m0, m1, 2
337
pavg%1 m4, m1, m2
338
PSRLPIX m4, m4, 3
339
PRED8x8_LOWPASS m2, m0, m1, m2, m3
340
PSLLPIX m0, m2, 6
341
PSRLPIX m2, m2, 2
342
movh [r0+0*FDEC_STRIDEB], m4
343
PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3
344
movh [r0+1*FDEC_STRIDEB], m2
345
PSLLPIX m0, m0, 1
346
movh [r0+2*FDEC_STRIDEB], m4
347
PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
348
movh [r0+3*FDEC_STRIDEB], m2
349
RET
350
351
;-----------------------------------------------------------------------------
352
; void predict_4x4_hd( pixel *src )
353
;-----------------------------------------------------------------------------
354
cglobal predict_4x4_hd, 1,1
355
%if HIGH_BIT_DEPTH
356
movu m1, [r0-1*FDEC_STRIDEB-8]
357
PSLLPIX m1, m1, 1
358
pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3
359
pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2
360
pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1
361
pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0
362
%else
363
movd m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
364
punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
365
PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. ..
366
movd m1, [r0+3*FDEC_STRIDEB-4] ; l3
367
punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
368
movd m2, [r0+1*FDEC_STRIDEB-4] ; l1
369
punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
370
punpckh%3 m1, m2 ; l0 l1 l2 l3
371
punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
372
%endif
373
PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2
374
PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1
375
pavg%1 m5, m1, m2
376
PRED8x8_LOWPASS m3, m1, m0, m2, m4
377
punpckl%2 m5, m3
378
PSRLPIX m3, m3, 4
379
PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
380
%assign Y 3
381
movh [r0+Y*FDEC_STRIDEB], m5
382
%rep 2
383
%assign Y (Y-1)
384
PSRLPIX m5, m5, 2
385
movh [r0+Y*FDEC_STRIDEB], m5
386
%endrep
387
movh [r0+0*FDEC_STRIDEB], m3
388
RET
389
%endmacro ; PREDICT_4x4
390
391
;-----------------------------------------------------------------------------
392
; void predict_4x4_ddr( pixel *src )
393
;-----------------------------------------------------------------------------
394
%if HIGH_BIT_DEPTH
395
INIT_MMX mmx2
396
cglobal predict_4x4_ddr, 1,1
397
mova m0, [r0+1*FDEC_STRIDEB-8]
398
punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
399
mova m3, [r0+3*FDEC_STRIDEB-8]
400
punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
401
punpckhdq m3, m0
402
403
pshufw m0, m3, q3321
404
pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3
405
pshufw m1, m0, q3321
406
PRED8x8_LOWPASS m0, m1, m3, m0
407
movq [r0+3*FDEC_STRIDEB], m0
408
409
movq m2, [r0-1*FDEC_STRIDEB-0]
410
pshufw m4, m2, q2100
411
pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0
412
movq m1, m4
413
PALIGNR m4, m3, 6, m3
414
PRED8x8_LOWPASS m1, m4, m2, m1
415
movq [r0+0*FDEC_STRIDEB], m1
416
417
pshufw m2, m0, q3321
418
punpckldq m2, m1
419
psllq m0, 16
420
PALIGNR m1, m0, 6, m0
421
movq [r0+1*FDEC_STRIDEB], m1
422
movq [r0+2*FDEC_STRIDEB], m2
423
movd [r0+3*FDEC_STRIDEB+4], m1
424
RET
425
426
;-----------------------------------------------------------------------------
427
; void predict_4x4_hd( pixel *src )
428
;-----------------------------------------------------------------------------
429
cglobal predict_4x4_hd, 1,1
430
mova m0, [r0+1*FDEC_STRIDEB-8]
431
punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
432
mova m1, [r0+3*FDEC_STRIDEB-8]
433
punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
434
punpckhdq m1, m0
435
mova m0, m1
436
437
movu m3, [r0-1*FDEC_STRIDEB-2]
438
pshufw m4, m1, q0032
439
mova m7, m3
440
punpckldq m4, m3
441
PALIGNR m3, m1, 2, m2
442
PRED8x8_LOWPASS m2, m4, m1, m3
443
444
pavgw m0, m3
445
punpcklwd m5, m0, m2
446
punpckhwd m4, m0, m2
447
mova [r0+3*FDEC_STRIDEB], m5
448
mova [r0+1*FDEC_STRIDEB], m4
449
psrlq m5, 32
450
punpckldq m5, m4
451
mova [r0+2*FDEC_STRIDEB], m5
452
453
pshufw m4, m7, q2100
454
mova m6, [r0-1*FDEC_STRIDEB+0]
455
pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0
456
PRED8x8_LOWPASS m3, m4, m6, m7
457
PALIGNR m3, m0, 6, m0
458
mova [r0+0*FDEC_STRIDEB], m3
459
RET
460
461
INIT_XMM sse2
462
PREDICT_4x4 w, wd, dq, qdq
463
INIT_XMM ssse3
464
PREDICT_4x4 w, wd, dq, qdq
465
INIT_XMM avx
466
PREDICT_4x4 w, wd, dq, qdq
467
%else ; !HIGH_BIT_DEPTH
468
INIT_MMX mmx2
469
PREDICT_4x4 b, bw, wd, dq
470
INIT_MMX ssse3
471
%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
472
PREDICT_4x4 b, bw, wd, dq
473
%endif
474
475
;-----------------------------------------------------------------------------
476
; void predict_4x4_hu( pixel *src )
477
;-----------------------------------------------------------------------------
478
%if HIGH_BIT_DEPTH
479
INIT_MMX
480
cglobal predict_4x4_hu_mmx2, 1,1
481
movq m0, [r0+0*FDEC_STRIDEB-8]
482
punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
483
movq m1, [r0+2*FDEC_STRIDEB-8]
484
punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
485
punpckhdq m0, m1
486
pshufw m1, m1, q3333
487
movq [r0+3*FDEC_STRIDEB], m1
488
pshufw m3, m0, q3321
489
pshufw m4, m0, q3332
490
pavgw m2, m0, m3
491
PRED8x8_LOWPASS m3, m0, m4, m3
492
punpcklwd m4, m2, m3
493
mova [r0+0*FDEC_STRIDEB], m4
494
psrlq m2, 16
495
psrlq m3, 16
496
punpcklwd m2, m3
497
mova [r0+1*FDEC_STRIDEB], m2
498
punpckhdq m2, m1
499
mova [r0+2*FDEC_STRIDEB], m2
500
RET
501
502
%else ; !HIGH_BIT_DEPTH
503
INIT_MMX
504
cglobal predict_4x4_hu_mmx2, 1,1
505
movd m1, [r0+0*FDEC_STRIDEB-4]
506
punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
507
movd m0, [r0+2*FDEC_STRIDEB-4]
508
punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
509
punpckhwd m1, m0
510
movq m0, m1
511
punpckhbw m1, m1
512
pshufw m1, m1, q3333
513
punpckhdq m0, m1
514
movq m2, m0
515
movq m3, m0
516
movq m5, m0
517
psrlq m3, 8
518
psrlq m2, 16
519
pavgb m5, m3
520
PRED8x8_LOWPASS m3, m0, m2, m3, m4
521
movd [r0+3*FDEC_STRIDEB], m1
522
punpcklbw m5, m3
523
movd [r0+0*FDEC_STRIDEB], m5
524
psrlq m5, 16
525
movd [r0+1*FDEC_STRIDEB], m5
526
psrlq m5, 16
527
movd [r0+2*FDEC_STRIDEB], m5
528
RET
529
%endif ; HIGH_BIT_DEPTH
530
531
;-----------------------------------------------------------------------------
532
; void predict_4x4_vl( pixel *src )
533
;-----------------------------------------------------------------------------
534
%macro PREDICT_4x4_V1 1
535
cglobal predict_4x4_vl, 1,1
536
movu m1, [r0-FDEC_STRIDEB]
537
PSRLPIX m3, m1, 1
538
PSRLPIX m2, m1, 2
539
pavg%1 m4, m3, m1
540
PRED8x8_LOWPASS m0, m1, m2, m3, m5
541
542
movh [r0+0*FDEC_STRIDEB], m4
543
movh [r0+1*FDEC_STRIDEB], m0
544
PSRLPIX m4, m4, 1
545
PSRLPIX m0, m0, 1
546
movh [r0+2*FDEC_STRIDEB], m4
547
movh [r0+3*FDEC_STRIDEB], m0
548
RET
549
%endmacro
550
551
%if HIGH_BIT_DEPTH
552
INIT_XMM sse2
553
PREDICT_4x4_V1 w
554
INIT_XMM avx
555
PREDICT_4x4_V1 w
556
557
INIT_MMX mmx2
558
cglobal predict_4x4_vl, 1,4
559
mova m1, [r0-FDEC_STRIDEB+0]
560
mova m2, [r0-FDEC_STRIDEB+8]
561
mova m0, m2
562
PALIGNR m2, m1, 4, m4
563
PALIGNR m0, m1, 2, m4
564
mova m3, m0
565
pavgw m3, m1
566
mova [r0+0*FDEC_STRIDEB], m3
567
psrlq m3, 16
568
mova [r0+2*FDEC_STRIDEB], m3
569
PRED8x8_LOWPASS m0, m1, m2, m0
570
mova [r0+1*FDEC_STRIDEB], m0
571
psrlq m0, 16
572
mova [r0+3*FDEC_STRIDEB], m0
573
574
movzx r1d, word [r0-FDEC_STRIDEB+ 8]
575
movzx r2d, word [r0-FDEC_STRIDEB+10]
576
movzx r3d, word [r0-FDEC_STRIDEB+12]
577
lea r1d, [r1+r2+1]
578
add r3d, r2d
579
lea r3d, [r3+r1+1]
580
shr r1d, 1
581
shr r3d, 2
582
mov [r0+2*FDEC_STRIDEB+6], r1w
583
mov [r0+3*FDEC_STRIDEB+6], r3w
584
RET
585
%else ; !HIGH_BIT_DEPTH
586
INIT_MMX mmx2
587
PREDICT_4x4_V1 b
588
%endif
589
590
;-----------------------------------------------------------------------------
591
; void predict_4x4_dc( pixel *src )
592
;-----------------------------------------------------------------------------
593
INIT_MMX mmx2
594
%if HIGH_BIT_DEPTH
595
cglobal predict_4x4_dc, 1,1
596
mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
597
paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
598
paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
599
paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
600
psrlq m2, 48
601
mova m0, [r0-FDEC_STRIDEB]
602
HADDW m0, m1
603
paddw m0, [pw_4]
604
paddw m0, m2
605
psrlw m0, 3
606
SPLATW m0, m0
607
mova [r0+0*FDEC_STRIDEB], m0
608
mova [r0+1*FDEC_STRIDEB], m0
609
mova [r0+2*FDEC_STRIDEB], m0
610
mova [r0+3*FDEC_STRIDEB], m0
611
RET
612
613
%else ; !HIGH_BIT_DEPTH
614
cglobal predict_4x4_dc, 1,4
615
pxor mm7, mm7
616
movd mm0, [r0-FDEC_STRIDEB]
617
psadbw mm0, mm7
618
movd r3d, mm0
619
movzx r1d, byte [r0-1]
620
%assign Y 1
621
%rep 3
622
movzx r2d, byte [r0+FDEC_STRIDEB*Y-1]
623
add r1d, r2d
624
%assign Y Y+1
625
%endrep
626
lea r1d, [r1+r3+4]
627
shr r1d, 3
628
imul r1d, 0x01010101
629
mov [r0+FDEC_STRIDEB*0], r1d
630
mov [r0+FDEC_STRIDEB*1], r1d
631
mov [r0+FDEC_STRIDEB*2], r1d
632
mov [r0+FDEC_STRIDEB*3], r1d
633
RET
634
%endif ; HIGH_BIT_DEPTH
635
636
%macro PREDICT_FILTER 4
637
;-----------------------------------------------------------------------------
638
;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
639
;-----------------------------------------------------------------------------
640
cglobal predict_8x8_filter, 4,6,6
641
add r0, 0x58*SIZEOF_PIXEL
642
%define src r0-0x58*SIZEOF_PIXEL
643
%if ARCH_X86_64 == 0
644
mov r4, r1
645
%define t1 r4
646
%define t4 r1
647
%else
648
%define t1 r1
649
%define t4 r4
650
%endif
651
test r3b, 1
652
je .check_top
653
mov t4d, r2d
654
and t4d, 8
655
neg t4
656
mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
657
punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
658
mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
659
punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
660
punpckh%2%3 m1, m0
661
mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
662
punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
663
mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
664
punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
665
punpckh%2%3 m3, m2
666
punpckh%3%4 m3, m1
667
mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
668
mova m1, [src-1*FDEC_STRIDEB]
669
PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0
670
PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2
671
PRED8x8_LOWPASS m3, m1, m4, m3, m5
672
mova [t1+8*SIZEOF_PIXEL], m3
673
movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
674
movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
675
lea t4d, [t4*3+2]
676
add t4d, r5d
677
shr t4d, 2
678
mov [t1+7*SIZEOF_PIXEL], t4%1
679
mov [t1+6*SIZEOF_PIXEL], t4%1
680
test r3b, 2
681
je .done
682
.check_top:
683
%if SIZEOF_PIXEL==1 && cpuflag(ssse3)
684
INIT_XMM cpuname
685
movu m3, [src-1*FDEC_STRIDEB]
686
movhps m0, [src-1*FDEC_STRIDEB-8]
687
test r2b, 8
688
je .fix_lt_2
689
.do_top:
690
and r2d, 4
691
%ifdef PIC
692
lea r3, [shuf_fixtr]
693
pshufb m3, [r3+r2*4]
694
%else
695
pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
696
%endif
697
psrldq m1, m3, 15
698
PALIGNR m2, m3, m0, 15, m0
699
PALIGNR m1, m3, 1, m5
700
PRED8x8_LOWPASS m0, m2, m1, m3, m5
701
mova [t1+16*SIZEOF_PIXEL], m0
702
psrldq m0, 15
703
movd [t1+32*SIZEOF_PIXEL], m0
704
.done:
705
REP_RET
706
.fix_lt_2:
707
pslldq m0, m3, 15
708
jmp .do_top
709
710
%else
711
mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
712
mova m3, [src-1*FDEC_STRIDEB]
713
mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
714
test r2b, 8
715
je .fix_lt_2
716
test r2b, 4
717
je .fix_tr_1
718
.do_top:
719
PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0
720
PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5
721
PRED8x8_LOWPASS m4, m2, m0, m3, m5
722
mova [t1+16*SIZEOF_PIXEL], m4
723
test r3b, 4
724
je .done
725
PSRLPIX m5, m1, 7
726
PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3
727
PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4
728
PRED8x8_LOWPASS m0, m2, m5, m1, m4
729
mova [t1+24*SIZEOF_PIXEL], m0
730
PSRLPIX m0, m0, 7
731
movd [t1+32*SIZEOF_PIXEL], m0
732
.done:
733
REP_RET
734
.fix_lt_2:
735
PSLLPIX m0, m3, 7
736
test r2b, 4
737
jne .do_top
738
.fix_tr_1:
739
punpckh%1%2 m1, m3, m3
740
pshuf%2 m1, m1, q3333
741
jmp .do_top
742
%endif
743
%endmacro
744
745
%if HIGH_BIT_DEPTH
746
INIT_XMM sse2
747
PREDICT_FILTER w, d, q, dq
748
INIT_XMM ssse3
749
PREDICT_FILTER w, d, q, dq
750
INIT_XMM avx
751
PREDICT_FILTER w, d, q, dq
752
%else
753
INIT_MMX mmx2
754
PREDICT_FILTER b, w, d, q
755
INIT_MMX ssse3
756
PREDICT_FILTER b, w, d, q
757
%endif
758
759
;-----------------------------------------------------------------------------
760
; void predict_8x8_v( pixel *src, pixel *edge )
761
;-----------------------------------------------------------------------------
762
%macro PREDICT_8x8_V 0
763
cglobal predict_8x8_v, 2,2
764
mova m0, [r1+16*SIZEOF_PIXEL]
765
STORE8 m0
766
RET
767
%endmacro
768
769
%if HIGH_BIT_DEPTH
770
INIT_XMM sse
771
PREDICT_8x8_V
772
%else
773
INIT_MMX mmx2
774
PREDICT_8x8_V
775
%endif
776
777
;-----------------------------------------------------------------------------
778
; void predict_8x8_h( pixel *src, pixel edge[36] )
779
;-----------------------------------------------------------------------------
780
%macro PREDICT_8x8_H 2
781
cglobal predict_8x8_h, 2,2
782
movu m1, [r1+7*SIZEOF_PIXEL]
783
add r0, 4*FDEC_STRIDEB
784
punpckl%1 m2, m1, m1
785
punpckh%1 m1, m1
786
%assign Y 0
787
%rep 8
788
%assign i 1+Y/4
789
SPLAT%2 m0, m %+ i, (3-Y)&3
790
mova [r0+(Y-4)*FDEC_STRIDEB], m0
791
%assign Y Y+1
792
%endrep
793
RET
794
%endmacro
795
796
%if HIGH_BIT_DEPTH
797
INIT_XMM sse2
798
PREDICT_8x8_H wd, D
799
%else
800
INIT_MMX mmx2
801
PREDICT_8x8_H bw, W
802
%endif
803
804
;-----------------------------------------------------------------------------
805
; void predict_8x8_dc( pixel *src, pixel *edge );
806
;-----------------------------------------------------------------------------
807
%if HIGH_BIT_DEPTH
808
INIT_XMM sse2
809
cglobal predict_8x8_dc, 2,2
810
movu m0, [r1+14]
811
paddw m0, [r1+32]
812
HADDW m0, m1
813
paddw m0, [pw_8]
814
psrlw m0, 4
815
SPLATW m0, m0
816
STORE8 m0
817
RET
818
819
%else ; !HIGH_BIT_DEPTH
820
INIT_MMX mmx2
821
cglobal predict_8x8_dc, 2,2
822
pxor mm0, mm0
823
pxor mm1, mm1
824
psadbw mm0, [r1+7]
825
psadbw mm1, [r1+16]
826
paddw mm0, [pw_8]
827
paddw mm0, mm1
828
psrlw mm0, 4
829
pshufw mm0, mm0, 0
830
packuswb mm0, mm0
831
STORE8 mm0
832
RET
833
%endif ; HIGH_BIT_DEPTH
834
835
;-----------------------------------------------------------------------------
836
; void predict_8x8_dc_top ( pixel *src, pixel *edge );
837
; void predict_8x8_dc_left( pixel *src, pixel *edge );
838
;-----------------------------------------------------------------------------
839
%if HIGH_BIT_DEPTH
840
%macro PREDICT_8x8_DC 3
841
cglobal %1, 2,2
842
%3 m0, [r1+%2]
843
HADDW m0, m1
844
paddw m0, [pw_4]
845
psrlw m0, 3
846
SPLATW m0, m0
847
STORE8 m0
848
RET
849
%endmacro
850
INIT_XMM sse2
851
PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
852
PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
853
854
%else ; !HIGH_BIT_DEPTH
855
%macro PREDICT_8x8_DC 2
856
cglobal %1, 2,2
857
pxor mm0, mm0
858
psadbw mm0, [r1+%2]
859
paddw mm0, [pw_4]
860
psrlw mm0, 3
861
pshufw mm0, mm0, 0
862
packuswb mm0, mm0
863
STORE8 mm0
864
RET
865
%endmacro
866
INIT_MMX
867
PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
868
PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
869
%endif ; HIGH_BIT_DEPTH
870
871
; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
872
; size on the 8-bit mmx functions below if we know sse2 is available.
873
%macro PREDICT_8x8_DDLR 0
874
;-----------------------------------------------------------------------------
875
; void predict_8x8_ddl( pixel *src, pixel *edge )
876
;-----------------------------------------------------------------------------
877
cglobal predict_8x8_ddl, 2,2,7
878
mova m0, [r1+16*SIZEOF_PIXEL]
879
mova m1, [r1+24*SIZEOF_PIXEL]
880
%if cpuflag(cache64)
881
movd m5, [r1+32*SIZEOF_PIXEL]
882
palignr m3, m1, m0, 1*SIZEOF_PIXEL
883
palignr m5, m5, m1, 1*SIZEOF_PIXEL
884
palignr m4, m1, m0, 7*SIZEOF_PIXEL
885
%else
886
movu m3, [r1+17*SIZEOF_PIXEL]
887
movu m4, [r1+23*SIZEOF_PIXEL]
888
movu m5, [r1+25*SIZEOF_PIXEL]
889
%endif
890
PSLLPIX m2, m0, 1
891
add r0, FDEC_STRIDEB*4
892
PRED8x8_LOWPASS m0, m2, m3, m0, m6
893
PRED8x8_LOWPASS m1, m4, m5, m1, m6
894
mova [r0+3*FDEC_STRIDEB], m1
895
%assign Y 2
896
%rep 6
897
PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
898
PSLLPIX m0, m0, 1
899
mova [r0+Y*FDEC_STRIDEB], m1
900
%assign Y (Y-1)
901
%endrep
902
PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
903
mova [r0+Y*FDEC_STRIDEB], m1
904
RET
905
906
;-----------------------------------------------------------------------------
907
; void predict_8x8_ddr( pixel *src, pixel *edge )
908
;-----------------------------------------------------------------------------
909
cglobal predict_8x8_ddr, 2,2,7
910
add r0, FDEC_STRIDEB*4
911
mova m0, [r1+ 8*SIZEOF_PIXEL]
912
mova m1, [r1+16*SIZEOF_PIXEL]
913
; edge[] is 32byte aligned, so some of the unaligned loads are known to be not cachesplit
914
movu m2, [r1+ 7*SIZEOF_PIXEL]
915
movu m5, [r1+17*SIZEOF_PIXEL]
916
%if cpuflag(cache64)
917
palignr m3, m1, m0, 1*SIZEOF_PIXEL
918
palignr m4, m1, m0, 7*SIZEOF_PIXEL
919
%else
920
movu m3, [r1+ 9*SIZEOF_PIXEL]
921
movu m4, [r1+15*SIZEOF_PIXEL]
922
%endif
923
PRED8x8_LOWPASS m0, m2, m3, m0, m6
924
PRED8x8_LOWPASS m1, m4, m5, m1, m6
925
mova [r0+3*FDEC_STRIDEB], m0
926
%assign Y -4
927
%rep 6
928
PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
929
PSLLPIX m0, m0, 1
930
mova [r0+Y*FDEC_STRIDEB], m1
931
%assign Y (Y+1)
932
%endrep
933
PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
934
mova [r0+Y*FDEC_STRIDEB], m1
935
RET
936
%endmacro ; PREDICT_8x8_DDLR
937
938
%if HIGH_BIT_DEPTH
939
INIT_XMM sse2
940
PREDICT_8x8_DDLR
941
INIT_XMM ssse3
942
PREDICT_8x8_DDLR
943
INIT_XMM ssse3, cache64
944
PREDICT_8x8_DDLR
945
%elif ARCH_X86_64 == 0
946
INIT_MMX mmx2
947
PREDICT_8x8_DDLR
948
%endif
949
950
;-----------------------------------------------------------------------------
951
; void predict_8x8_hu( pixel *src, pixel *edge )
952
;-----------------------------------------------------------------------------
953
%macro PREDICT_8x8_HU 2
954
cglobal predict_8x8_hu, 2,2,8
955
add r0, 4*FDEC_STRIDEB
956
%if HIGH_BIT_DEPTH
957
%if cpuflag(ssse3)
958
movu m5, [r1+7*SIZEOF_PIXEL]
959
pshufb m5, [pw_reverse]
960
%else
961
movq m6, [r1+7*SIZEOF_PIXEL]
962
movq m5, [r1+11*SIZEOF_PIXEL]
963
pshuflw m6, m6, q0123
964
pshuflw m5, m5, q0123
965
movlhps m5, m6
966
%endif ; cpuflag
967
psrldq m2, m5, 2
968
pshufd m3, m5, q0321
969
pshufhw m2, m2, q2210
970
pshufhw m3, m3, q1110
971
pavgw m4, m5, m2
972
%else ; !HIGH_BIT_DEPTH
973
movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
974
pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
975
psllq m1, 56 ; l7 .. .. .. .. .. .. ..
976
mova m2, m0
977
psllw m0, 8
978
psrlw m2, 8
979
por m2, m0
980
mova m3, m2
981
mova m4, m2
982
mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0
983
psrlq m3, 16
984
psrlq m2, 8
985
por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
986
punpckhbw m1, m1
987
por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
988
pavgb m4, m2
989
%endif ; !HIGH_BIT_DEPTH
990
PRED8x8_LOWPASS m2, m3, m5, m2, m6
991
punpckh%2 m0, m4, m2 ; p8 p7 p6 p5
992
punpckl%2 m4, m2 ; p4 p3 p2 p1
993
PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3
994
pshuf%1 m1, m0, q3321
995
PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3
996
pshuf%1 m2, m0, q3332
997
PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3
998
pshuf%1 m3, m0, q3333
999
mova [r0-4*FDEC_STRIDEB], m4
1000
mova [r0-3*FDEC_STRIDEB], m5
1001
mova [r0-2*FDEC_STRIDEB], m6
1002
mova [r0-1*FDEC_STRIDEB], m7
1003
mova [r0+0*FDEC_STRIDEB], m0
1004
mova [r0+1*FDEC_STRIDEB], m1
1005
mova [r0+2*FDEC_STRIDEB], m2
1006
mova [r0+3*FDEC_STRIDEB], m3
1007
RET
1008
%endmacro
1009
1010
%if HIGH_BIT_DEPTH
1011
INIT_XMM sse2
1012
PREDICT_8x8_HU d, wd
1013
INIT_XMM ssse3
1014
PREDICT_8x8_HU d, wd
1015
INIT_XMM avx
1016
PREDICT_8x8_HU d, wd
1017
%elif ARCH_X86_64 == 0
1018
INIT_MMX mmx2
1019
PREDICT_8x8_HU w, bw
1020
%endif
1021
1022
;-----------------------------------------------------------------------------
1023
; void predict_8x8_vr( pixel *src, pixel *edge )
1024
;-----------------------------------------------------------------------------
1025
%macro PREDICT_8x8_VR 1
1026
cglobal predict_8x8_vr, 2,3
1027
mova m2, [r1+16*SIZEOF_PIXEL]
1028
%ifidn cpuname, ssse3
1029
mova m0, [r1+8*SIZEOF_PIXEL]
1030
palignr m3, m2, m0, 7*SIZEOF_PIXEL
1031
palignr m1, m2, m0, 6*SIZEOF_PIXEL
1032
%else
1033
movu m3, [r1+15*SIZEOF_PIXEL]
1034
movu m1, [r1+14*SIZEOF_PIXEL]
1035
%endif
1036
pavg%1 m4, m3, m2
1037
add r0, FDEC_STRIDEB*4
1038
PRED8x8_LOWPASS m3, m1, m2, m3, m5
1039
mova [r0-4*FDEC_STRIDEB], m4
1040
mova [r0-3*FDEC_STRIDEB], m3
1041
mova m1, [r1+8*SIZEOF_PIXEL]
1042
PSLLPIX m0, m1, 1
1043
PSLLPIX m2, m1, 2
1044
PRED8x8_LOWPASS m0, m1, m2, m0, m6
1045
1046
%assign Y -2
1047
%rep 5
1048
PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5
1049
mova [r0+Y*FDEC_STRIDEB], m4
1050
PSLLPIX m0, m0, 1
1051
SWAP 3, 4
1052
%assign Y (Y+1)
1053
%endrep
1054
PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
1055
mova [r0+Y*FDEC_STRIDEB], m4
1056
RET
1057
%endmacro
1058
1059
%if HIGH_BIT_DEPTH
1060
INIT_XMM sse2
1061
PREDICT_8x8_VR w
1062
INIT_XMM ssse3
1063
PREDICT_8x8_VR w
1064
INIT_XMM avx
1065
PREDICT_8x8_VR w
1066
%elif ARCH_X86_64 == 0
1067
INIT_MMX mmx2
1068
PREDICT_8x8_VR b
1069
%endif
1070
1071
%macro LOAD_PLANE_ARGS 0
1072
%if cpuflag(avx2) && ARCH_X86_64 == 0
1073
vpbroadcastw m0, r1m
1074
vpbroadcastw m2, r2m
1075
vpbroadcastw m4, r3m
1076
%elif mmsize == 8 ; MMX is only used on x86_32
1077
SPLATW m0, r1m
1078
SPLATW m2, r2m
1079
SPLATW m4, r3m
1080
%else
1081
movd xm0, r1m
1082
movd xm2, r2m
1083
movd xm4, r3m
1084
SPLATW m0, xm0
1085
SPLATW m2, xm2
1086
SPLATW m4, xm4
1087
%endif
1088
%endmacro
1089
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
; 8-bit MMX plane prediction for 8x8 / 8x16 chroma blocks (x86_32 only).
; Each row r, column x is (i00 + b*x + c*r) >> 5, clipped to bytes by
; packuswb; c (in m4) is added once per row.
%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
%macro PREDICT_CHROMA_P_MMX 1
cglobal predict_8x%1c_p_core, 1,2
    LOAD_PLANE_ARGS
    movq        m1, m2
    pmullw      m2, [pw_0to15]
    psllw       m1, 2
    paddsw      m0, m2          ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
    paddsw      m1, m0          ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
    mov         r1d, %1         ; row counter (8 or 16)
ALIGN 4
.loop:
    movq        m5, m0
    movq        m6, m1
    psraw       m5, 5
    psraw       m6, 5
    packuswb    m5, m6
    movq        [r0], m5

    paddsw      m0, m4          ; advance one row: += c
    paddsw      m1, m4
    add         r0, FDEC_STRIDE
    dec         r1d
    jg .loop
    RET
%endmacro ; PREDICT_CHROMA_P_MMX

INIT_MMX mmx2
PREDICT_CHROMA_P_MMX 8
PREDICT_CHROMA_P_MMX 16
%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
; SSE2/AVX/AVX2 plane prediction for 8x8 / 8x16 chroma blocks.
; %1 = block height (8 or 16).  High-bit-depth results are clamped with
; CLIPW against [0, pw_pixel_max]; 8-bit results are clamped by packuswb.
; The mmsize == 32 (AVX2) paths process two rows per iteration.
%macro PREDICT_CHROMA_P 1
%if HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2,7
    LOAD_PLANE_ARGS
    mova        m3, [pw_pixel_max]
    pxor        m1, m1
    pmullw      m2, [pw_43210123] ; b
%if %1 == 16
    pmullw      m5, m4, [pw_m7]   ; c
%else
    pmullw      m5, m4, [pw_m3]
%endif
    paddw       m5, [pw_16]
%if mmsize == 32
    mova       xm6, xm4           ; zero upper half of m6
    paddw       m4, m4            ; 2*c: two rows per iteration
    paddw       m5, m6
%endif
    mov         r1d, %1/(mmsize/16)
.loop:
    paddsw      m6, m2, m5
    paddsw      m6, m0
    psraw       m6, 5
    CLIPW       m6, m1, m3        ; clamp to [0, pixel_max]
    paddw       m5, m4
%if mmsize == 32
    vextracti128 [r0], m6, 1
    mova        [r0+FDEC_STRIDEB], xm6
    add         r0, 2*FDEC_STRIDEB
%else
    mova        [r0], m6
    add         r0, FDEC_STRIDEB
%endif
    dec         r1d
    jg .loop
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
    LOAD_PLANE_ARGS
%if mmsize == 32
    vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
    pmullw      m2, m1
    mova       xm1, xm4           ; zero upper half
    paddsw      m4, m4
    paddsw      m0, m1
%else
    pmullw      m2, [pw_0to15]
%endif
    paddsw      m0, m2            ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
    paddsw      m1, m0, m4
    paddsw      m4, m4
    mov         r1d, %1/(mmsize/8)
.loop:
    psraw       m2, m0, 5
    psraw       m3, m1, 5
    paddsw      m0, m4
    paddsw      m1, m4
    packuswb    m2, m3
%if mmsize == 32
    movq        [r0+FDEC_STRIDE*1], xm2
    movhps      [r0+FDEC_STRIDE*3], xm2
    vextracti128 xm2, m2, 1
    movq        [r0+FDEC_STRIDE*0], xm2
    movhps      [r0+FDEC_STRIDE*2], xm2
%else
    movq        [r0+FDEC_STRIDE*0], xm2
    movhps      [r0+FDEC_STRIDE*1], xm2
%endif
    add         r0, FDEC_STRIDE*mmsize/8
    dec         r1d
    jg .loop
    RET
%endif ; HIGH_BIT_DEPTH
%endmacro ; PREDICT_CHROMA_P

INIT_XMM sse2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_XMM avx
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_YMM avx2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
; 8-bit MMX plane prediction for a 16x16 luma block (x86_32 only).
; Four MMX registers hold the 16 per-column bases; c (m4) is added per row.
%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
INIT_MMX mmx2
cglobal predict_16x16_p_core, 1,2
    LOAD_PLANE_ARGS
    movq        mm5, mm2
    movq        mm1, mm2
    pmullw      mm5, [pw_0to15]
    psllw       mm2, 3          ; 8*b
    psllw       mm1, 2          ; 4*b
    movq        mm3, mm2
    paddsw      mm0, mm5        ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
    paddsw      mm1, mm0        ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw      mm2, mm0        ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
    paddsw      mm3, mm1        ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}

    mov         r1d, 16
ALIGN 4
.loop:
    movq        mm5, mm0
    movq        mm6, mm1
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0], mm5

    movq        mm5, mm2
    movq        mm6, mm3
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0+8], mm5

    paddsw      mm0, mm4        ; next row: += c
    paddsw      mm1, mm4
    paddsw      mm2, mm4
    paddsw      mm3, mm4
    add         r0, FDEC_STRIDE
    dec         r1d
    jg .loop
    RET
%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64
; SSE2/AVX 16x16 plane prediction.  Args come from the stack (r1m = i00,
; r2m = b, r3m = c) and are splatted to all words.  The high-bit-depth
; path clamps with CLIPW and emits one row per iteration; the 8-bit path
; relies on packuswb and emits two rows per iteration.
%macro PREDICT_16x16_P 0
cglobal predict_16x16_p_core, 1,2,8
    movd        m0, r1m
    movd        m1, r2m
    movd        m2, r3m
    SPLATW      m0, m0, 0
    SPLATW      m1, m1, 0
    SPLATW      m2, m2, 0
    pmullw      m3, m1, [pw_0to15]
    psllw       m1, 3           ; 8*b
%if HIGH_BIT_DEPTH
    pxor        m6, m6          ; running r*c accumulator
    mov         r1d, 16
.loop:
    mova        m4, m0
    mova        m5, m0
    mova        m7, m3
    paddsw      m7, m6
    paddsw      m4, m7          ; left 8 columns
    paddsw      m7, m1
    paddsw      m5, m7          ; right 8 columns
    psraw       m4, 5
    psraw       m5, 5
    CLIPW       m4, [pb_0], [pw_pixel_max]
    CLIPW       m5, [pb_0], [pw_pixel_max]
    mova        [r0], m4
    mova        [r0+16], m5
    add         r0, FDEC_STRIDEB
    paddw       m6, m2
%else ; !HIGH_BIT_DEPTH
    paddsw      m0, m3          ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw      m1, m0          ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
    paddsw      m7, m2, m2      ; 2*c: two rows per iteration
    mov         r1d, 8
ALIGN 4
.loop:
    psraw       m3, m0, 5
    psraw       m4, m1, 5
    paddsw      m5, m0, m2
    paddsw      m6, m1, m2
    psraw       m5, 5
    psraw       m6, 5
    packuswb    m3, m4
    packuswb    m5, m6
    mova        [r0+FDEC_STRIDE*0], m3
    mova        [r0+FDEC_STRIDE*1], m5
    paddsw      m0, m7
    paddsw      m1, m7
    add         r0, FDEC_STRIDE*2
%endif ; !HIGH_BIT_DEPTH
    dec         r1d
    jg .loop
    RET
%endmacro ; PREDICT_16x16_P

INIT_XMM sse2
PREDICT_16x16_P
%if HIGH_BIT_DEPTH == 0
INIT_XMM avx
PREDICT_16x16_P
%endif
; AVX2 16x16 plane prediction.  High-bit-depth: one full 16-word row per
; ymm store, two rows per iteration, clamped with CLIPW.  8-bit: rows are
; interleaved across the two 128-bit lanes and written out four per
; iteration via vextracti128 + mova.
INIT_YMM avx2
cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
    LOAD_PLANE_ARGS
%if HIGH_BIT_DEPTH
    pmullw      m2, [pw_0to15]
    pxor        m5, m5          ; running r*c accumulator
    pxor        m6, m6          ; zero, for CLIPW lower bound
    mova        m7, [pw_pixel_max]
    mov         r1d, 8
.loop:
    paddsw      m1, m2, m5
    paddw       m5, m4
    paddsw      m1, m0
    paddsw      m3, m2, m5
    psraw       m1, 5
    paddsw      m3, m0
    psraw       m3, 5
    CLIPW       m1, m6, m7
    mova        [r0+0*FDEC_STRIDEB], m1
    CLIPW       m3, m6, m7
    mova        [r0+1*FDEC_STRIDEB], m3
    paddw       m5, m4
    add         r0, 2*FDEC_STRIDEB
%else ; !HIGH_BIT_DEPTH
    vbroadcasti128 m1, [pw_0to15]
    mova       xm3, xm4         ; zero high bits
    pmullw      m1, m2
    psllw       m2, 3
    paddsw      m0, m3
    paddsw      m0, m1          ; X+1*C X+0*C
    paddsw      m1, m0, m2      ; Y+1*C Y+0*C
    paddsw      m4, m4
    mov         r1d, 4
.loop:
    psraw       m2, m0, 5
    psraw       m3, m1, 5
    paddsw      m0, m4
    paddsw      m1, m4
    packuswb    m2, m3          ; X+1*C Y+1*C X+0*C Y+0*C
    vextracti128 [r0+0*FDEC_STRIDE], m2, 1
    mova        [r0+1*FDEC_STRIDE], xm2
    psraw       m2, m0, 5
    psraw       m3, m1, 5
    paddsw      m0, m4
    paddsw      m1, m4
    packuswb    m2, m3          ; X+3*C Y+3*C X+2*C Y+2*C
    vextracti128 [r0+2*FDEC_STRIDE], m2, 1
    mova        [r0+3*FDEC_STRIDE], xm2
    add         r0, FDEC_STRIDE*4
%endif ; !HIGH_BIT_DEPTH
    dec         r1d
    jg .loop
    RET
%if HIGH_BIT_DEPTH == 0
; 8-bit 8x8 directional intra predictors (ddl/ddr/vl/vr), instantiated
; for sse2/ssse3/avx.  The ddr and vl versions are skipped for ssse3
; (the macro guards them with %ifnidn) — only ddl and vr differ there.
%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2
    mova        m0, [r1+16]
%ifidn cpuname, ssse3
    movd        m2, [r1+32]
    palignr     m2, m0, 1
%else
    movu        m2, [r1+17]
%endif
    pslldq      m1, m0, 1
    add         r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3

%assign Y -4
%rep 8
    psrldq      m0, 1
    movq        [r0+Y*FDEC_STRIDE], m0
%assign Y (Y+1)
%endrep
    RET

%ifnidn cpuname, ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2
    movu        m0, [r1+8]
    movu        m1, [r1+7]
    psrldq      m2, m0, 1
    add         r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3

    psrldq      m1, m0, 1
%assign Y 3
%rep 3
    movq        [r0+ Y   *FDEC_STRIDE], m0
    movq        [r0+(Y-1)*FDEC_STRIDE], m1
    psrldq      m0, 2
    psrldq      m1, 2
%assign Y (Y-2)
%endrep
    movq        [r0-3*FDEC_STRIDE], m0
    movq        [r0-4*FDEC_STRIDE], m1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl, 2,2
    mova        m0, [r1+16]
    pslldq      m1, m0, 1
    psrldq      m2, m0, 1
    pavgb       m3, m0, m2
    add         r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m5
; m0: (t0 + 2*t1 + t2 + 2) >> 2
; m3: (t0 + t1 + 1) >> 1

%assign Y -4
%rep 3
    psrldq      m0, 1
    movq        [r0+ Y   *FDEC_STRIDE], m3
    movq        [r0+(Y+1)*FDEC_STRIDE], m0
    psrldq      m3, 1
%assign Y (Y+2)
%endrep
    psrldq      m0, 1
    movq        [r0+ Y   *FDEC_STRIDE], m3
    movq        [r0+(Y+1)*FDEC_STRIDE], m0
    RET
%endif ; !ssse3

;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr, 2,2
    movu        m2, [r1+8]
    add         r0, 4*FDEC_STRIDE
    pslldq      m1, m2, 2
    pslldq      m0, m2, 1
    pavgb       m3, m2, m0
    PRED8x8_LOWPASS m0, m2, m1, m0, m4
    movhps      [r0-4*FDEC_STRIDE], m3
    movhps      [r0-3*FDEC_STRIDE], m0
%if cpuflag(ssse3)
    punpckhqdq  m3, m3
    pshufb      m0, [shuf_vr]
    palignr     m3, m0, 13
%else
    mova        m2, m0
    mova        m1, [pw_00ff]
    pand        m1, m0
    psrlw       m0, 8
    packuswb    m1, m0
    pslldq      m1, 4
    movhlps     m3, m1
    shufps      m1, m2, q3210
    psrldq      m3, 5
    psrldq      m1, 5
    SWAP 0, 1
%endif
    movq        [r0+3*FDEC_STRIDE], m0
    movq        [r0+2*FDEC_STRIDE], m3
    psrldq      m0, 1
    psrldq      m3, 1
    movq        [r0+1*FDEC_STRIDE], m0
    movq        [r0+0*FDEC_STRIDE], m3
    psrldq      m0, 1
    psrldq      m3, 1
    movq        [r0-1*FDEC_STRIDE], m0
    movq        [r0-2*FDEC_STRIDE], m3
    RET
%endmacro ; PREDICT_8x8

INIT_XMM sse2
PREDICT_8x8
INIT_XMM ssse3
PREDICT_8x8
INIT_XMM avx
PREDICT_8x8

%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_vl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
; Vertical-left prediction; %1 is the pavg suffix ('w' for high bit depth,
; 'b' for 8-bit MMX).  Even rows come from the pavg of adjacent top pixels
; (m6/m7); odd rows from the 1-2-1 lowpass (m0/m1), shifted left per row.
%macro PREDICT_8x8_VL_10 1
cglobal predict_8x8_vl, 2,2,8
    mova        m0, [r1+16*SIZEOF_PIXEL]
    mova        m1, [r1+24*SIZEOF_PIXEL]
    PALIGNR     m2, m1, m0, SIZEOF_PIXEL*1, m4
    PSRLPIX     m4, m1, 1
    pavg%1      m6, m0, m2
    pavg%1      m7, m1, m4
    add         r0, FDEC_STRIDEB*4
    mova        [r0-4*FDEC_STRIDEB], m6
    PALIGNR     m3, m7, m6, SIZEOF_PIXEL*1, m5
    mova        [r0-2*FDEC_STRIDEB], m3
    PALIGNR     m3, m7, m6, SIZEOF_PIXEL*2, m5
    mova        [r0+0*FDEC_STRIDEB], m3
    PALIGNR     m7, m7, m6, SIZEOF_PIXEL*3, m5
    mova        [r0+2*FDEC_STRIDEB], m7
    PALIGNR     m3, m1, m0, SIZEOF_PIXEL*7, m6
    PSLLPIX     m5, m0, 1
    PRED8x8_LOWPASS m0, m5, m2, m0, m7
    PRED8x8_LOWPASS m1, m3, m4, m1, m7
    PALIGNR     m4, m1, m0, SIZEOF_PIXEL*1, m2
    mova        [r0-3*FDEC_STRIDEB], m4
    PALIGNR     m4, m1, m0, SIZEOF_PIXEL*2, m2
    mova        [r0-1*FDEC_STRIDEB], m4
    PALIGNR     m4, m1, m0, SIZEOF_PIXEL*3, m2
    mova        [r0+1*FDEC_STRIDEB], m4
    PALIGNR     m1, m1, m0, SIZEOF_PIXEL*4, m2
    mova        [r0+3*FDEC_STRIDEB], m1
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_VL_10 w
INIT_XMM ssse3
PREDICT_8x8_VL_10 w
INIT_XMM avx
PREDICT_8x8_VL_10 w
%else
INIT_MMX mmx2
PREDICT_8x8_VL_10 b
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
; Horizontal-down prediction.  %1 = pavg suffix (w/b), %2 = punpck suffix
; (wd/bw).  ssse3 builds the shifted edge vectors with palignr instead of
; unaligned loads.
%macro PREDICT_8x8_HD 2
cglobal predict_8x8_hd, 2,2
    add         r0, 4*FDEC_STRIDEB
    mova        m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
    movu        m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
%ifidn cpuname, ssse3
    mova        m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0
    mova        m4, m2                   ; t7 t6 t5 t4 t3 t2 t1 t0
    palignr     m2, m0, 7*SIZEOF_PIXEL   ; t6 t5 t4 t3 t2 t1 t0 lt
    palignr     m4, m0, 1*SIZEOF_PIXEL   ; t0 lt l0 l1 l2 l3 l4 l5
%else
    movu        m2, [r1+15*SIZEOF_PIXEL]
    movu        m4, [r1+ 9*SIZEOF_PIXEL]
%endif ; cpuflag
    pavg%1      m3, m0, m1
    PRED8x8_LOWPASS m0, m4, m1, m0, m5
    PSRLPIX     m4, m2, 2                ; .. .. t6 t5 t4 t3 t2 t1
    PSRLPIX     m1, m2, 1                ; .. t6 t5 t4 t3 t2 t1 t0
    PRED8x8_LOWPASS m1, m4, m2, m1, m5
                                         ; .. p11 p10 p9
    punpckh%2   m2, m3, m0               ; p8 p7 p6 p5
    punpckl%2   m3, m0                   ; p4 p3 p2 p1
    mova        [r0+3*FDEC_STRIDEB], m3
    PALIGNR     m0, m2, m3, 2*SIZEOF_PIXEL, m5
    mova        [r0+2*FDEC_STRIDEB], m0
    PALIGNR     m0, m2, m3, 4*SIZEOF_PIXEL, m5
    mova        [r0+1*FDEC_STRIDEB], m0
    PALIGNR     m0, m2, m3, 6*SIZEOF_PIXEL, m3
    mova        [r0+0*FDEC_STRIDEB], m0
    mova        [r0-1*FDEC_STRIDEB], m2
    PALIGNR     m0, m1, m2, 2*SIZEOF_PIXEL, m5
    mova        [r0-2*FDEC_STRIDEB], m0
    PALIGNR     m0, m1, m2, 4*SIZEOF_PIXEL, m5
    mova        [r0-3*FDEC_STRIDEB], m0
    PALIGNR     m1, m1, m2, 6*SIZEOF_PIXEL, m2
    mova        [r0-4*FDEC_STRIDEB], m1
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HD w, wd
INIT_XMM ssse3
PREDICT_8x8_HD w, wd
INIT_XMM avx
PREDICT_8x8_HD w, wd
%else
INIT_MMX mmx2
PREDICT_8x8_HD b, bw

;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; 8-bit SSE/AVX variant: the 16 predicted pixels live in one xmm register;
; each row is the previous one shifted right by 2 bytes.
%macro PREDICT_8x8_HD 0
cglobal predict_8x8_hd, 2,2
    add         r0, 4*FDEC_STRIDE
    movu        m1, [r1+7]
    movu        m3, [r1+8]
    movu        m2, [r1+9]
    pavgb       m4, m1, m3
    PRED8x8_LOWPASS m0, m1, m2, m3, m5
    punpcklbw   m4, m0
    movhlps     m0, m4

%assign Y 3
%rep 3
    movq        [r0+(Y)*FDEC_STRIDE], m4
    movq        [r0+(Y-4)*FDEC_STRIDE], m0
    psrldq      m4, 2
    psrldq      m0, 2
%assign Y (Y-1)
%endrep
    movq        [r0+(Y)*FDEC_STRIDE], m4
    movq        [r0+(Y-4)*FDEC_STRIDE], m0
    RET
%endmacro

INIT_XMM sse2
PREDICT_8x8_HD
INIT_XMM avx
PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; Horizontal-up prediction.  The sse2 version reverses the left column with
; MMX shuffles before combining; the ssse3 version uses a single pshufb.
INIT_MMX
cglobal predict_8x8_hu_sse2, 2,2
    add         r0, 4*FDEC_STRIDE
    movq        mm1, [r1+7]     ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw      mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
    movq        mm2, mm0
    psllw       mm0, 8
    psrlw       mm2, 8
    por         mm2, mm0        ; l7 l6 l5 l4 l3 l2 l1 l0
    psllq       mm1, 56         ; l7 .. .. .. .. .. .. ..
    movq        mm3, mm2
    movq        mm4, mm2
    movq        mm5, mm2
    psrlq       mm2, 8
    psrlq       mm3, 16
    por         mm2, mm1        ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw   mm1, mm1
    por         mm3, mm1        ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb       mm4, mm2
    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6

    movq2dq     xmm0, mm4
    movq2dq     xmm1, mm1
    punpcklbw   xmm0, xmm1
    punpckhbw   mm4, mm1
%assign Y -4
%rep 3
    movq        [r0+Y*FDEC_STRIDE], xmm0
    psrldq      xmm0, 2
%assign Y (Y+1)
%endrep
    pshufw      mm5, mm4, q3321
    pshufw      mm6, mm4, q3332
    pshufw      mm7, mm4, q3333
    movq        [r0+Y*FDEC_STRIDE], xmm0
    movq        [r0+0*FDEC_STRIDE], mm4
    movq        [r0+1*FDEC_STRIDE], mm5
    movq        [r0+2*FDEC_STRIDE], mm6
    movq        [r0+3*FDEC_STRIDE], mm7
    RET

INIT_XMM
cglobal predict_8x8_hu_ssse3, 2,2
    add         r0, 4*FDEC_STRIDE
    movq        m3, [r1+7]
    pshufb      m3, [shuf_hu]   ; reverse + pad the left column
    psrldq      m1, m3, 1
    psrldq      m2, m3, 2
    pavgb       m0, m1, m3
    PRED8x8_LOWPASS m1, m3, m2, m1, m4
    punpcklbw   m0, m1
%assign Y -4
%rep 3
    movq        [r0+ Y   *FDEC_STRIDE], m0
    movhps      [r0+(Y+4)*FDEC_STRIDE], m0
    psrldq      m0, 2
    pshufhw     m0, m0, q2210
%assign Y (Y+1)
%endrep
    movq        [r0+ Y   *FDEC_STRIDE], m0
    movhps      [r0+(Y+4)*FDEC_STRIDE], m0
    RET
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8c_v( uint8_t *src )
;-----------------------------------------------------------------------------
; Vertical chroma prediction: replicate the row above into all 8 rows.
%macro PREDICT_8x8C_V 0
cglobal predict_8x8c_v, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]
    STORE8      m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x8C_V
%else
INIT_MMX mmx
PREDICT_8x8C_V
%endif
%if HIGH_BIT_DEPTH

; High-bit-depth MMX fallback for predict_8x8c_v: a 16-byte row needs two
; mmx registers; the pointer is advanced every second row so the %rep
; unrolls into pairs of stores.
INIT_MMX
cglobal predict_8x8c_v_mmx, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]
    mova        m1, [r0 - FDEC_STRIDEB + 8]
%assign Y 0
%rep 8
    mova        [r0 + (Y&1)*FDEC_STRIDEB], m0
    mova        [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
%if (Y&1) && (Y!=7)
    add         r0, FDEC_STRIDEB*2
%endif
%assign Y Y+1
%endrep
    RET

%endif
; Vertical chroma prediction for 8x16 blocks: replicate the row above
; into all 16 rows.
%macro PREDICT_8x16C_V 0
cglobal predict_8x16c_v, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]
    STORE16     m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x16C_V
%else
INIT_MMX mmx
PREDICT_8x16C_V
%endif
;-----------------------------------------------------------------------------
; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
; Horizontal chroma prediction for 8x8 and 8x16 blocks, built on
; PRED_H_4ROWS.  ssse3 (but not avx2) preloads pb_3 into m2 for the
; shuffle used by PRED_H_4ROWS.
%macro PREDICT_C_H 0
cglobal predict_8x8c_h, 1,1
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova        m2, [pb_3]
%endif
    PRED_H_4ROWS 8, 1
    PRED_H_4ROWS 8, 0
    RET

cglobal predict_8x16c_h, 1,2
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova        m2, [pb_3]
%endif
    mov         r1d, 4
.loop:
    PRED_H_4ROWS 8, 1
    dec         r1d
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
PREDICT_C_H
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_C_H
INIT_XMM avx2
PREDICT_C_H
%else
INIT_MMX ssse3
PREDICT_C_H
%endif
;-----------------------------------------------------------------------------
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
; Sum four left-neighbour pixels (rows %1-4 .. %1-1 relative to r0) into
; r1d.  Clobbers r2d.
%macro LOAD_LEFT 1
    movzx       r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
    movzx       r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
    add         r1d, r2d
    movzx       r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
    add         r1d, r2d
    movzx       r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
    add         r1d, r2d
%endmacro
; 8x8 chroma DC prediction.  Gathers the four 4-pixel sums s0/s1 (top)
; and s2/s3 (left), combines them into the four quadrant DC values
; (s0+s2, s1, s3, s1+s3 after the shift/avg), then broadcasts each DC
; over its 4x4 quadrant.
%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
    pxor        m7, m7
%if HIGH_BIT_DEPTH
    movq        m0, [r0-FDEC_STRIDEB+0]
    movq        m1, [r0-FDEC_STRIDEB+8]
    HADDW       m0, m2
    HADDW       m1, m2
%else ; !HIGH_BIT_DEPTH
    movd        m0, [r0-FDEC_STRIDEB+0]
    movd        m1, [r0-FDEC_STRIDEB+4]
    psadbw      m0, m7          ; s0
    psadbw      m1, m7          ; s1
%endif
    add         r0, FDEC_STRIDEB*4

    LOAD_LEFT 0                 ; s2
    movd        m2, r1d
    LOAD_LEFT 4                 ; s3
    movd        m3, r1d

    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckldq   m0, m2          ; s0, s1, s2, s3
    pshufw      m3, m0, q3312   ; s2, s1, s3, s3
    pshufw      m0, m0, q1310   ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m7          ; s0+s2, s1, s3, s1+s3
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq     xmm0, m0
    punpcklwd   xmm0, xmm0
    pshufd      xmm1, xmm0, q3322
    punpckldq   xmm0, xmm0
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movdqa      [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
%assign Y Y+1
%endrep
%else ; !sse2
    pshufw      m1, m0, q0000
    pshufw      m2, m0, q1111
    pshufw      m3, m0, q2222
    pshufw      m4, m0, q3333
%assign Y 0
%rep 8
%assign i (1 + (Y/4)*2)
%assign j (2 + (Y/4)*2)
    movq        [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
    movq        [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
%assign Y Y+1
%endrep
%endif
%else ; !HIGH_BIT_DEPTH
    packuswb    m0, m0
    punpcklbw   m0, m0
    movq        m1, m0
    punpcklbw   m0, m0          ; top half DCs
    punpckhbw   m1, m1          ; bottom half DCs
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movq        [r0+FDEC_STRIDEB*(Y-4)], m %+ i
%assign Y Y+1
%endrep
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x8C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x8C_DC
%endif
%if HIGH_BIT_DEPTH
; Store one register (or an mmx register pair) to four consecutive rows
; starting at row %3-4 relative to r0.  sse2 uses a single 16-byte store
; per row; pre-sse2 uses two 8-byte halves (%1 low, %2 high).
%macro STORE_4LINES 3
%if cpuflag(sse2)
    movdqa      [r0+FDEC_STRIDEB*(%3-4)], %1
    movdqa      [r0+FDEC_STRIDEB*(%3-3)], %1
    movdqa      [r0+FDEC_STRIDEB*(%3-2)], %1
    movdqa      [r0+FDEC_STRIDEB*(%3-1)], %1
%else
    movq        [r0+FDEC_STRIDEB*(%3-4)+0], %1
    movq        [r0+FDEC_STRIDEB*(%3-4)+8], %2
    movq        [r0+FDEC_STRIDEB*(%3-3)+0], %1
    movq        [r0+FDEC_STRIDEB*(%3-3)+8], %2
    movq        [r0+FDEC_STRIDEB*(%3-2)+0], %1
    movq        [r0+FDEC_STRIDEB*(%3-2)+8], %2
    movq        [r0+FDEC_STRIDEB*(%3-1)+0], %1
    movq        [r0+FDEC_STRIDEB*(%3-1)+8], %2
%endif
%endmacro
%else
; 8-bit variant: one 8-byte store per row, four rows starting at %2-4.
%macro STORE_4LINES 2
    movq        [r0+FDEC_STRIDEB*(%2-4)], %1
    movq        [r0+FDEC_STRIDEB*(%2-3)], %1
    movq        [r0+FDEC_STRIDEB*(%2-2)], %1
    movq        [r0+FDEC_STRIDEB*(%2-1)], %1
%endmacro
%endif
; 8x16 chroma DC prediction.  Like predict_8x8c_dc, but with six input
; sums: s0/s1 from the top row, s2..s5 from four groups of four left
; pixels; produces eight 4x4-quadrant DC values.
%macro PREDICT_8x16C_DC 0
cglobal predict_8x16c_dc, 1,3
    pxor        m7, m7
%if HIGH_BIT_DEPTH
    movq        m0, [r0-FDEC_STRIDEB+0]
    movq        m1, [r0-FDEC_STRIDEB+8]
    HADDW       m0, m2
    HADDW       m1, m2
%else
    movd        m0, [r0-FDEC_STRIDEB+0]
    movd        m1, [r0-FDEC_STRIDEB+4]
    psadbw      m0, m7          ; s0
    psadbw      m1, m7          ; s1
%endif
    punpcklwd   m0, m1          ; s0, s1

    add         r0, FDEC_STRIDEB*4
    LOAD_LEFT 0                 ; s2
    pinsrw      m0, r1d, 2
    LOAD_LEFT 4                 ; s3
    pinsrw      m0, r1d, 3      ; s0, s1, s2, s3
    add         r0, FDEC_STRIDEB*8
    LOAD_LEFT 0                 ; s4
    pinsrw      m1, r1d, 2
    LOAD_LEFT 4                 ; s5
    pinsrw      m1, r1d, 3      ; s1, __, s4, s5
    sub         r0, FDEC_STRIDEB*8

    pshufw      m2, m0, q1310   ; s0, s1, s3, s1
    pshufw      m0, m0, q3312   ; s2, s1, s3, s3
    pshufw      m3, m1, q0302   ; s4, s1, s5, s1
    pshufw      m1, m1, q3322   ; s4, s4, s5, s5
    paddw       m0, m2
    paddw       m1, m3
    psrlw       m0, 2
    psrlw       m1, 2
    pavgw       m0, m7
    pavgw       m1, m7
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq     xmm0, m0
    movq2dq     xmm1, m1
    punpcklwd   xmm0, xmm0
    punpcklwd   xmm1, xmm1
    pshufd      xmm2, xmm0, q3322
    pshufd      xmm3, xmm1, q3322
    punpckldq   xmm0, xmm0
    punpckldq   xmm1, xmm1
    STORE_4LINES xmm0, xmm0, 0
    STORE_4LINES xmm2, xmm2, 4
    STORE_4LINES xmm1, xmm1, 8
    STORE_4LINES xmm3, xmm3, 12
%else
    pshufw      m2, m0, q0000
    pshufw      m3, m0, q1111
    pshufw      m4, m0, q2222
    pshufw      m5, m0, q3333
    STORE_4LINES m2, m3, 0
    STORE_4LINES m4, m5, 4
    pshufw      m2, m1, q0000
    pshufw      m3, m1, q1111
    pshufw      m4, m1, q2222
    pshufw      m5, m1, q3333
    STORE_4LINES m2, m3, 8
    STORE_4LINES m4, m5, 12
%endif
%else
    packuswb    m0, m0          ; dc0, dc1, dc2, dc3
    packuswb    m1, m1          ; dc4, dc5, dc6, dc7
    punpcklbw   m0, m0
    punpcklbw   m1, m1
    pshufw      m2, m0, q1100
    pshufw      m3, m0, q3322
    pshufw      m4, m1, q1100
    pshufw      m5, m1, q3322
    STORE_4LINES m2, 0
    STORE_4LINES m3, 4
    add         r0, FDEC_STRIDEB*8
    STORE_4LINES m4, 0
    STORE_4LINES m5, 4
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x16C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x16C_DC
%endif
; DC-top chroma prediction (top neighbours only) for 8x%1 blocks
; (%1 = 8 or 16).  Computes the left/right 4-pixel DC averages of the
; row above and fills the block via STORE8/STORE16.
%macro PREDICT_C_DC_TOP 1
%if HIGH_BIT_DEPTH
INIT_XMM
cglobal predict_8x%1c_dc_top_sse2, 1,1
    pxor        m2, m2
    mova        m0, [r0 - FDEC_STRIDEB]
    pshufd      m1, m0, q2301
    paddw       m0, m1
    pshuflw     m1, m0, q2301
    pshufhw     m1, m1, q2301
    paddw       m0, m1
    psrlw       m0, 1
    pavgw       m0, m2          ; (sum + 2) >> 2 via avg-with-zero
    STORE%1     m0
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x%1c_dc_top_mmx2, 1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpckhbw   mm1, mm0
    punpcklbw   mm0, mm2
    psadbw      mm1, mm2        ; s1
    psadbw      mm0, mm2        ; s0
    psrlw       mm1, 1
    psrlw       mm0, 1
    pavgw       mm1, mm2
    pavgw       mm0, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0     ; dc0 (w)
    packuswb    mm0, mm1        ; dc0,dc1 (b)
    STORE%1     mm0
    RET
%endif
%endmacro

PREDICT_C_DC_TOP 8
PREDICT_C_DC_TOP 16
;-----------------------------------------------------------------------------
; void predict_16x16_v( pixel *src )
;-----------------------------------------------------------------------------
; Vertical luma prediction: load the row above in as many registers as the
; vector width requires (4 for MMX, 2 for XMM high-bit-depth, 1 otherwise)
; and replicate it into all 16 rows via STORE16.
%macro PREDICT_16x16_V 0
cglobal predict_16x16_v, 1,2
%assign %%i 0
%rep 16*SIZEOF_PIXEL/mmsize
    mova        m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize]
%assign %%i %%i+1
%endrep
%if 16*SIZEOF_PIXEL/mmsize == 4
    STORE16     m0, m1, m2, m3
%elif 16*SIZEOF_PIXEL/mmsize == 2
    STORE16     m0, m1
%else
    STORE16     m0
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_16x16_V
INIT_XMM sse
PREDICT_16x16_V
%if HIGH_BIT_DEPTH
INIT_YMM avx
PREDICT_16x16_V
%endif
;-----------------------------------------------------------------------------
; void predict_16x16_h( pixel *src )
;-----------------------------------------------------------------------------
; Horizontal luma prediction, four rows per PRED_H_4ROWS invocation.
; ssse3 (but not avx2) preloads pb_3 into m2 for the shuffle used inside
; PRED_H_4ROWS.
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova        m2, [pb_3]
%endif
    mov         r1d, 4
.loop:
    PRED_H_4ROWS 16, 1
    dec         r1d
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
PREDICT_16x16_H
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_16x16_H
INIT_YMM avx2
PREDICT_16x16_H
%else
;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
INIT_XMM ssse3
PREDICT_16x16_H
%endif
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core( pixel *src, int i_dc_left )
;-----------------------------------------------------------------------------
; MMX DC prediction helper: sum the 16 top neighbours, add %1 (the
; caller-provided left sum / rounding term), shift right by %2, splat the
; resulting DC over all 16 rows.
%macro PRED16x16_DC_MMX 2
%if HIGH_BIT_DEPTH
    mova        m0, [r0 - FDEC_STRIDEB+ 0]
    paddw       m0, [r0 - FDEC_STRIDEB+ 8]
    paddw       m0, [r0 - FDEC_STRIDEB+16]
    paddw       m0, [r0 - FDEC_STRIDEB+24]
    HADDW       m0, m1
    paddw       m0, %1
    psrlw       m0, %2
    SPLATW      m0, m0
    STORE16     m0, m0, m0, m0
%else ; !HIGH_BIT_DEPTH
    pxor        m0, m0
    pxor        m1, m1
    psadbw      m0, [r0 - FDEC_STRIDE]
    psadbw      m1, [r0 - FDEC_STRIDE + 8]
    paddusw     m0, m1
    paddusw     m0, %1
    psrlw       m0, %2          ; dc
    pshufw      m0, m0, 0
    packuswb    m0, m0          ; dc in bytes
    STORE16     m0, m0
%endif
%endmacro

INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
%if ARCH_X86_64
    movd        m6, r1d
    PRED16x16_DC_MMX m6, 5
%else
    PRED16x16_DC_MMX r1m, 5
%endif
    RET

INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC_MMX [pw_8], 4
    RET

INIT_MMX mmx2
%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
    movd        m0, r1m
    SPLATW      m0, m0
    STORE16     m0, m0, m0, m0
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
    movd        m0, r1m
    pshufw      m0, m0, 0
    packuswb    m0, m0
    STORE16     m0, m0
    RET
%endif
; SSE2/AVX2 DC prediction helper: sum the 16 top neighbours, add %1,
; shift right by %2, splat the DC over all 16 rows via STORE16.
%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
    mova       xm0, [r0 - FDEC_STRIDEB+ 0]
    paddw      xm0, [r0 - FDEC_STRIDEB+16]
    HADDW      xm0, xm2
    paddw      xm0, %1
    psrlw      xm0, %2
    SPLATW      m0, xm0
%if mmsize == 32
    STORE16     m0
%else
    STORE16     m0, m0
%endif
%else ; !HIGH_BIT_DEPTH
    pxor        m0, m0
    psadbw      m0, [r0 - FDEC_STRIDE]
    MOVHL       m1, m0
    paddw       m0, m1
    paddusw     m0, %1
    psrlw       m0, %2          ; dc
    SPLATW      m0, m0
    packuswb    m0, m0          ; dc in bytes
    STORE16     m0
%endif
%endmacro
; Instantiate the SSE2/AVX2 16x16 DC predictors: dc_core (top + caller's
; left sum from r1m), dc_top (top only, pw_8 rounding) and dc_left_core
; (caller's left DC splatted over the block).
%macro PREDICT_16x16_DC_CORE 0
cglobal predict_16x16_dc_core, 2,2,4
    movd       xm3, r1m
    PRED16x16_DC xm3, 5
    RET

cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC [pw_8], 4
    RET

cglobal predict_16x16_dc_left_core, 1,2
    movd       xm0, r1m
    SPLATW      m0, xm0
%if HIGH_BIT_DEPTH && mmsize == 16
    STORE16     m0, m0
%else
%if HIGH_BIT_DEPTH == 0
    packuswb    m0, m0
%endif
    STORE16     m0
%endif
    RET
%endmacro

INIT_XMM sse2
PREDICT_16x16_DC_CORE
%if HIGH_BIT_DEPTH
INIT_YMM avx2
PREDICT_16x16_DC_CORE
%else
INIT_XMM avx2
PREDICT_16x16_DC_CORE
%endif