;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
24
25
SECTION_RODATA
26
27
minshort: times 8 dw 0x8000
28
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
29
yuv2yuvX_10_start: times 4 dd 0x10000
30
yuv2yuvX_9_start: times 4 dd 0x20000
31
yuv2yuvX_10_upper: times 8 dw 0x3ff
32
yuv2yuvX_9_upper: times 8 dw 0x1ff
33
pd_4: times 4 dd 4
34
pd_4min0x40000:times 4 dd 4 - (0x40000)
35
pw_16: times 8 dw 16
36
pw_32: times 8 dw 32
37
pw_512: times 8 dw 512
38
pw_1024: times 8 dw 1024
39
40
SECTION .text
41
42
;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
%macro yuv2planeX_fn 3
59
60
%if ARCH_X86_32
61
%define cntr_reg fltsizeq
62
%define movsx mov
63
%else
64
%define cntr_reg r7
65
%define movsx movsxd
66
%endif
67
68
cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
69
%if %1 == 8 || %1 == 9 || %1 == 10
70
pxor m6, m6
71
%endif ; %1 == 8/9/10
72
73
%if %1 == 8
74
%if ARCH_X86_32
75
%assign pad 0x2c - (stack_offset & 15)
76
SUB rsp, pad
77
%define m_dith m7
78
%else ; x86-64
79
%define m_dith m9
80
%endif ; x86-32
81
82
; create registers holding dither
83
movq m_dith, [ditherq] ; dither
84
test offsetd, offsetd
85
jz .no_rot
86
%if mmsize == 16
87
punpcklqdq m_dith, m_dith
88
%endif ; mmsize == 16
89
PALIGNR m_dith, m_dith, 3, m0
90
.no_rot:
91
%if mmsize == 16
92
punpcklbw m_dith, m6
93
%if ARCH_X86_64
94
punpcklwd m8, m_dith, m6
95
pslld m8, 12
96
%else ; x86-32
97
punpcklwd m5, m_dith, m6
98
pslld m5, 12
99
%endif ; x86-32/64
100
punpckhwd m_dith, m6
101
pslld m_dith, 12
102
%if ARCH_X86_32
103
mova [rsp+ 0], m5
104
mova [rsp+16], m_dith
105
%endif
106
%else ; mmsize == 8
107
punpcklbw m5, m_dith, m6
108
punpckhbw m_dith, m6
109
punpcklwd m4, m5, m6
110
punpckhwd m5, m6
111
punpcklwd m3, m_dith, m6
112
punpckhwd m_dith, m6
113
pslld m4, 12
114
pslld m5, 12
115
pslld m3, 12
116
pslld m_dith, 12
117
mova [rsp+ 0], m4
118
mova [rsp+ 8], m5
119
mova [rsp+16], m3
120
mova [rsp+24], m_dith
121
%endif ; mmsize == 8/16
122
%endif ; %1 == 8
123
124
xor r5, r5
125
126
.pixelloop:
127
%assign %%i 0
128
; the rep here is for the 8bit output mmx case, where dither covers
129
; 8 pixels but we can only handle 2 pixels per register, and thus 4
130
; pixels per iteration. In order to not have to keep track of where
131
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
132
%if %1 == 8
133
%assign %%repcnt 16/mmsize
134
%else
135
%assign %%repcnt 1
136
%endif
137
138
%rep %%repcnt
139
140
%if %1 == 8
141
%if ARCH_X86_32
142
mova m2, [rsp+mmsize*(0+%%i)]
143
mova m1, [rsp+mmsize*(1+%%i)]
144
%else ; x86-64
145
mova m2, m8
146
mova m1, m_dith
147
%endif ; x86-32/64
148
%else ; %1 == 9/10/16
149
mova m1, [yuv2yuvX_%1_start]
150
mova m2, m1
151
%endif ; %1 == 8/9/10/16
152
movsx cntr_reg, fltsizem
153
.filterloop_ %+ %%i:
154
; input pixels
155
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
156
%if %1 == 16
157
mova m3, [r6+r5*4]
158
mova m5, [r6+r5*4+mmsize]
159
%else ; %1 == 8/9/10
160
mova m3, [r6+r5*2]
161
%endif ; %1 == 8/9/10/16
162
mov r6, [srcq+gprsize*cntr_reg-gprsize]
163
%if %1 == 16
164
mova m4, [r6+r5*4]
165
mova m6, [r6+r5*4+mmsize]
166
%else ; %1 == 8/9/10
167
mova m4, [r6+r5*2]
168
%endif ; %1 == 8/9/10/16
169
170
; coefficients
171
movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
172
%if %1 == 16
173
pshuflw m7, m0, 0 ; coeff[0]
174
pshuflw m0, m0, 0x55 ; coeff[1]
175
pmovsxwd m7, m7 ; word -> dword
176
pmovsxwd m0, m0 ; word -> dword
177
178
pmulld m3, m7
179
pmulld m5, m7
180
pmulld m4, m0
181
pmulld m6, m0
182
183
paddd m2, m3
184
paddd m1, m5
185
paddd m2, m4
186
paddd m1, m6
187
%else ; %1 == 10/9/8
188
punpcklwd m5, m3, m4
189
punpckhwd m3, m4
190
SPLATD m0
191
192
pmaddwd m5, m0
193
pmaddwd m3, m0
194
195
paddd m2, m5
196
paddd m1, m3
197
%endif ; %1 == 8/9/10/16
198
199
sub cntr_reg, 2
200
jg .filterloop_ %+ %%i
201
202
%if %1 == 16
203
psrad m2, 31 - %1
204
psrad m1, 31 - %1
205
%else ; %1 == 10/9/8
206
psrad m2, 27 - %1
207
psrad m1, 27 - %1
208
%endif ; %1 == 8/9/10/16
209
210
%if %1 == 8
211
packssdw m2, m1
212
packuswb m2, m2
213
movh [dstq+r5*1], m2
214
%else ; %1 == 9/10/16
215
%if %1 == 16
216
packssdw m2, m1
217
paddw m2, [minshort]
218
%else ; %1 == 9/10
219
%if cpuflag(sse4)
220
packusdw m2, m1
221
%else ; mmxext/sse2
222
packssdw m2, m1
223
pmaxsw m2, m6
224
%endif ; mmxext/sse2/sse4/avx
225
pminsw m2, [yuv2yuvX_%1_upper]
226
%endif ; %1 == 9/10/16
227
mova [dstq+r5*2], m2
228
%endif ; %1 == 8/9/10/16
229
230
add r5, mmsize/2
231
sub wd, mmsize/2
232
233
%assign %%i %%i+2
234
%endrep
235
jg .pixelloop
236
237
%if %1 == 8
238
%if ARCH_X86_32
239
ADD rsp, pad
240
RET
241
%else ; x86-64
242
REP_RET
243
%endif ; x86-32/64
244
%else ; %1 == 9/10/16
245
REP_RET
246
%endif ; %1 == 8/9/10/16
247
%endmacro
248
249
%if ARCH_X86_32
250
INIT_MMX mmxext
251
yuv2planeX_fn 8, 0, 7
252
yuv2planeX_fn 9, 0, 5
253
yuv2planeX_fn 10, 0, 5
254
%endif
255
256
INIT_XMM sse2
257
yuv2planeX_fn 8, 10, 7
258
yuv2planeX_fn 9, 7, 5
259
yuv2planeX_fn 10, 7, 5
260
261
INIT_XMM sse4
262
yuv2planeX_fn 8, 10, 7
263
yuv2planeX_fn 9, 7, 5
264
yuv2planeX_fn 10, 7, 5
265
yuv2planeX_fn 16, 8, 5
266
267
%if HAVE_AVX_EXTERNAL
268
INIT_XMM avx
269
yuv2planeX_fn 8, 10, 7
270
yuv2planeX_fn 9, 7, 5
271
yuv2planeX_fn 10, 7, 5
272
%endif
273
274
; %1=outout-bpc, %2=alignment (u/a)
275
%macro yuv2plane1_mainloop 2
276
.loop_%2:
277
%if %1 == 8
278
paddsw m0, m2, [srcq+wq*2+mmsize*0]
279
paddsw m1, m3, [srcq+wq*2+mmsize*1]
280
psraw m0, 7
281
psraw m1, 7
282
packuswb m0, m1
283
mov%2 [dstq+wq], m0
284
%elif %1 == 16
285
paddd m0, m4, [srcq+wq*4+mmsize*0]
286
paddd m1, m4, [srcq+wq*4+mmsize*1]
287
paddd m2, m4, [srcq+wq*4+mmsize*2]
288
paddd m3, m4, [srcq+wq*4+mmsize*3]
289
psrad m0, 3
290
psrad m1, 3
291
psrad m2, 3
292
psrad m3, 3
293
%if cpuflag(sse4) ; avx/sse4
294
packusdw m0, m1
295
packusdw m2, m3
296
%else ; mmx/sse2
297
packssdw m0, m1
298
packssdw m2, m3
299
paddw m0, m5
300
paddw m2, m5
301
%endif ; mmx/sse2/sse4/avx
302
mov%2 [dstq+wq*2+mmsize*0], m0
303
mov%2 [dstq+wq*2+mmsize*1], m2
304
%else ; %1 == 9/10
305
paddsw m0, m2, [srcq+wq*2+mmsize*0]
306
paddsw m1, m2, [srcq+wq*2+mmsize*1]
307
psraw m0, 15 - %1
308
psraw m1, 15 - %1
309
pmaxsw m0, m4
310
pmaxsw m1, m4
311
pminsw m0, m3
312
pminsw m1, m3
313
mov%2 [dstq+wq*2+mmsize*0], m0
314
mov%2 [dstq+wq*2+mmsize*1], m1
315
%endif
316
add wq, mmsize
317
jl .loop_%2
318
%endmacro
319
320
%macro yuv2plane1_fn 3
321
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
322
movsxdifnidn wq, wd
323
add wq, mmsize - 1
324
and wq, ~(mmsize - 1)
325
%if %1 == 8
326
add dstq, wq
327
%else ; %1 != 8
328
lea dstq, [dstq+wq*2]
329
%endif ; %1 == 8
330
%if %1 == 16
331
lea srcq, [srcq+wq*4]
332
%else ; %1 != 16
333
lea srcq, [srcq+wq*2]
334
%endif ; %1 == 16
335
neg wq
336
337
%if %1 == 8
338
pxor m4, m4 ; zero
339
340
; create registers holding dither
341
movq m3, [ditherq] ; dither
342
test offsetd, offsetd
343
jz .no_rot
344
%if mmsize == 16
345
punpcklqdq m3, m3
346
%endif ; mmsize == 16
347
PALIGNR m3, m3, 3, m2
348
.no_rot:
349
%if mmsize == 8
350
mova m2, m3
351
punpckhbw m3, m4 ; byte->word
352
punpcklbw m2, m4 ; byte->word
353
%else
354
punpcklbw m3, m4
355
mova m2, m3
356
%endif
357
%elif %1 == 9
358
pxor m4, m4
359
mova m3, [pw_512]
360
mova m2, [pw_32]
361
%elif %1 == 10
362
pxor m4, m4
363
mova m3, [pw_1024]
364
mova m2, [pw_16]
365
%else ; %1 == 16
366
%if cpuflag(sse4) ; sse4/avx
367
mova m4, [pd_4]
368
%else ; mmx/sse2
369
mova m4, [pd_4min0x40000]
370
mova m5, [minshort]
371
%endif ; mmx/sse2/sse4/avx
372
%endif ; %1 == ..
373
374
; actual pixel scaling
375
%if mmsize == 8
376
yuv2plane1_mainloop %1, a
377
%else ; mmsize == 16
378
test dstq, 15
379
jnz .unaligned
380
yuv2plane1_mainloop %1, a
381
REP_RET
382
.unaligned:
383
yuv2plane1_mainloop %1, u
384
%endif ; mmsize == 8/16
385
REP_RET
386
%endmacro
387
388
%if ARCH_X86_32
389
INIT_MMX mmx
390
yuv2plane1_fn 8, 0, 5
391
yuv2plane1_fn 16, 0, 3
392
393
INIT_MMX mmxext
394
yuv2plane1_fn 9, 0, 3
395
yuv2plane1_fn 10, 0, 3
396
%endif
397
398
INIT_XMM sse2
399
yuv2plane1_fn 8, 5, 5
400
yuv2plane1_fn 9, 5, 3
401
yuv2plane1_fn 10, 5, 3
402
yuv2plane1_fn 16, 6, 3
403
404
INIT_XMM sse4
405
yuv2plane1_fn 16, 5, 3
406
407
%if HAVE_AVX_EXTERNAL
408
INIT_XMM avx
409
yuv2plane1_fn 8, 5, 5
410
yuv2plane1_fn 9, 5, 3
411
yuv2plane1_fn 10, 5, 3
412
yuv2plane1_fn 16, 5, 3
413
%endif
414
415