Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
52868 views
1
;******************************************************************************
2
;* x86-optimized horizontal line scaling functions
3
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
4
;*
5
;* This file is part of FFmpeg.
6
;*
7
;* FFmpeg is free software; you can redistribute it and/or
8
;* modify it under the terms of the GNU Lesser General Public
9
;* License as published by the Free Software Foundation; either
10
;* version 2.1 of the License, or (at your option) any later version.
11
;*
12
;* FFmpeg is distributed in the hope that it will be useful,
13
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
;* Lesser General Public License for more details.
16
;*
17
;* You should have received a copy of the GNU Lesser General Public
18
;* License along with FFmpeg; if not, write to the Free Software
19
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
;******************************************************************************
21
22
%include "libavutil/x86/x86util.asm"
23
24
SECTION_RODATA

; Constants loaded into vector registers by SCALE_FUNC below.
; 0x7ffff == 524287 == (1 << 19) - 1: upper clip bound for the 19-bit output,
; as packed dwords (used with pminsd / PMINSD_MMX) ...
max_19bit_int: times 4 dd 0x7ffff
; ... and as packed floats (used with minps where pminsd is not available)
max_19bit_flt: times 4 dd 524287.0
; subtracted from 16-bit input words so pmaddwd sees them as signed values
minshort:      times 8 dw 0x8000
; added back after the multiply-add for 16-bit input: "0x8000 * sum(coeffs)";
; 0x20000000 == 0x8000 << 14, i.e. this assumes the coefficients sum to 1 << 14
unicoeff:      times 4 dd 0x20000000

SECTION .text
32
33
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8 bits or 16 bits wide
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we
; have to downscale before multiplying). The filter is 14 bits. Output is either
; 15 bits (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits.
; Each output pixel is generated from $filterSize input pixels; the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
49
50
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
;
; Emits one hscale<%1>to<%2>_<%4> function for the current instruction set.
;   %1 source_width       bits per input sample (8 is read as bytes, anything
;                         larger as 16-bit words)
;   %2 intermediate_nbits 15 (stored as int16_t) or 19 (stored as int32_t)
;   %3 filtersize         4, 8, or X for a run-time filter size
;   %4 filtersuffix       name suffix: 4, 8, X, X4 or X8; X4 selects the variant
;                         that handles a trailing group of 4 taps separately
;   %5 n_args             number of arguments passed on to cglobal
;   %6 n_xmm              number of vector registers passed on to cglobal
;
; Constant registers set up before the loop:
;   m2 = clip maximum            (only when %2 == 19)
;   m3 = zero, for byte -> word  (only when %1 == 8)
;   m6 = minshort, m7 = unicoeff (only when %1 == 16)
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd        wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elif cpuflag(sse4)
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

    ; movlh loads 4 source samples, movbh loads mmsize/2 source samples;
    ; srcmul is the size of one source sample in bytes
%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop: point filter/dst/filterPos at their ends and run a negative
    ; index up towards zero
%if %3 == 8
    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea      filterq, [filterq+wq*8]
%if %2 == 15
    lea         dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea         dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea      fltposq, [fltposq+wq*(4>>wshr)]
    neg           wq

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32      pos0q, dword [fltposq+wq*4+ 0]   ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+ 4]   ; filterPos[1]
    movlh         m0, [srcq+pos0q*srcmul]       ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32      pos0q, dword [fltposq+wq*4+ 8]   ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*4+12]   ; filterPos[3]
    movlh         m1, [srcq+pos0q*srcmul]       ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elif notcpuflag(ssse3) ; sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                        ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                                ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                                ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                                ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32      pos0q, dword [fltposq+wq*2+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*2+4]    ; filterPos[1]
    movbh         m0, [srcq+ pos0q   *srcmul]   ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
%if mmsize == 8
    movbh         m1, [srcq+(pos0q+4)*srcmul]   ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [srcq+(pos1q+4)*srcmul]   ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32      pos0q, dword [fltposq+wq*2+8]    ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*2+12]   ; filterPos[3]
    movbh         m4, [srcq+ pos0q   *srcmul]   ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [srcq+ pos1q   *srcmul]   ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
    punpcklbw     m4, m3                        ; byte -> word
    punpcklbw     m5, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elif notcpuflag(ssse3) ; sse2
    ; scratch register for the transpose: m3 holds zero when %1 == 8 and
    ; m6 holds minshort when %1 == 16, so pick whichever is free
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                        ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                                ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                                ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                                ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

    ; dlt is the number of trailing taps that the X4 variant handles outside
    ; of the inner loop
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd  fltsizeq, fltsized                  ; filterSize
    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
    mov      srcendq, pos0q
%endif ; x86-32/64
    lea      fltposq, [fltposq+wq*4]
%if %2 == 15
    lea         dstq, [dstq+wq*2]
%else ; %2 == 19
    lea         dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn  dstmp, dstq                      ; on x86-32 pos1q aliases dstq, so keep dst in its memory slot
    neg           wq

.loop:
    mov32      pos0q, dword [fltposq+wq*4+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4                        ; accumulator for dstpx[0]
    pxor          m5, m5                        ; accumulator for dstpx[1]
    mov         srcq, srcmemmp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add      filterq, mmsize
    add         srcq, srcmul*mmsize/2
    cmp         srcq, srcendq                   ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd          m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]
%endif ; %4 == X4

    lea      filterq, [filterq+(fltsizeq+dlt)*2]

    ; reduce the two accumulators m4/m5 to one dword per output pixel in m0
%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
    psrad         m0, 14 + %1 - %2              ; source bits + 14 filter bits -> intermediate bits
%ifidn %3, X
    movifnidn   dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elif cpuflag(sse4)
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           wq, (mmsize<<wshr)/4          ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
                                                ; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
    add           wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro
391
392
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
;
; Emits every filter-size variant of the scaler for one (source width,
; intermediate depth) pair: the fixed 4-tap and 8-tap versions, plus the
; run-time filter size version(s). With 8-byte registers (mmsize == 8) there
; is a single "X" variant; otherwise both "X4" and "X8" are generated.
;   %1 source_width, %2 intermediate_nbits, %3 n_xmm: passed through to SCALE_FUNC
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X,  7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
403
404
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
;
; Emits the scalers for every supported source width (8, 9, 10, 12, 14, 16)
; at both intermediate depths for the current instruction set.
;   %1 n_xmm for 8-bit input
;   %2 n_xmm for 9- to 14-bit input
;   %3 n_xmm for 16-bit input
; The 15-bit variants are not generated for sse4: the only sse4-specific code
; in SCALE_FUNC is in the 19-bit (%2 == 19) path.
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
421
422
; Instantiate the scalers for each instruction set. The three arguments are the
; vector register counts for 8-bit, 9- to 14-bit and 16-bit input respectively.
; The MMX versions are only built for 32-bit x86.
%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
; sse2 needs one more register for 8-bit input than ssse3/sse4: its 8-tap
; horizontal add uses m6 as scratch when %1 == 8
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
432
433