;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_RODATA 32

pf_s32_inv_scale: times 8 dd 0x30000000
pf_s32_scale: times 8 dd 0x4f000000
pf_s32_clip: times 8 dd 0x4effffff
pf_s16_inv_scale: times 4 dd 0x38000000
pf_s16_scale: times 4 dd 0x47000000
pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
pb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
pw_zero_even: times 4 dw 0x0000, 0xffff
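
; Decoded values of the float scale factors above (IEEE-754 single-precision
; bit patterns; noted here for reference): 0x30000000 = 2^-31,
; 0x4f000000 = 2^31, 0x4effffff = 2147483520.0 (the largest float strictly
; below 2^31, used as a clamp before cvtps2dq), 0x38000000 = 2^-15 = 1/32768,
; 0x47000000 = 2^15 = 32768.0.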

SECTION .text

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
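;
; Roughly equivalent scalar C (a reference sketch, not the C fallback that
; libavresample actually uses): each s16 sample is widened into the high
; half of an s32:
;     for (i = 0; i < len; i++)
;         dst[i] = (int32_t)src[i] << 16;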

INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea lenq, [2*lend]
    lea dstq, [dstq+2*lenq]
    add srcq, lenq
    neg lenq
.loop:
    mova m2, [srcq+lenq]
    pxor m0, m0
    pxor m1, m1
    punpcklwd m0, m2
    punpckhwd m1, m2
    mova [dstq+2*lenq ], m0
    mova [dstq+2*lenq+mmsize], m1
    add lenq, mmsize
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
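;
; Roughly equivalent scalar C (a sketch; pf_s16_inv_scale is the float 2^-15):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] / 32768.0f;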

%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea lenq, [2*lend]
    add srcq, lenq
    lea dstq, [dstq + 2*lenq]
    neg lenq
    mova m2, [pf_s16_inv_scale]
    ALIGN 16
.loop:
    mova m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    mulps m0, m2
    mulps m1, m2
    mova [dstq+2*lenq ], m0
    mova [dstq+2*lenq+mmsize], m1
    add lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT

;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
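;
; Roughly equivalent scalar C (a sketch; keeps the high 16 bits of each
; sample):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] >> 16;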

%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea lenq, [2*lend]
    lea srcq, [srcq+2*lenq]
    add dstq, lenq
    neg lenq
.loop:
    mova m0, [srcq+2*lenq ]
    mova m1, [srcq+2*lenq+ mmsize]
    mova m2, [srcq+2*lenq+2*mmsize]
    mova m3, [srcq+2*lenq+3*mmsize]
    psrad m0, 16
    psrad m1, 16
    psrad m2, 16
    psrad m3, 16
    packssdw m0, m1
    packssdw m2, m3
    mova [dstq+lenq ], m0
    mova [dstq+lenq+mmsize], m2
    add lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16

;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
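;
; Roughly equivalent scalar C (a sketch; pf_s32_inv_scale is the float 2^-31):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 2147483648.0f);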

%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea lenq, [4*lend]
    add srcq, lenq
    add dstq, lenq
    neg lenq
    mova m0, [pf_s32_inv_scale]
    ALIGN 16
.loop:
    cvtdq2ps m1, [srcq+lenq ]
    cvtdq2ps m2, [srcq+lenq+mmsize]
    mulps m1, m1, m0
    mulps m2, m2, m0
    mova [dstq+lenq ], m1
    mova [dstq+lenq+mmsize], m2
    add lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
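;
; Roughly equivalent scalar C (a sketch; cvtps2dq rounds to nearest and
; packssdw saturates, expressed here with C99 lrintf() and libavutil's
; av_clip_int16()):
;     for (i = 0; i < len; i++)
;         dst[i] = av_clip_int16(lrintf(src[i] * 32768.0f));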

INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea lenq, [2*lend]
    lea srcq, [srcq+2*lenq]
    add dstq, lenq
    neg lenq
    mova m4, [pf_s16_scale]
.loop:
    mova m0, [srcq+2*lenq ]
    mova m1, [srcq+2*lenq+1*mmsize]
    mova m2, [srcq+2*lenq+2*mmsize]
    mova m3, [srcq+2*lenq+3*mmsize]
    mulps m0, m4
    mulps m1, m4
    mulps m2, m4
    mulps m3, m4
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    packssdw m0, m1
    packssdw m2, m3
    mova [dstq+lenq ], m0
    mova [dstq+lenq+mmsize], m2
    add lenq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
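;
; Roughly equivalent scalar C (a sketch): scale by 2^31 and clamp with
; pf_s32_clip, the largest float below 2^31, because INT32_MAX itself is not
; exactly representable in single precision:
;     for (i = 0; i < len; i++)
;         dst[i] = (int32_t)fminf(src[i] * 2147483648.0f, 2147483520.0f);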

%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea lenq, [lend*4]
    add srcq, lenq
    add dstq, lenq
    neg lenq
    mova m4, [pf_s32_scale]
    mova m5, [pf_s32_clip]
.loop:
    mulps m0, m4, [srcq+lenq ]
    mulps m1, m4, [srcq+lenq+1*mmsize]
    mulps m2, m4, [srcq+lenq+2*mmsize]
    mulps m3, m4, [srcq+lenq+3*mmsize]
    minps m0, m0, m5
    minps m1, m1, m5
    minps m2, m2, m5
    minps m3, m3, m5
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    mova [dstq+lenq ], m0
    mova [dstq+lenq+1*mmsize], m1
    mova [dstq+lenq+2*mmsize], m2
    mova [dstq+lenq+3*mmsize], m3
    add lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
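;
; Roughly equivalent scalar C (a sketch; interleaves two planar channels):
;     for (i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i];
;         dst[2*i+1] = src[1][i];
;     }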

%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov src1q, [src0q+gprsize]
    mov src0q, [src0q ]
    lea lenq, [2*lend]
    add src0q, lenq
    add src1q, lenq
    lea dstq, [dstq+2*lenq]
    neg lenq
.loop:
    mova m0, [src0q+lenq ]
    mova m1, [src1q+lenq ]
    mova m2, [src0q+lenq+mmsize]
    mova m3, [src1q+lenq+mmsize]
    SBUTTERFLY2 wd, 0, 1, 4
    SBUTTERFLY2 wd, 2, 3, 4
    mova [dstq+2*lenq+0*mmsize], m0
    mova [dstq+2*lenq+1*mmsize], m1
    mova [dstq+2*lenq+2*mmsize], m2
    mova [dstq+2*lenq+3*mmsize], m3
    add lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------
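;
; The 6-channel planar-to-interleaved functions below all follow the same
; pattern; roughly equivalent scalar C (a sketch, with per-format scaling
; added where applicable):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i];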

%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov src1q, [src0q+1*gprsize]
    mov src2q, [src0q+2*gprsize]
    mov src3q, [src0q+3*gprsize]
    mov src4q, [src0q+4*gprsize]
    mov src5q, [src0q+5*gprsize]
    mov src0q, [src0q]
    sub src1q, src0q
    sub src2q, src0q
    sub src3q, src0q
    sub src4q, src0q
    sub src5q, src0q
.loop:
%if cpuflag(sse2slow)
    movq m0, [src0q ]             ; m0 = 0, 6, 12, 18, x, x, x, x
    movq m1, [src0q+src1q]        ; m1 = 1, 7, 13, 19, x, x, x, x
    movq m2, [src0q+src2q]        ; m2 = 2, 8, 14, 20, x, x, x, x
    movq m3, [src0q+src3q]        ; m3 = 3, 9, 15, 21, x, x, x, x
    movq m4, [src0q+src4q]        ; m4 = 4, 10, 16, 22, x, x, x, x
    movq m5, [src0q+src5q]        ; m5 = 5, 11, 17, 23, x, x, x, x
    ; unpack words:
    punpcklwd m0, m1              ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd m2, m3              ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    punpcklwd m4, m5              ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps m1, m0, m2, q2020      ; m1 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m0, m4, q2031          ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps m2, m4, q3131          ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords
    pshufd m0, m0, q1302          ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd m1, m1, q3120          ; m1 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd m2, m2, q3120          ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    movq [dstq+0*mmsize/2], m1
    movq [dstq+1*mmsize/2], m0
    movq [dstq+2*mmsize/2], m2
    movhps [dstq+3*mmsize/2], m1
    movhps [dstq+4*mmsize/2], m0
    movhps [dstq+5*mmsize/2], m2
    add src0q, mmsize/2
    add dstq, mmsize*3
    sub lend, mmsize/4
%else
    mova m0, [src0q ]             ; m0 = 0, 6, 12, 18, 24, 30, 36, 42
    mova m1, [src0q+src1q]        ; m1 = 1, 7, 13, 19, 25, 31, 37, 43
    mova m2, [src0q+src2q]        ; m2 = 2, 8, 14, 20, 26, 32, 38, 44
    mova m3, [src0q+src3q]        ; m3 = 3, 9, 15, 21, 27, 33, 39, 45
    mova m4, [src0q+src4q]        ; m4 = 4, 10, 16, 22, 28, 34, 40, 46
    mova m5, [src0q+src5q]        ; m5 = 5, 11, 17, 23, 29, 35, 41, 47
    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6       ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
                                  ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6       ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
                                  ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6       ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
                                  ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
    ; blend dwords
    shufps m6, m0, m2, q2020      ; m6 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m0, m4, q2031          ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps m2, m4, q3131          ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                      ; m4 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m6, m1, m3, q2020      ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps m1, m5, q2031          ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps m3, m5, q3131          ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                      ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
    ; shuffle dwords
    pshufd m0, m0, q1302          ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd m2, m2, q3120          ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pshufd m4, m4, q3120          ; m4 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd m1, m1, q1302          ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd m3, m3, q3120          ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd m5, m5, q3120          ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
    ; shuffle qwords
    punpcklqdq m6, m4, m0         ; m6 = 0, 1, 2, 3, 4, 5, 6, 7
    punpckhqdq m0, m2             ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps m2, m4, q3210          ; m2 = 8, 9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                      ; m4 = 0, 1, 2, 3, 4, 5, 6, 7
    punpcklqdq m6, m5, m1         ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3             ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps m3, m5, q3210          ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                      ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova [dstq+0*mmsize], m4
    mova [dstq+1*mmsize], m2
    mova [dstq+2*mmsize], m0
    mova [dstq+3*mmsize], m5
    mova [dstq+4*mmsize], m3
    mova [dstq+5*mmsize], m1
    add src0q, mmsize
    add dstq, mmsize*6
    sub lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
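;
; Roughly equivalent scalar C (a sketch; interleave two planes and scale):
;     for (i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i] / 32768.0f;
;         dst[2*i+1] = src[1][i] / 32768.0f;
;     }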

%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea lenq, [2*lend]
    mov src1q, [src0q+gprsize]
    mov src0q, [src0q ]
    lea dstq, [dstq+4*lenq]
    add src0q, lenq
    add src1q, lenq
    neg lenq
    mova m5, [pf_s32_inv_scale]
.loop:
    mova m2, [src0q+lenq]         ; m2 = 0, 2, 4, 6, 8, 10, 12, 14
    mova m4, [src1q+lenq]         ; m4 = 1, 3, 5, 7, 9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3       ; m2 = 0, 1, 2, 3, 4, 5, 6, 7
                                  ; m4 = 8, 9, 10, 11, 12, 13, 14, 15
    pxor m3, m3
    punpcklwd m0, m3, m2          ; m0 = 0, 1, 2, 3
    punpckhwd m1, m3, m2          ; m1 = 4, 5, 6, 7
    punpcklwd m2, m3, m4          ; m2 = 8, 9, 10, 11
    punpckhwd m3, m4              ; m3 = 12, 13, 14, 15
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    mulps m0, m5
    mulps m1, m5
    mulps m2, m5
    mulps m3, m5
    mova [dstq+4*lenq ], m0
    mova [dstq+4*lenq+ mmsize], m1
    mova [dstq+4*lenq+2*mmsize], m2
    mova [dstq+4*lenq+3*mmsize], m3
    add lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
    mova m7, [pf_s32_inv_scale]
%if cpuflag(ssse3)
    %define unpack_even m6
    mova m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8
    mova m8, [pb_shuf_unpack_odd]
%else
    %define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq m0, [srcq ]              ; m0 = 0, 6, 12, 18, x, x, x, x
    movq m1, [srcq+src1q]         ; m1 = 1, 7, 13, 19, x, x, x, x
    movq m2, [srcq+src2q]         ; m2 = 2, 8, 14, 20, x, x, x, x
    movq m3, [srcq+src3q]         ; m3 = 3, 9, 15, 21, x, x, x, x
    movq m4, [srcq+src4q]         ; m4 = 4, 10, 16, 22, x, x, x, x
    movq m5, [srcq+src5q]         ; m5 = 5, 11, 17, 23, x, x, x, x
    ; unpack words:
    punpcklwd m0, m1              ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd m2, m3              ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    punpcklwd m4, m5              ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps m1, m4, m0, q3120      ; m1 = 4, 5, 16, 17, 6, 7, 18, 19
    shufps m0, m2, q2020          ; m0 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m2, m4, q3131          ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb m3, m0, unpack_odd     ; m3 = 12, 13, 14, 15
    pshufb m0, unpack_even        ; m0 = 0, 1, 2, 3
    pshufb m4, m1, unpack_odd     ; m4 = 16, 17, 18, 19
    pshufb m1, unpack_even        ; m1 = 4, 5, 6, 7
    pshufb m5, m2, unpack_odd     ; m5 = 20, 21, 22, 23
    pshufb m2, unpack_even        ; m2 = 8, 9, 10, 11
%else
    ; shuffle dwords
    pshufd m0, m0, q3120          ; m0 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd m1, m1, q3120          ; m1 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd m2, m2, q3120          ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pxor m6, m6                   ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd m3, m6, m0          ; m3 = 0, 1, 2, 3
    punpckhwd m4, m6, m0          ; m4 = 12, 13, 14, 15
    punpcklwd m0, m6, m1          ; m0 = 4, 5, 6, 7
    punpckhwd m5, m6, m1          ; m5 = 16, 17, 18, 19
    punpcklwd m1, m6, m2          ; m1 = 8, 9, 10, 11
    punpckhwd m6, m2              ; m6 = 20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5            ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps m0, m0               ; convert s32 to float
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    cvtdq2ps m4, m4
    cvtdq2ps m5, m5
    mulps m0, m7                  ; scale float from s32 range to [-1.0,1.0]
    mulps m1, m7
    mulps m2, m7
    mulps m3, m7
    mulps m4, m7
    mulps m5, m7
    mova [dstq ], m0
    mova [dstq+ mmsize], m1
    mova [dstq+2*mmsize], m2
    mova [dstq+3*mmsize], m3
    mova [dstq+4*mmsize], m4
    mova [dstq+5*mmsize], m5
    add srcq, mmsize/2
    add dstq, mmsize*6
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
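;
; Roughly equivalent scalar C (a sketch; interleave, scale and saturate, with
; av_clip_int16() from libavutil and C99 lrintf()):
;     for (i = 0; i < len; i++) {
;         dst[2*i  ] = av_clip_int16(lrintf(src[0][i] * 32768.0f));
;         dst[2*i+1] = av_clip_int16(lrintf(src[1][i] * 32768.0f));
;     }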

%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea lenq, [4*lend]
    mov src1q, [src0q+gprsize]
    mov src0q, [src0q ]
    add dstq, lenq
    add src0q, lenq
    add src1q, lenq
    neg lenq
    mova m2, [pf_s16_scale]
%if cpuflag(ssse3)
    mova m3, [pb_interleave_words]
%endif
.loop:
    mulps m0, m2, [src0q+lenq]    ; m0 = 0, 2, 4, 6
    mulps m1, m2, [src1q+lenq]    ; m1 = 1, 3, 5, 7
    cvtps2dq m0, m0
    cvtps2dq m1, m1
%if cpuflag(ssse3)
    packssdw m0, m1               ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb m0, m3                 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw m0, m0               ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw m1, m1               ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd m0, m1              ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova [dstq+lenq], m0
    add lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
    movaps xmm6, [pf_s16_scale]
.loop:
%if cpuflag(sse2)
    mulps m0, m6, [srcq ]
    mulps m1, m6, [srcq+src1q]
    mulps m2, m6, [srcq+src2q]
    mulps m3, m6, [srcq+src3q]
    mulps m4, m6, [srcq+src4q]
    mulps m5, m6, [srcq+src5q]
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    cvtps2dq m4, m4
    cvtps2dq m5, m5
    packssdw m0, m3               ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
    packssdw m1, m4               ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
    packssdw m2, m5               ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
    ; unpack words:
    movhlps m3, m0                ; m3 = 3, 9, 15, 21, x, x, x, x
    punpcklwd m0, m1              ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpckhwd m1, m2              ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
    punpcklwd m2, m3              ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    ; blend dwords:
    shufps m3, m0, m2, q2020      ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m0, m1, q2031          ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps m2, m1, q3131          ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords:
    shufps m1, m2, m3, q3120      ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    shufps m3, m0, q0220          ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
    shufps m0, m2, q3113          ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova [dstq+0*mmsize], m3
    mova [dstq+1*mmsize], m1
    mova [dstq+2*mmsize], m0
%else ; sse
    movlps xmm0, [srcq ]
    movlps xmm1, [srcq+src1q]
    movlps xmm2, [srcq+src2q]
    movlps xmm3, [srcq+src3q]
    movlps xmm4, [srcq+src4q]
    movlps xmm5, [srcq+src5q]
    mulps xmm0, xmm6
    mulps xmm1, xmm6
    mulps xmm2, xmm6
    mulps xmm3, xmm6
    mulps xmm4, xmm6
    mulps xmm5, xmm6
    cvtps2pi mm0, xmm0
    cvtps2pi mm1, xmm1
    cvtps2pi mm2, xmm2
    cvtps2pi mm3, xmm3
    cvtps2pi mm4, xmm4
    cvtps2pi mm5, xmm5
    packssdw mm0, mm3             ; m0 = 0, 6, 3, 9
    packssdw mm1, mm4             ; m1 = 1, 7, 4, 10
    packssdw mm2, mm5             ; m2 = 2, 8, 5, 11
    ; unpack words
    pshufw mm3, mm0, q1032        ; m3 = 3, 9, 0, 6
    punpcklwd mm0, mm1            ; m0 = 0, 1, 6, 7
    punpckhwd mm1, mm2            ; m1 = 4, 5, 10, 11
    punpcklwd mm2, mm3            ; m2 = 2, 3, 8, 9
    ; unpack dwords
    pshufw mm3, mm0, q1032        ; m3 = 6, 7, 0, 1
    punpckldq mm0, mm2            ; m0 = 0, 1, 2, 3 (final)
    punpckhdq mm2, mm1            ; m2 = 8, 9, 10, 11 (final)
    punpckldq mm1, mm3            ; m1 = 4, 5, 6, 7 (final)
    mova [dstq+0*mmsize], mm0
    mova [dstq+1*mmsize], mm1
    mova [dstq+2*mmsize], mm2
%endif
    add srcq, mmsize
    add dstq, mmsize*3
    sub lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
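;
; Roughly equivalent scalar C (a sketch; a pure 2-channel float interleave):
;     for (i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i];
;         dst[2*i+1] = src[1][i];
;     }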

%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov src1q, [src0q+gprsize]
    mov src0q, [src0q]
    lea lenq, [4*lend]
    add src0q, lenq
    add src1q, lenq
    lea dstq, [dstq+2*lenq]
    neg lenq
.loop:
    mova m0, [src0q+lenq ]
    mova m1, [src1q+lenq ]
    mova m2, [src0q+lenq+mmsize]
    mova m3, [src1q+lenq+mmsize]
    SBUTTERFLYPS 0, 1, 4
    SBUTTERFLYPS 2, 3, 4
    mova [dstq+2*lenq+0*mmsize], m0
    mova [dstq+2*lenq+1*mmsize], m1
    mova [dstq+2*lenq+2*mmsize], m2
    mova [dstq+2*lenq+3*mmsize], m3
    add lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLTP_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov lend, r2d
%else
%define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    mova m0, [srcq ]
    mova m1, [srcq+src1q]
    mova m2, [srcq+src2q]
    mova m3, [srcq+src3q]
    mova m4, [srcq+src4q]
    mova m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    blendps m6, m4, m0, 1100b
    movlhps m0, m2
    movhlps m4, m2
    blendps m2, m5, m1, 1100b
    movlhps m1, m3
    movhlps m5, m3

    movaps [dstq ], m0
    movaps [dstq+16], m6
    movaps [dstq+32], m4
    movaps [dstq+48], m1
    movaps [dstq+64], m2
    movaps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq [dstq ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add srcq, mmsize
    add dstq, mmsize*6
    sub lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
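;
; Roughly equivalent scalar C (a sketch; the reverse of the 2ch interleave):
;     for (i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ];
;         dst[1][i] = src[2*i+1];
;     }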

%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea lenq, [2*lend]
    mov dst1q, [dst0q+gprsize]
    mov dst0q, [dst0q ]
    lea srcq, [srcq+2*lenq]
    add dst0q, lenq
    add dst1q, lenq
    neg lenq
%if cpuflag(ssse3)
    mova m3, [pb_deinterleave_words]
%endif
.loop:
    mova m0, [srcq+2*lenq ]       ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb m0, m3                 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb m1, m3                 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2      ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                                  ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
%else ; sse2
    pshuflw m0, m0, q3120         ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
    pshufhw m0, m0, q3120         ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
    pshuflw m1, m1, q3120         ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
    pshufhw m1, m1, q3120         ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
    DEINT2_PS 0, 1, 2             ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                                  ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
%endif
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
    add lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mov dstq, [dstq ]
    sub dst1q, dstq
    sub dst2q, dstq
    sub dst3q, dstq
    sub dst4q, dstq
    sub dst5q, dstq
.loop:
    mova m0, [srcq+0*mmsize]      ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova m3, [srcq+1*mmsize]      ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR m1, m3, m0, 12, m4    ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps m3, m2, q1032          ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq m2, 4                  ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4       ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                  ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4       ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                  ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4       ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                  ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq m1, m2              ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    movq [dstq ], m0
    movhps [dstq+dst1q], m0
    movq [dstq+dst2q], m3
    movhps [dstq+dst3q], m3
    movq [dstq+dst4q], m1
    movhps [dstq+dst5q], m1
    add srcq, mmsize*3
    add dstq, mmsize/2
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_6CH
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
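;
; Roughly equivalent scalar C (a sketch). The asm splits even/odd samples
; with a shift (pslld 16) and a mask (pw_zero_even) so both channels land in
; the high 16 bits of each dword, then scales by 2^-31:
;     for (i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ] / 32768.0f;
;         dst[1][i] = src[2*i+1] / 32768.0f;
;     }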

%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea lenq, [4*lend]
    mov dst1q, [dst0q+gprsize]
    mov dst0q, [dst0q ]
    add srcq, lenq
    add dst0q, lenq
    add dst1q, lenq
    neg lenq
    mova m3, [pf_s32_inv_scale]
    mova m4, [pw_zero_even]
.loop:
    mova m1, [srcq+lenq]
    pslld m0, m1, 16
    pand m1, m4
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    mulps m0, m0, m3
    mulps m1, m1, m3
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
    add lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mov dstq, [dstq ]
    sub dst1q, dstq
    sub dst2q, dstq
    sub dst3q, dstq
    sub dst4q, dstq
    sub dst5q, dstq
    mova m6, [pf_s16_inv_scale]
.loop:
    mova m0, [srcq+0*mmsize]      ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova m3, [srcq+1*mmsize]      ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR m1, m3, m0, 12, m4    ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps m3, m2, q1032          ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq m2, 4                  ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4       ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                  ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4       ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                  ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4       ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                  ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq m1, m2              ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    S16_TO_S32_SX 0, 2            ; m0 = 0, 6, 12, 18
                                  ; m2 = 1, 7, 13, 19
    S16_TO_S32_SX 3, 4            ; m3 = 2, 8, 14, 20
                                  ; m4 = 3, 9, 15, 21
    S16_TO_S32_SX 1, 5            ; m1 = 4, 10, 16, 22
                                  ; m5 = 5, 11, 17, 23
    SWAP 1,2,3,4
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    cvtdq2ps m4, m4
    cvtdq2ps m5, m5
    mulps m0, m6
    mulps m1, m6
    mulps m2, m6
    mulps m3, m6
    mulps m4, m6
    mulps m5, m6
    mova [dstq ], m0
    mova [dstq+dst1q], m1
    mova [dstq+dst2q], m2
    mova [dstq+dst3q], m3
    mova [dstq+dst4q], m4
    mova [dstq+dst5q], m5
    add srcq, mmsize*3
    add dstq, mmsize
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
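;
; Roughly equivalent scalar C (a sketch; deinterleave, scale and saturate,
; with av_clip_int16() from libavutil and C99 lrintf()):
;     for (i = 0; i < len; i++) {
;         dst[0][i] = av_clip_int16(lrintf(src[2*i  ] * 32768.0f));
;         dst[1][i] = av_clip_int16(lrintf(src[2*i+1] * 32768.0f));
;     }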

%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea lenq, [2*lend]
    mov dst1q, [dst0q+gprsize]
    mov dst0q, [dst0q ]
    lea srcq, [srcq+4*lenq]
    add dst0q, lenq
    add dst1q, lenq
    neg lenq
    mova m5, [pf_s16_scale]
.loop:
    mova m0, [srcq+4*lenq ]
    mova m1, [srcq+4*lenq+ mmsize]
    mova m2, [srcq+4*lenq+2*mmsize]
    mova m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS 0, 1, 4
    DEINT2_PS 2, 3, 4
    mulps m0, m0, m5
    mulps m1, m1, m5
    mulps m2, m2, m5
    mulps m3, m3, m5
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    packssdw m0, m2
    packssdw m1, m3
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
    add lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mov dstq, [dstq ]
    sub dst1q, dstq
    sub dst2q, dstq
    sub dst3q, dstq
    sub dst4q, dstq
    sub dst5q, dstq
    mova m6, [pf_s16_scale]
.loop:
    mulps m0, m6, [srcq+0*mmsize]
    mulps m3, m6, [srcq+1*mmsize]
    mulps m1, m6, [srcq+2*mmsize]
    mulps m4, m6, [srcq+3*mmsize]
    mulps m2, m6, [srcq+4*mmsize]
    mulps m5, m6, [srcq+5*mmsize]
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    cvtps2dq m4, m4
    cvtps2dq m5, m5
    packssdw m0, m3               ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    packssdw m1, m4               ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    packssdw m2, m5               ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR m3, m1, m0, 12, m4    ; m3 = 6, 7, 8, 9, 10, 11, x, x
    shufps m1, m2, q1032          ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq m2, 4                  ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 3, 4       ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                  ; m3 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 1, 2, 4       ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                  ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 1, 4       ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                  ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq m3, m2              ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
    movq [dstq ], m0
    movhps [dstq+dst1q], m0
    movq [dstq+dst2q], m1
    movhps [dstq+dst3q], m1
    movq [dstq+dst4q], m3
    movhps [dstq+dst5q], m3
    add srcq, mmsize*6
    add dstq, mmsize/2
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
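;
; Roughly equivalent scalar C (a sketch; a pure 2-channel float deinterleave):
;     for (i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ];
;         dst[1][i] = src[2*i+1];
;     }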

%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea lenq, [4*lend]
    mov dst1q, [dst0q+gprsize]
    mov dst0q, [dst0q ]
    lea srcq, [srcq+2*lenq]
    add dst0q, lenq
    add dst1q, lenq
    neg lenq
.loop:
    mova m0, [srcq+2*lenq ]
    mova m1, [srcq+2*lenq+mmsize]
    DEINT2_PS 0, 1, 2
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
    add lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mov dstq, [dstq ]
    sub dst1q, dstq
    sub dst2q, dstq
    sub dst3q, dstq
    sub dst4q, dstq
    sub dst5q, dstq
.loop:
    mova m0, [srcq+0*mmsize]      ; m0 = 0, 1, 2, 3
    mova m1, [srcq+1*mmsize]      ; m1 = 4, 5, 6, 7
    mova m2, [srcq+2*mmsize]      ; m2 = 8, 9, 10, 11
    mova m3, [srcq+3*mmsize]      ; m3 = 12, 13, 14, 15
    mova m4, [srcq+4*mmsize]      ; m4 = 16, 17, 18, 19
    mova m5, [srcq+5*mmsize]      ; m5 = 20, 21, 22, 23

    SBUTTERFLY2 dq, 0, 3, 6       ; m0 = 0, 12, 1, 13
                                  ; m3 = 2, 14, 3, 15
    SBUTTERFLY2 dq, 1, 4, 6       ; m1 = 4, 16, 5, 17
                                  ; m4 = 6, 18, 7, 19
    SBUTTERFLY2 dq, 2, 5, 6       ; m2 = 8, 20, 9, 21
                                  ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6       ; m0 = 0, 6, 12, 18
                                  ; m4 = 1, 7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6       ; m3 = 2, 8, 14, 20
                                  ; m2 = 3, 9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6       ; m1 = 4, 10, 16, 22
                                  ; m5 = 5, 11, 17, 23
    mova [dstq ], m0
    mova [dstq+dst1q], m4
    mova [dstq+dst2q], m3
    mova [dstq+dst3q], m2
    mova [dstq+dst4q], m1
    mova [dstq+dst5q], m5
    add srcq, mmsize*6
    add dstq, mmsize
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH
%endif