Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
52868 views
1
/*
 * Copyright (c) 2008 Mans Rullgard <[email protected]>
 * Copyright (c) 2014 Janne Grunau <[email protected]>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22
#include "config.h"
23
#include "libavutil/aarch64/asm.S"
24
25
// swri_oldapi_conv_flt_to_s16_neon
//
// Converts a contiguous run of 32-bit float samples to signed 16-bit samples.
// Each float is converted to Q31 fixed point (fcvtzs #31, saturating) and then
// narrowed to 16 bits with a saturating, rounding right shift by 16.
//
// In:     x0 = destination (16-bit samples, written sequentially)
//         x1 = source (float samples, read sequentially)
//         w2 = number of samples
// Out:    nothing
// Clobb:  x0, x1, w2, x12, v0-v7, flags
//
// Samples are handled in groups of 8, and the first 8 are always read and
// written. NOTE(review): presumably callers guarantee a count that is a
// positive multiple of 8 -- confirm against the C call sites.
//
// Only the low 32 bits of the count are used: the nch dispatcher in this file
// examines the same argument as w2 before tail-branching here, and AAPCS64
// leaves the upper half of a 32-bit argument register unspecified.
function swri_oldapi_conv_flt_to_s16_neon, export=1
        subs            w2,  w2,  #8                    // samples left after the 8 preloaded below
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        b.eq            3f                              // exactly 8 samples: flush and return
        ands            w12, w2,  #~15                  // remaining count rounded down to a multiple of 16
        b.eq            2f
        // Main loop: 16 samples per iteration. v4/v5 always hold 8 converted
        // samples that have not been stored yet when the loop is entered/left.
1:      subs            w12, w12, #16
        sqrshrn         v4.4h,  v4.4s,  #16
        ld1             {v2.4s},  [x1], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        sqrshrn2        v4.8h,  v5.4s,  #16
        ld1             {v3.4s},  [x1], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        st1             {v6.8h},  [x0], #16
        b.ne            1b
        ands            w2,  w2,  #15                   // anything left beyond the pending 8?
        b.eq            3f
        // Tail: the pending 8 samples plus 8 more that still have to be loaded.
2:      ld1             {v2.4s},  [x1], #16
        sqrshrn         v4.4h,  v4.4s,  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x1], #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        st1             {v6.8h},  [x0]
        ret
        // Tail: only the pending 8 samples remain.
3:      sqrshrn         v4.4h,  v4.4s,  #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        st1             {v4.8h},  [x0]
        ret
endfunc
68
69
// swri_oldapi_conv_fltp_to_s16_2ch_neon
//
// Converts two separate float sample streams into one interleaved stream of
// signed 16-bit samples. Each float is converted to Q31 fixed point
// (fcvtzs #31, saturating); the upper 16 bits of each value are then merged
// pairwise with SRI, so every stored 32-bit word holds the sample from the
// first stream in its low half and the sample from the second stream in its
// high half. There is no rounding step on this path.
//
// In:     x0 = destination (interleaved 16-bit samples)
//         x1 = pointer to two consecutive source pointers (float streams)
//         w2 = number of samples per stream
// Out:    nothing
// Clobb:  x0, w2, x4, x5, x12, v0-v7, v16-v23, flags
//
// Samples are handled in groups of 8 per stream, and the first 8 are always
// read and written. NOTE(review): presumably callers guarantee a count that
// is a positive multiple of 8 -- confirm against the C call sites.
//
// Only the low 32 bits of the count are used: the nch dispatcher in this file
// examines the same argument as w2 before tail-branching here, and AAPCS64
// leaves the upper half of a 32-bit argument register unspecified.
function swri_oldapi_conv_fltp_to_s16_2ch_neon, export=1
        ldp             x4,  x5,  [x1]                  // x4 = first stream, x5 = second stream
        subs            w2,  w2,  #8                    // samples left after the 8 preloaded below
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        b.eq            3f                              // exactly 8 samples: flush and return
        ands            w12, w2,  #~15                  // remaining count rounded down to a multiple of 16
        b.eq            2f
        // Main loop: 16 samples per stream per iteration. v4-v7 always hold 8
        // converted, not yet stored samples per stream on entry and exit.
1:      subs            w12, w12, #16
        ld1             {v16.4s}, [x4], #16
        fcvtzs          v20.4s, v16.4s, #31
        sri             v6.4s,  v4.4s,  #16             // interleave: high half from v6, low half from v4
        ld1             {v17.4s}, [x4], #16
        fcvtzs          v21.4s, v17.4s, #31
        ld1             {v18.4s}, [x5], #16
        fcvtzs          v22.4s, v18.4s, #31
        ld1             {v19.4s}, [x5], #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s},  [x0], #16
        fcvtzs          v23.4s, v19.4s, #31
        st1             {v7.4s},  [x0], #16
        sri             v22.4s, v20.4s, #16
        ld1             {v0.4s},  [x4], #16
        sri             v23.4s, v21.4s, #16
        st1             {v22.4s}, [x0], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        st1             {v23.4s}, [x0], #16
        b.ne            1b
        ands            w2,  w2,  #15                   // anything left beyond the pending 8?
        b.eq            3f
        // Tail: the pending 8 samples plus 8 more that still have to be loaded.
2:      sri             v6.4s,  v4.4s,  #16
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        sri             v7.4s,  v5.4s,  #16
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        sri             v2.4s,  v0.4s,  #16
        st1             {v6.4s,v7.4s},  [x0], #32
        sri             v3.4s,  v1.4s,  #16
        st1             {v2.4s,v3.4s},  [x0], #32
        ret
        // Tail: only the pending 8 samples remain.
3:      sri             v6.4s,  v4.4s,  #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s,v7.4s},  [x0]
        ret
endfunc
131
132
// swri_oldapi_conv_fltp_to_s16_nch_neon
//
// Converts several separate float sample streams into one interleaved stream
// of signed 16-bit samples. Each float is converted to Q31 fixed point
// (fcvtzs #31, saturating) and its upper 16 bits are stored; there is no
// rounding step on the paths implemented in this function.
//
// In:     x0 = destination (interleaved 16-bit samples)
//         x1 = pointer to an array of source pointers, one per channel
//         w2 = number of samples per channel
//         w3 = number of channels
// Out:    nothing
// Clobb:  x0, x1, w3, x4-x9, x12, v0-v7, v16-v19, flags
//
// Dispatch:
//   w3 == 2  tail-branches to swri_oldapi_conv_fltp_to_s16_2ch_neon
//   w3 <  2  tail-branches to swri_oldapi_conv_flt_to_s16_neon with x1
//            replaced by the first source pointer
//   w3 >  2  handled here: channels are consumed in groups of 4, then 2,
//            then 1. Each group makes a full pass over all samples, storing
//            with a stride of x12 = 2 * channels bytes, after which x0 is
//            advanced past the columns just written.
//
// Samples are handled in groups of 8 (4-channel pass) or 8/16 (2- and
// 1-channel passes). NOTE(review): presumably callers guarantee a sample
// count that is a positive multiple of 8 -- confirm against the C call sites.
function swri_oldapi_conv_fltp_to_s16_nch_neon, export=1
        cmp             w3,  #2
        b.eq            X(swri_oldapi_conv_fltp_to_s16_2ch_neon)
        b.gt            1f
        ldr             x1,  [x1]
        b               X(swri_oldapi_conv_flt_to_s16_neon)
1:
        cmp             w3,  #4
        // Output stride in bytes. Computed from w3 so that x12 is
        // zero-extended: the channel count is treated as a 32-bit value
        // everywhere else in this function, and the upper half of x3 is
        // not guaranteed to be defined for a 32-bit argument.
        lsl             w12, w3,  #1
        b.lt            4f

5:      // 4 channels
        ldp             x4,  x5,  [x1], #16
        ldp             x6,  x7,  [x1], #16
        mov             w9,  w2                         // w9 = samples left in this pass
        mov             x8,  x0                         // x8 = output cursor for this pass
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
6:
        subs            w9,  w9,  #8
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        sri             v5.4s,  v4.4s,  #16             // pair up channels 0/1 per sample
        ld1             {v1.4s},  [x5], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        sri             v7.4s,  v6.4s,  #16             // pair up channels 2/3 per sample
        ld1             {v2.4s},  [x6], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        zip1            v16.4s, v5.4s,  v7.4s           // each 64-bit half = 4 channels of one sample
        ld1             {v3.4s},  [x7], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        zip2            v17.4s, v5.4s,  v7.4s
        st1             {v16.d}[0], [x8], x12
        sri             v1.4s,  v0.4s,  #16
        st1             {v16.d}[1], [x8], x12
        sri             v3.4s,  v2.4s,  #16
        st1             {v17.d}[0], [x8], x12
        zip1            v18.4s, v1.4s,  v3.4s
        st1             {v17.d}[1], [x8], x12
        zip2            v19.4s, v1.4s,  v3.4s
        b.eq            7f                              // flags still from the subs above
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v18.d}[0], [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v18.d}[1], [x8], x12
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v19.d}[0], [x8], x12
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v19.d}[1], [x8], x12
        b               6b
7:
        st1             {v18.d}[0], [x8], x12
        st1             {v18.d}[1], [x8], x12
        st1             {v19.d}[0], [x8], x12
        st1             {v19.d}[1], [x8], x12
        subs            w3,  w3,  #4
        b.eq            9f
        cmp             w3,  #4
        add             x0,  x0,  #8                    // skip the 4 columns just written
        b.ge            5b

4:      // 2 channels
        cmp             w3,  #2
        b.lt            4f
        ldp             x4,  x5,  [x1], #16
        mov             w9,  w2
        mov             x8,  x0
        tst             w9,  #8                         // odd number of 8-sample groups?
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        b.eq            6f                              // flags still from the tst above
        subs            w9,  w9,  #8
        b.eq            7f                              // only the preloaded 8 samples exist
        // Emit one group of 8 so that the remainder is a multiple of 16.
        sri             v5.4s,  v4.4s,  #16
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v5.s}[0],  [x8], x12
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[1],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
6:
        subs            w9,  w9,  #16
        ld1             {v0.4s},  [x4], #16
        sri             v5.4s,  v4.4s,  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x5], #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        fcvtzs          v1.4s,  v1.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        ld1             {v2.4s},  [x4], #16
        st1             {v7.s}[0],  [x8], x12
        fcvtzs          v2.4s,  v2.4s,  #31
        st1             {v7.s}[1],  [x8], x12
        ld1             {v3.4s},  [x5], #16
        st1             {v7.s}[2],  [x8], x12
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v7.s}[3],  [x8], x12
        sri             v1.4s,  v0.4s,  #16
        sri             v3.4s,  v2.4s,  #16
        b.eq            6f                              // flags still from the subs above
        ld1             {v4.4s},  [x4], #16
        st1             {v1.s}[0],  [x8], x12
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v1.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        st1             {v1.s}[2],  [x8], x12
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v1.s}[3],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        st1             {v3.s}[0],  [x8], x12
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v3.s}[1],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        st1             {v3.s}[2],  [x8], x12
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v3.s}[3],  [x8], x12
        b.gt            6b
6:
        st1             {v1.s}[0],  [x8], x12
        st1             {v1.s}[1],  [x8], x12
        st1             {v1.s}[2],  [x8], x12
        st1             {v1.s}[3],  [x8], x12
        st1             {v3.s}[0],  [x8], x12
        st1             {v3.s}[1],  [x8], x12
        st1             {v3.s}[2],  [x8], x12
        st1             {v3.s}[3],  [x8], x12
        b               8f
7:
        sri             v5.4s,  v4.4s,  #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
8:
        subs            w3,  w3,  #2
        add             x0,  x0,  #4                    // skip the 2 columns just written
        b.eq            9f

4:      // 1 channel
        ldr             x4,  [x1]
        tst             w2,  #8                         // odd number of 8-sample groups?
        mov             w9,  w2
        mov             x5,  x0
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b.ne            8f                              // flags still from the tst above
6:
        subs            w9,  w9,  #16
        ld1             {v2.4s},  [x4], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x4], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        // Odd halfword lanes are the upper 16 bits of each Q31 value.
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            7f                              // last 16: skip the preload
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
7:
        st1             {v2.h}[1],  [x5], x12
        st1             {v2.h}[3],  [x5], x12
        st1             {v2.h}[5],  [x5], x12
        st1             {v2.h}[7],  [x5], x12
        st1             {v3.h}[1],  [x5], x12
        st1             {v3.h}[3],  [x5], x12
        st1             {v3.h}[5],  [x5], x12
        st1             {v3.h}[7],  [x5], x12
        b.gt            6b                              // flags still from the subs at 6:
        ret
8:
        // Emit one group of 8 so that the remainder is a multiple of 16.
        subs            w9,  w9,  #8
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            9f
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b               6b
9:      // common exit
        ret
endfunc
364
365