/*
 * Copyright (c) 2008 Siarhei Siamashka <[email protected]>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/**
 * Assume that len is a positive number and is a multiple of 8
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
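@ Rough C equivalent (informal sketch):
@     for (i = 0; i < len; i++)
@         dst[i] = src0[i] * src1[i];
@ The implementation relies on the VFP short-vector mode: setting the FPSCR
@ LEN field (bits 18:16) to 3 below makes each vmul.f32 operate on four
@ consecutive single-precision registers per instruction.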
function ff_vector_fmul_vfp, export=1
        vpush {d8-d15}
        fmrx r12, fpscr
        orr r12, r12, #(3 << 16) /* set vector size to 4 */
        fmxr fpscr, r12

        vldmia r1!, {s0-s3}
        vldmia r2!, {s8-s11}
        vldmia r1!, {s4-s7}
        vldmia r2!, {s12-s15}
        vmul.f32 s8, s0, s8
1:
        subs r3, r3, #16
        vmul.f32 s12, s4, s12
        itttt ge
        vldmiage r1!, {s16-s19}
        vldmiage r2!, {s24-s27}
        vldmiage r1!, {s20-s23}
        vldmiage r2!, {s28-s31}
        it ge
        vmulge.f32 s24, s16, s24
        vstmia r0!, {s8-s11}
        vstmia r0!, {s12-s15}
        it ge
        vmulge.f32 s28, s20, s28
        itttt gt
        vldmiagt r1!, {s0-s3}
        vldmiagt r2!, {s8-s11}
        vldmiagt r1!, {s4-s7}
        vldmiagt r2!, {s12-s15}
        ittt ge
        vmulge.f32 s8, s0, s8
        vstmiage r0!, {s24-s27}
        vstmiage r0!, {s28-s31}
        bgt 1b

        bic r12, r12, #(7 << 16) /* set vector size back to 1 */
        fmxr fpscr, r12
        vpop {d8-d15}
        bx lr
endfunc

/**
 * ARM VFP implementation of 'vector_fmul_window_c' function
 * Assume that len is a positive non-zero number
 */
@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
@                                const float *src1, const float *win, int len)
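@ Rough C equivalent (informal sketch), writing j = len - 1 - i:
@     for (i = 0; i < len; i++) {
@         dst[i]       = src0[i] * win[len + j] - src1[j] * win[i];
@         dst[len + j] = src0[i] * win[i]       + src1[j] * win[len + j];
@     }
@ i.e. a windowed overlap-add step that produces 2*len outputs, walking dst
@ and win from both ends (DST0/DST1 and WIN0/WIN1 below).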
function ff_vector_fmul_window_vfp, export=1
        DST0 .req a1
        SRC0 .req a2
        SRC1 .req a3
        WIN0 .req a4
        LEN .req v1
        DST1 .req v2
        WIN1 .req v3
        OLDFPSCR .req ip

        push {v1-v3,lr}
        ldr LEN, [sp, #4*4+0]
        vpush {s16-s31}
        fmrx OLDFPSCR, FPSCR
        add DST1, DST0, LEN, lsl #3
        add SRC1, SRC1, LEN, lsl #2
        add WIN1, WIN0, LEN, lsl #3

        tst LEN, #7
        beq 4f @ common case: len is a multiple of 8
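@ Descriptive note: a remainder of 1, 2 or 4 elements (len & 7) is peeled off
@ first in scalar RunFast mode (the blocks ending at labels 1:, 2:, 3:); the
@ remaining multiple of 8 is then handled by the pipelined loop at 4:/5:/6:
@ using short vectors of length 4.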

        ldr lr, =0x03000000 @ RunFast mode, scalar mode
        fmxr FPSCR, lr

        tst LEN, #1
        beq 1f
        vldmdb WIN1!, {s0}
        vldmia SRC0!, {s8}
        vldmia WIN0!, {s16}
        vmul.f s24, s0, s8
        vldmdb SRC1!, {s20}
        vmul.f s8, s16, s8
        vmls.f s24, s16, s20
        vmla.f s8, s0, s20
        vstmia DST0!, {s24}
        vstmdb DST1!, {s8}
1:
        tst LEN, #2
        beq 2f
        vldmdb WIN1!, {s0}
        vldmdb WIN1!, {s1}
        vldmia SRC0!, {s8-s9}
        vldmia WIN0!, {s16-s17}
        vmul.f s24, s0, s8
        vmul.f s25, s1, s9
        vldmdb SRC1!, {s20}
        vldmdb SRC1!, {s21}
        vmul.f s8, s16, s8
        vmul.f s9, s17, s9
        vmls.f s24, s16, s20
        vmls.f s25, s17, s21
        vmla.f s8, s0, s20
        vmla.f s9, s1, s21
        vstmia DST0!, {s24-s25}
        vstmdb DST1!, {s8}
        vstmdb DST1!, {s9}
2:
        tst LEN, #4
        beq 3f
        vldmdb WIN1!, {s0}
        vldmdb WIN1!, {s1}
        vldmdb WIN1!, {s2}
        vldmdb WIN1!, {s3}
        vldmia SRC0!, {s8-s11}
        vldmia WIN0!, {s16-s19}
        vmul.f s24, s0, s8
        vmul.f s25, s1, s9
        vmul.f s26, s2, s10
        vmul.f s27, s3, s11
        vldmdb SRC1!, {s20}
        vldmdb SRC1!, {s21}
        vldmdb SRC1!, {s22}
        vldmdb SRC1!, {s23}
        vmul.f s8, s16, s8
        vmul.f s9, s17, s9
        vmul.f s10, s18, s10
        vmul.f s11, s19, s11
        vmls.f s24, s16, s20
        vmls.f s25, s17, s21
        vmls.f s26, s18, s22
        vmls.f s27, s19, s23
        vmla.f s8, s0, s20
        vmla.f s9, s1, s21
        vmla.f s10, s2, s22
        vmla.f s11, s3, s23
        vstmia DST0!, {s24-s27}
        vstmdb DST1!, {s8}
        vstmdb DST1!, {s9}
        vstmdb DST1!, {s10}
        vstmdb DST1!, {s11}
3:
        bics LEN, LEN, #7
        beq 7f
4:
        ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
        fmxr FPSCR, lr

        vldmdb WIN1!, {s0}
        vldmdb WIN1!, {s1}
        vldmdb WIN1!, {s2}
        vldmdb WIN1!, {s3}
        vldmia SRC0!, {s8-s11}
        vldmia WIN0!, {s16-s19}
        vmul.f s24, s0, s8 @ vector * vector
        vldmdb SRC1!, {s20}
        vldmdb SRC1!, {s21}
        vldmdb SRC1!, {s22}
        vldmdb SRC1!, {s23}
        vmul.f s8, s16, s8 @ vector * vector
        vmls.f s24, s16, s20 @ vector * vector
        vldmdb WIN1!, {s4}
        vldmdb WIN1!, {s5}
        vldmdb WIN1!, {s6}
        vldmdb WIN1!, {s7}
        vldmia SRC0!, {s12-s13}
        vmla.f s8, s0, s20 @ vector * vector
        vldmia SRC0!, {s14-s15}
        subs LEN, LEN, #8
        beq 6f
5:      vldmia WIN0!, {s20-s23}
        vmul.f s28, s4, s12 @ vector * vector
        vstmia DST0!, {s24-s25}
        vldmdb SRC1!, {s16}
        vldmdb SRC1!, {s17}
        vldmdb SRC1!, {s18}
        vldmdb SRC1!, {s19}
        vmul.f s12, s20, s12 @ vector * vector
        vstmia DST0!, {s26-s27}
        vstmdb DST1!, {s8}
        vstmdb DST1!, {s9}
        vstmdb DST1!, {s10}
        vstmdb DST1!, {s11}
        vmls.f s28, s20, s16 @ vector * vector
        vldmdb WIN1!, {s0}
        vldmdb WIN1!, {s1}
        vldmdb WIN1!, {s2}
        vldmdb WIN1!, {s3}
        vldmia SRC0!, {s8-s9}
        vmla.f s12, s4, s16 @ vector * vector
        vldmia SRC0!, {s10-s11}
        subs LEN, LEN, #8
        vldmia WIN0!, {s16-s19}
        vmul.f s24, s0, s8 @ vector * vector
        vstmia DST0!, {s28-s29}
        vldmdb SRC1!, {s20}
        vldmdb SRC1!, {s21}
        vldmdb SRC1!, {s22}
        vldmdb SRC1!, {s23}
        vmul.f s8, s16, s8 @ vector * vector
        vstmia DST0!, {s30-s31}
        vstmdb DST1!, {s12}
        vstmdb DST1!, {s13}
        vstmdb DST1!, {s14}
        vstmdb DST1!, {s15}
        vmls.f s24, s16, s20 @ vector * vector
        vldmdb WIN1!, {s4}
        vldmdb WIN1!, {s5}
        vldmdb WIN1!, {s6}
        vldmdb WIN1!, {s7}
        vldmia SRC0!, {s12-s13}
        vmla.f s8, s0, s20 @ vector * vector
        vldmia SRC0!, {s14-s15}
        bne 5b
6:      vldmia WIN0!, {s20-s23}
        vmul.f s28, s4, s12 @ vector * vector
        vstmia DST0!, {s24-s25}
        vldmdb SRC1!, {s16}
        vldmdb SRC1!, {s17}
        vldmdb SRC1!, {s18}
        vldmdb SRC1!, {s19}
        vmul.f s12, s20, s12 @ vector * vector
        vstmia DST0!, {s26-s27}
        vstmdb DST1!, {s8}
        vstmdb DST1!, {s9}
        vstmdb DST1!, {s10}
        vstmdb DST1!, {s11}
        vmls.f s28, s20, s16 @ vector * vector
        vmla.f s12, s4, s16 @ vector * vector
        vstmia DST0!, {s28-s31}
        vstmdb DST1!, {s12}
        vstmdb DST1!, {s13}
        vstmdb DST1!, {s14}
        vstmdb DST1!, {s15}
7:
        fmxr FPSCR, OLDFPSCR
        vpop {s16-s31}
        pop {v1-v3,pc}

        .unreq DST0
        .unreq SRC0
        .unreq SRC1
        .unreq WIN0
        .unreq LEN
        .unreq OLDFPSCR
        .unreq DST1
        .unreq WIN1
endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is a multiple of 8
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
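@ Rough C equivalent (informal sketch):
@     for (i = 0; i < len; i++)
@         dst[i] = src0[i] * src1[len - 1 - i];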
function ff_vector_fmul_reverse_vfp, export=1
        vpush {d8-d15}
        add r2, r2, r3, lsl #2
        vldmdb r2!, {s0-s3}
        vldmia r1!, {s8-s11}
        vldmdb r2!, {s4-s7}
        vldmia r1!, {s12-s15}
        vmul.f32 s8, s3, s8
        vmul.f32 s9, s2, s9
        vmul.f32 s10, s1, s10
        vmul.f32 s11, s0, s11
1:
        subs r3, r3, #16
        it ge
        vldmdbge r2!, {s16-s19}
        vmul.f32 s12, s7, s12
        it ge
        vldmiage r1!, {s24-s27}
        vmul.f32 s13, s6, s13
        it ge
        vldmdbge r2!, {s20-s23}
        vmul.f32 s14, s5, s14
        it ge
        vldmiage r1!, {s28-s31}
        vmul.f32 s15, s4, s15
        it ge
        vmulge.f32 s24, s19, s24
        it gt
        vldmdbgt r2!, {s0-s3}
        it ge
        vmulge.f32 s25, s18, s25
        vstmia r0!, {s8-s13}
        it ge
        vmulge.f32 s26, s17, s26
        it gt
        vldmiagt r1!, {s8-s11}
        itt ge
        vmulge.f32 s27, s16, s27
        vmulge.f32 s28, s23, s28
        it gt
        vldmdbgt r2!, {s4-s7}
        it ge
        vmulge.f32 s29, s22, s29
        vstmia r0!, {s14-s15}
        ittt ge
        vmulge.f32 s30, s21, s30
        vmulge.f32 s31, s20, s31
        vmulge.f32 s8, s3, s8
        it gt
        vldmiagt r1!, {s12-s15}
        itttt ge
        vmulge.f32 s9, s2, s9
        vmulge.f32 s10, s1, s10
        vstmiage r0!, {s24-s27}
        vmulge.f32 s11, s0, s11
        it ge
        vstmiage r0!, {s28-s31}
        bgt 1b

        vpop {d8-d15}
        bx lr
endfunc

/**
 * ARM VFP implementation of 'butterflies_float_c' function
 * Assume that len is a positive non-zero number
 */
@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
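@ Rough C equivalent (informal sketch): an in-place add/subtract butterfly,
@     for (i = 0; i < len; i++) {
@         float t = v1[i] - v2[i];
@         v1[i]  += v2[i];
@         v2[i]   = t;
@     }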
function ff_butterflies_float_vfp, export=1
        BASE1 .req a1
        BASE2 .req a2
        LEN .req a3
        OLDFPSCR .req a4

        vpush {s16-s31}
        fmrx OLDFPSCR, FPSCR

        tst LEN, #7
        beq 4f @ common case: len is a multiple of 8

        ldr ip, =0x03000000 @ RunFast mode, scalar mode
        fmxr FPSCR, ip

        tst LEN, #1
        beq 1f
        vldmia BASE1!, {s0}
        vldmia BASE2!, {s8}
        vadd.f s16, s0, s8
        vsub.f s24, s0, s8
        vstr s16, [BASE1, #0-4*1]
        vstr s24, [BASE2, #0-4*1]
1:
        tst LEN, #2
        beq 2f
        vldmia BASE1!, {s0-s1}
        vldmia BASE2!, {s8-s9}
        vadd.f s16, s0, s8
        vadd.f s17, s1, s9
        vsub.f s24, s0, s8
        vsub.f s25, s1, s9
        vstr d8, [BASE1, #0-8*1] @ s16,s17
        vstr d12, [BASE2, #0-8*1] @ s24,s25
2:
        tst LEN, #4
        beq 3f
        vldmia BASE1!, {s0-s1}
        vldmia BASE2!, {s8-s9}
        vldmia BASE1!, {s2-s3}
        vldmia BASE2!, {s10-s11}
        vadd.f s16, s0, s8
        vadd.f s17, s1, s9
        vsub.f s24, s0, s8
        vsub.f s25, s1, s9
        vadd.f s18, s2, s10
        vadd.f s19, s3, s11
        vsub.f s26, s2, s10
        vsub.f s27, s3, s11
        vstr d8, [BASE1, #0-16*1] @ s16,s17
        vstr d12, [BASE2, #0-16*1] @ s24,s25
        vstr d9, [BASE1, #8-16*1] @ s18,s19
        vstr d13, [BASE2, #8-16*1] @ s26,s27
3:
        bics LEN, LEN, #7
        beq 7f
4:
        ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
        fmxr FPSCR, ip

        vldmia BASE1!, {s0-s1}
        vldmia BASE2!, {s8-s9}
        vldmia BASE1!, {s2-s3}
        vldmia BASE2!, {s10-s11}
        vadd.f s16, s0, s8
        vldmia BASE1!, {s4-s5}
        vldmia BASE2!, {s12-s13}
        vldmia BASE1!, {s6-s7}
        vldmia BASE2!, {s14-s15}
        vsub.f s24, s0, s8
        vadd.f s20, s4, s12
        subs LEN, LEN, #8
        beq 6f
5:      vldmia BASE1!, {s0-s3}
        vldmia BASE2!, {s8-s11}
        vsub.f s28, s4, s12
        vstr d8, [BASE1, #0-16*3] @ s16,s17
        vstr d9, [BASE1, #8-16*3] @ s18,s19
        vstr d12, [BASE2, #0-16*3] @ s24,s25
        vstr d13, [BASE2, #8-16*3] @ s26,s27
        vadd.f s16, s0, s8
        vldmia BASE1!, {s4-s7}
        vldmia BASE2!, {s12-s15}
        vsub.f s24, s0, s8
        vstr d10, [BASE1, #0-16*3] @ s20,s21
        vstr d11, [BASE1, #8-16*3] @ s22,s23
        vstr d14, [BASE2, #0-16*3] @ s28,s29
        vstr d15, [BASE2, #8-16*3] @ s30,s31
        vadd.f s20, s4, s12
        subs LEN, LEN, #8
        bne 5b
6:      vsub.f s28, s4, s12
        vstr d8, [BASE1, #0-16*2] @ s16,s17
        vstr d9, [BASE1, #8-16*2] @ s18,s19
        vstr d12, [BASE2, #0-16*2] @ s24,s25
        vstr d13, [BASE2, #8-16*2] @ s26,s27
        vstr d10, [BASE1, #0-16*1] @ s20,s21
        vstr d11, [BASE1, #8-16*1] @ s22,s23
        vstr d14, [BASE2, #0-16*1] @ s28,s29
        vstr d15, [BASE2, #8-16*1] @ s30,s31
7:
        fmxr FPSCR, OLDFPSCR
        vpop {s16-s31}
        bx lr

        .unreq BASE1
        .unreq BASE2
        .unreq LEN
        .unreq OLDFPSCR
endfunc