Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
52868 views
1
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <[email protected]>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22
#include "config.h"
23
#include "asm.S"
24
25
/*
 * ff_vector_fmul_neon: element-wise product of two float vectors.
 *
 * Register use, as read from the code below:
 *   r0  destination pointer (written, post-incremented)
 *   r1  first source pointer (read, post-incremented)
 *   r2  second source pointer (read, post-incremented)
 *   r3  element count
 *   ip  scratch: count of elements left for the 16-wide main loop
 *   q0-q3, q8-q11  NEON scratch
 *
 * Every load and store carries a :128 alignment qualifier, so all three
 * pointers must be 16-byte aligned.
 *
 * The first 8 products are computed before the count is tested and each
 * later step consumes 8 or 16 elements, so the count must be at least 8
 * and a multiple of 8.
 *
 * NOTE(review): presumably called from C as
 * (float *dst, const float *src0, const float *src1, int len) - confirm
 * against the C prototype.
 */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ 8 elements are handled by the prologue
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3            @ q8-q9 = 8 products not yet stored
        beq             3f                      @ flags from subs: count was exactly 8
        bics            ip,  r3,  #15           @ ip = remaining count rounded down to 16
        beq             2f                      @ fewer than 16 left: only the 8-wide tail
        @ Main loop: 16 elements per pass. The 8 pending products in q8-q9 are
        @ stored while the next ones are computed into q10-q11, then the roles
        @ swap. NEON instructions leave the flags alone, so the bne at the
        @ bottom still tests the subs at the top.
1:      subs            ip,  ip,  #16
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[r0,:128]!
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15           @ anything left beyond the 16-wide passes?
        beq             3f
        @ Tail: 8 more elements. Each half of the pending result is stored
        @ just before its register is overwritten with a new product.
2:      vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
        @ Flush the last 8 pending products.
3:      vst1.32         {d16-d19},[r0,:128]!
        bx              lr
endfunc
63
64
/*
 * ff_vector_fmac_scalar_neon: multiply-accumulate by a scalar, in place:
 * every destination element gets (source element * scalar) added to it.
 *
 * Register use, as read from the code below:
 *   r0   destination pointer; used for the stores
 *   r1   source pointer
 *   acc  copy of r0 used for the loads of the destination values
 *   len  element count
 *   r12  count of elements left for the 16-wide main loop
 *   q15  the scalar broadcast to all four lanes
 *
 * VFP and NOVFP are line prefixes defined outside this section (presumably
 * by asm.S). A VFP line is for the variant where the scalar arrives in
 * d0[0] and the count in r2; a NOVFP line is for the variant where the
 * scalar arrives in r2 and the count in r3.
 *
 * All accesses carry a :128 qualifier: both buffers must be 16-byte aligned.
 *
 * NOTE(review): when the count is below 16 the code jumps straight to the
 * 4-wide tail, whose body runs before its test; a count of 0 therefore
 * still processes one group of 4. Presumably callers never pass 0 - confirm.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
        @ Broadcast the scalar. In the NOVFP variant this must happen before
        @ acc is written below, because acc and the incoming scalar share r2.
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15           @ r12 = len rounded down to 16
        mov             acc, r0                 @ mov leaves the flags from bics intact
        beq             3f                      @ fewer than 16 elements: tail only
        @ Prime the pipeline with the first 8 elements.
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        @ Main loop: 16 elements per pass, with loads for the next group
        @ interleaved between the multiply-accumulates of the current one.
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last pass: skip the look-ahead loads
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
        @ Drain the two results still held in registers.
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ elements beyond the last multiple of 16
        it              eq
        bxeq            lr
        @ Tail: 4 elements per pass until the count is used up.
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        @ Release both aliases so neither name stays bound to a register
        @ for the rest of the file.
        .unreq          len
        .unreq          acc
endfunc
111
112
/*
 * ff_vector_fmul_scalar_neon: multiply every element of a float vector by
 * one scalar and store the products.
 *
 * Register use, as read from the code below:
 *   r0   destination pointer (written, post-incremented)
 *   r1   source pointer (read, post-incremented)
 *   len  element count
 *   r12  count of elements left for the 16-wide main loop
 *   q8   the scalar broadcast to all four lanes
 *
 * VFP and NOVFP are line prefixes defined outside this section (presumably
 * by asm.S). A VFP line is for the variant where the scalar arrives in
 * d0[0] and the count in r2; a NOVFP line is for the variant where the
 * scalar arrives in r2 and the count in r3.
 *
 * All accesses carry a :128 qualifier: both buffers must be 16-byte aligned.
 *
 * NOTE(review): when the count is below 16 the code jumps straight to the
 * 4-wide tail, whose body runs before its test; a count of 0 therefore
 * still processes one group of 4. Presumably callers never pass 0 - confirm.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           @ r12 = len rounded down to 16
        beq             3f                      @ fewer than 16 elements: tail only
        @ Prime the pipeline with the first 8 elements.
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
        @ Main loop: 16 elements per pass, with loads for the next group
        @ interleaved between the multiplies of the current one.
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last pass: skip the look-ahead loads
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
        @ Drain the two results still held in registers.
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15           @ elements beyond the last multiple of 16
        it              eq
        bxeq            lr
        @ Tail: 4 elements per pass until the count is used up.
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
149
150
/*
 * ff_vector_fmul_window_neon: combine two source vectors with a window
 * table, writing the destination from both ends towards the middle.
 *
 * Register use, as read from the code below:
 *   r0  destination, forward pointer
 *   ip  destination, backward pointer; starts at r0 + 8*len - 16 bytes
 *   r1  first source, read forwards
 *   r2  second source, read backwards; starts at r2 + 4*len - 16 bytes
 *   r3  window, forward pointer
 *   r4  window, backward pointer; starts at r3 + 8*len - 16 bytes
 *   r5  scratch for the address setup, then the backward step of -16 bytes
 *   lr  len, loaded from the first stack argument word, used as the
 *       loop counter
 *
 * From the start offsets above, the destination and the window each span
 * 2*len floats while the two sources span len floats each.
 *
 * Each pass handles 4 elements at each end (lr drops by 4), and the body
 * runs before the exit test, so len must be a positive multiple of 4.
 * All accesses carry a :128 qualifier: every buffer must be 16-byte
 * aligned.
 *
 * NOTE(review): presumably called from C as (float *dst,
 * const float *src0, const float *src1, const float *win, int len) -
 * confirm against the C prototype.
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]          @ first stack argument, above the 3 pushed words
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ last 4 floats of the second source
        add             r4,  r3,  r5, lsl #3    @ last 4 floats of the window
        add             ip,  r0,  r5, lsl #3    @ last 4 floats of the destination
        mov             r5,  #-16               @ post-index step for the backward pointers
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
        @ Loop: q0 = first source, q1 = second source (from the end),
        @ q2 = window front, q3 = window back. vrev64 plus the crossed use of
        @ the d halves puts the backward-read data in reverse element order.
        @ q10 goes to the forward destination, q11 to the backward one.
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                      @ flags from subs: final pass, no further loads
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5 @ next second-source data lands in q9 ...
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!    @ ... and next window front in q12,
        vmls.f32        d21, d2,  d5            @ since q1 and q2 are still being read
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                @ reverse q11 for the backward store
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
        @ Final pass: same arithmetic as above without the look-ahead loads.
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc
196
197
/*
 * ff_vector_fmul_add_neon: element-wise (first * second) + third over
 * three float vectors.
 *
 * Register use, as read from the code below:
 *   r0   destination pointer (written, post-incremented)
 *   r1   first factor pointer
 *   r2   second factor pointer
 *   r3   addend pointer
 *   r12  element count, loaded from the first stack argument word
 *
 * 8 elements are handled per pass and the first 8 are loaded before the
 * count is tested, so the count must be a positive multiple of 8.
 * All accesses carry a :128 qualifier: every buffer must be 16-byte
 * aligned.
 *
 * NOTE(review): presumably called from C as (float *dst,
 * const float *src0, const float *src1, const float *src2, int len) -
 * confirm against the C prototype.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]               @ no pushes here, so [sp] is the first stack argument
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
        @ Loop: finish the current 8 results in q12-q13, then start the next
        @ 8 products before storing, so the store overlaps the new loads.
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]               @ prefetch hints for the three inputs
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f                      @ last 8 done: store and return
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc
223
224
/*
 * ff_vector_fmul_reverse_neon: multiply one float vector by another that
 * is read in reverse order: destination element i is the product of
 * first-source element i and second-source element (count - 1 - i).
 *
 * Register use, as read from the code below:
 *   r0   destination pointer (written, post-incremented)
 *   r1   first source pointer, read forwards
 *   r2   second source pointer; moved to its last 8 floats, read backwards
 *   r3   element count
 *   r12  backward step of -32 bytes
 *
 * 8 elements are handled per pass and the first 8 are loaded before the
 * count is tested, so the count must be a positive multiple of 8.
 * All accesses carry a :128 qualifier: every buffer must be 16-byte
 * aligned.
 *
 * NOTE(review): presumably called from C as (float *dst,
 * const float *src0, const float *src1, int len) - confirm against the
 * C prototype.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2   @ one past the end of the second source
        sub             r2,  r2,  #32           @ back up to its last 8 floats
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        @ Loop: vrev64 swaps the two floats inside each d register; pairing
        @ the d registers in opposite order (d0 with d7, d1 with d6, ...)
        @ completes the reversal of the 8 backward-read floats.
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f                      @ last 8 done: store and return
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
247
248
/*
 * ff_butterflies_float_neon: in-place sum and difference of two float
 * vectors. After the call the buffer at r0 holds (a + b) and the buffer
 * at r1 holds (a - b), where a and b are the old contents of the r0 and
 * r1 buffers respectively.
 *
 * Register use, as read from the code below:
 *   r0  first buffer (read, then overwritten with the sums)
 *   r1  second buffer (read, then overwritten with the differences)
 *   r2  element count
 *
 * 4 elements are handled per pass and the body runs before the test, so
 * at least one group of 4 is always processed. Both buffers must be
 * 16-byte aligned (:128 qualifier).
 */
function ff_butterflies_float_neon, export=1
        @ The loads do not post-increment; the stores below advance both
        @ pointers after writing back to the same addresses.
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2, q0, q1              @ difference is taken first, while q1 is intact
        vadd.f32        q1, q0, q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
259
260
/*
 * ff_scalarproduct_float_neon: dot product of two float vectors.
 *
 * Register use, as read from the code below:
 *   r0  first vector pointer (read, post-incremented)
 *   r1  second vector pointer (read, post-incremented)
 *   r2  element count
 *   q2  four running partial sums
 *
 * The result is left in both lanes of d0; the NOVFP-prefixed line also
 * copies it to r0. NOVFP is a line prefix defined outside this section
 * (presumably by asm.S, selecting the variant that returns floats in a
 * core register - confirm).
 *
 * 4 elements are handled per pass and the body runs before the test, so
 * at least one group of 4 is always read. Both vectors must be 16-byte
 * aligned (:128 qualifier).
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0               @ clear the four partial sums
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ fold four partial sums into two
        vpadd.f32       d0,  d0,  d0            @ fold two into one, replicated in both lanes
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc
272
273