/*
 * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

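/*
 * NEON-accelerated YUV -> packed RGB conversion. The macros below are
 * instantiated at the bottom of the file into functions named
 * ff_<ifmt>_to_<ofmt>_neon_<precision>, for ifmt in {nv12, nv21, yuv420p,
 * yuv422p}, ofmt in {argb, rgba, abgr, bgra} and an internal working
 * precision of 16 or 32 bits.
 */

/*
 * compute_premult_16: premultiply one 16-pixel-wide block of chroma by the
 * conversion coefficients at 16-bit precision.
 *
 * Expected inputs (set up by the callers below): q14 = U - 128 and
 * q15 = V - 128 as signed 16-bit samples, and d1 = the four s16 coefficients
 * loaded from the caller-supplied table, used as d1[0] = v2r, d1[1] = u2g,
 * d1[2] = v2g, d1[3] = u2b. Each chroma sample is doubled (vzip with itself)
 * to match the 2:1 horizontal chroma subsampling of the supported input
 * formats. Outputs: q8/q9 = V * v2r (red), q10/q11 = U * u2g + V * v2g
 * (green), q12/q13 = U * u2b (blue), as left/right halves of the block.
 */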
.macro compute_premult_16 half_u1, half_u2, half_v1, half_v2
    vmov            d2, \half_u1                @ copy left q14 to left q1
    vmov            d3, \half_u1                @ copy left q14 to right q1
    vmov            d4, \half_u2                @ copy right q14 to left q2
    vmov            d5, \half_u2                @ copy right q14 to right q2

    vmov            d6, \half_v1                @ copy left q15 to left q3
    vmov            d7, \half_v1                @ copy left q15 to right q3
    vmov            d8, \half_v2                @ copy right q15 to left q4
    vmov            d9, \half_v2                @ copy right q15 to right q4

    vzip.16         d2, d3                      @ U1U1U2U2U3U3U4U4
    vzip.16         d4, d5                      @ U5U5U6U6U7U7U8U8

    vzip.16         d6, d7                      @ V1V1V2V2V3V3V4V4
    vzip.16         d8, d9                      @ V5V5V6V6V7V7V8V8

    vmul.s16        q8,  q3, d1[0]              @ V * v2r             (left,  red)
    vmul.s16        q9,  q4, d1[0]              @ V * v2r             (right, red)
    vmul.s16        q10, q1, d1[1]              @ U * u2g
    vmul.s16        q11, q2, d1[1]              @ U * u2g
    vmla.s16        q10, q3, d1[2]              @ U * u2g + V * v2g   (left,  green)
    vmla.s16        q11, q4, d1[2]              @ U * u2g + V * v2g   (right, green)
    vmul.s16        q12, q1, d1[3]              @ U * u2b             (left,  blue)
    vmul.s16        q13, q2, d1[3]              @ U * u2b             (right, blue)
.endm

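/*
 * compute_premult_32: same premultiplication as compute_premult_16, but for
 * half a block (8 chroma samples in \half_u / \half_v) and with widening
 * multiplies (vmull/vmlal), so the per-component products stay in 32 bits
 * in q8-q13 for the higher-precision path.
 */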
.macro compute_premult_32 half_u half_v
    vmov            d2, \half_u                 @ copy left q14 to left q1
    vmov            d3, \half_u                 @ copy left q14 to right q1
    vmov            d4, \half_v                 @ copy left q15 to left q2
    vmov            d5, \half_v                 @ copy left q15 to right q2

    vzip.16         d2, d3                      @ U1U1U2U2U3U3U4U4
    vzip.16         d4, d5                      @ V1V1V2V2V3V3V4V4

    vmull.s16       q8,  d4, d1[0]              @ V * v2r             (left,  red)
    vmull.s16       q9,  d5, d1[0]              @ V * v2r             (right, red)
    vmull.s16       q10, d2, d1[1]              @ U * u2g
    vmull.s16       q11, d3, d1[1]              @ U * u2g
    vmlal.s16       q10, d4, d1[2]              @ U * u2g + V * v2g   (left,  green)
    vmlal.s16       q11, d5, d1[2]              @ U * u2g + V * v2g   (right, green)
    vmull.s16       q12, d2, d1[3]              @ U * u2b             (left,  blue)
    vmull.s16       q13, d3, d1[3]              @ U * u2b             (right, blue)
.endm

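/*
 * compute_color_16: add the scaled luma term (q14/q15, prepared by
 * compute_16px_16) to one premultiplied chroma contribution (\pre1/\pre2)
 * and narrow to unsigned 8 bits with a rounding shift, i.e.
 * clip(((Y - y_offset) * y_coeff + chroma_term + 32) >> 6, 0, 255).
 */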
.macro compute_color_16 dst_comp1 dst_comp2 pre1 pre2
    vadd.s16        q1, q14, \pre1
    vadd.s16        q2, q15, \pre2
    vqrshrun.s16    \dst_comp1, q1, #6
    vqrshrun.s16    \dst_comp2, q2, #6
.endm

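/*
 * compute_color_32: 32-bit-precision counterpart of compute_color_16.
 * \pre1/\pre2 hold one widened chroma contribution and q1/q2 the widened
 * luma term from compute_8px_32; the sum is narrowed with a rounding shift
 * by 13 and then saturated from 16 down to 8 bits.
 */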
.macro compute_color_32 dst_comp pre1 pre2
    vadd.s32        q3, q1, \pre1
    vadd.s32        q4, q2, \pre2
    vqrshrun.s32    d10, q3, #13
    vqrshrun.s32    d11, q4, #13                @ q5 = ({q3,q4} + (1<<12)) >> 13
    vqmovn.u16      \dst_comp, q5               @ saturate 16bit -> 8bit
.endm

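/*
 * compute_rgba_16: build the four byte planes for a 16-pixel block. Red,
 * green and blue come from the premultiplied terms in q8-q13; alpha is
 * forced to 255. The caller chooses the destination d registers so that the
 * vst4.8 stores in compute_16px_16 emit the requested component order.
 */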
.macro compute_rgba_16 r1 r2 g1 g2 b1 b2 a1 a2
    compute_color_16    \r1, \r2, q8,  q9
    compute_color_16    \g1, \g2, q10, q11
    compute_color_16    \b1, \b2, q12, q13
    vmov.u8             \a1, #255
    vmov.u8             \a2, #255
.endm

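/*
 * compute_rgba_32: as compute_rgba_16, but for 8 pixels on the 32-bit path;
 * the caller's d12-d15 ordering determines the packed layout written by the
 * vst4.8 in compute_8px_32.
 */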
.macro compute_rgba_32 r g b a
    compute_color_32    \r, q8,  q9
    compute_color_32    \g, q10, q11
    compute_color_32    \b, q12, q13
    vmov.u8             \a, #255
.endm

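/*
 * compute_16px_16: convert 16 luma pixels (\y0/\y1) using the chroma terms
 * already in q8-q13. The luma bytes are widened, offset by y_offset (r9) and
 * scaled by y_coeff (d0), combined per component, and written as packed
 * 4-byte pixels by two interleaving vst4.8 stores. The .ifc blocks only
 * permute which d registers receive R, G, B and A for each output format.
 */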
.macro compute_16px_16 dst y0 y1 ofmt
    vmovl.u8        q14, \y0                    @ 8px of y
    vmovl.u8        q15, \y1                    @ 8px of y

    vdup.16         q5, r9                      @ q5  = y_offset
    vmov            d14, d0                     @ q7  = y_coeff
    vmov            d15, d0                     @ q7  = y_coeff

    vsub.s16        q14, q5
    vsub.s16        q15, q5

    vmul.s16        q14, q7                     @ q14 = (srcY - y_offset) * y_coeff (left)
    vmul.s16        q15, q7                     @ q15 = (srcY - y_offset) * y_coeff (right)

.ifc \ofmt,argb
    compute_rgba_16 d7, d11, d8, d12, d9, d13, d6, d10
.endif

.ifc \ofmt,rgba
    compute_rgba_16 d6, d10, d7, d11, d8, d12, d9, d13
.endif

.ifc \ofmt,abgr
    compute_rgba_16 d9, d13, d8, d12, d7, d11, d6, d10
.endif

.ifc \ofmt,bgra
    compute_rgba_16 d8, d12, d7, d11, d6, d10, d9, d13
.endif

    vst4.8          {q3, q4}, [\dst,:128]!
    vst4.8          {q5, q6}, [\dst,:128]!

.endm

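/*
 * compute_8px_32: 32-bit-precision counterpart of compute_16px_16, handling
 * 8 luma pixels (\half_y) at a time. The widened (Y - y_offset) * y_coeff
 * terms are kept in q1/q2 and combined with the q8-q13 chroma terms by
 * compute_rgba_32 before a single interleaving vst4.8 store.
 */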
.macro compute_8px_32 dst half_y ofmt
    vmovl.u8        q7, \half_y                 @ 8px of Y
    vdup.16         q5, r9                      @ q5 = y_offset
    vsub.s16        q7, q5                      @ q7 = srcY - y_offset
    vmull.s16       q1, d14, d0                 @ q1 = (srcY - y_offset) * y_coeff (left)
    vmull.s16       q2, d15, d0                 @ q2 = (srcY - y_offset) * y_coeff (right)

.ifc \ofmt,argb
    compute_rgba_32 d13, d14, d15, d12
.endif

.ifc \ofmt,rgba
    compute_rgba_32 d12, d13, d14, d15
.endif

.ifc \ofmt,abgr
    compute_rgba_32 d15, d14, d13, d12
.endif

.ifc \ofmt,bgra
    compute_rgba_32 d14, d13, d12, d15
.endif

    vst4.8          {q6, q7}, [\dst,:128]!
.endm

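/*
 * process_*_16px_* glue macros: convert one horizontal block of 16 pixels.
 * The "2l" variants consume one chroma line but two luma lines (r4 and r12)
 * and write two destination lines (r2 and r11), matching the vertically
 * subsampled 4:2:0 inputs; the "1l" variants handle a single line for
 * 4:2:2 input. The _16 versions work entirely at 16-bit precision, while the
 * _32 versions split the block into two 8-pixel halves and take the widening
 * 32-bit path.
 */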
.macro process_1l_16px_16 ofmt
    compute_premult_16  d28, d29, d30, d31
    vld1.8              {q7}, [r4]!
    compute_16px_16     r2, d14, d15, \ofmt
.endm

.macro process_1l_16px_32 ofmt
    compute_premult_32  d28, d30
    vld1.8              {q7}, [r4]!
    vmov                d28, d15                @ save right of the line of luma for later use
    compute_8px_32      r2, d14, \ofmt

    compute_premult_32  d29, d31
    compute_8px_32      r2, d28, \ofmt
.endm

.macro process_2l_16px_16 ofmt
    compute_premult_16  d28, d29, d30, d31

    vld1.8              {q7}, [r4]!             @ first line of luma
    compute_16px_16     r2, d14, d15, \ofmt

    vld1.8              {q7}, [r12]!            @ second line of luma
    compute_16px_16     r11, d14, d15, \ofmt
.endm

.macro process_2l_16px_32 ofmt
    compute_premult_32  d28, d30

    vld1.8              {q7}, [r4]!             @ first line of luma
    vmov                d28, d15                @ save right of the first line of luma for later use
    compute_8px_32      r2, d14, \ofmt

    vld1.8              {q7}, [r12]!            @ second line of luma
    vmov                d30, d15                @ save right of the second line of luma for later use
    compute_8px_32      r11, d14, \ofmt

    compute_premult_32  d29, d31
    compute_8px_32      r2, d28, \ofmt
    compute_8px_32      r11, d30, \ofmt
.endm

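/*
 * load_args_*: prologue helpers. After push {r4-r12, lr} (40 bytes) and
 * vpush {q4-q7} (64 bytes), the first stacked argument of the C caller is at
 * [sp, #104]. The register arguments are assumed to be r0 = width,
 * r1 = height, r2 = dst and r3 = dst linesize, which is how the conversion
 * loop below uses them; the stacked arguments (source pointers and
 * linesizes, the coefficient table, y_offset and y_coeff) are named in the
 * comment on each load. The macros also precompute the "padding" values that
 * step each pointer from the end of one row to the start of the next.
 */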
.macro load_args_nvx
    push            {r4-r12, lr}
    vpush           {q4-q7}
    ldr             r4, [sp, #104]              @ r4  = srcY
    ldr             r5, [sp, #108]              @ r5  = linesizeY
    ldr             r6, [sp, #112]              @ r6  = srcC
    ldr             r7, [sp, #116]              @ r7  = linesizeC
    ldr             r8, [sp, #120]              @ r8  = table
    ldr             r9, [sp, #124]              @ r9  = y_offset
    ldr             r10,[sp, #128]              @ r10 = y_coeff
    vdup.16         d0, r10                     @ d0  = y_coeff
    vld1.16         {d1}, [r8]                  @ d1  = *table
    add             r11, r2, r3                 @ r11 = dst + linesize (dst2)
    add             r12, r4, r5                 @ r12 = srcY + linesizeY (srcY2)
    lsl             r3, r3, #1
    lsl             r5, r5, #1
    lsl             r8, r0, #2                  @ r8  = width * 4
    sub             r3, r3, r8                  @ r3  = linesize  * 2 - width * 4 (padding)
    sub             r5, r5, r0                  @ r5  = linesizeY * 2 - width     (paddingY)
    sub             r7, r7, r0                  @ r7  = linesizeC     - width     (paddingC)
.endm

.macro load_args_yuv420p
    push            {r4-r12, lr}
    vpush           {q4-q7}
    ldr             r4, [sp, #104]              @ r4  = srcY
    ldr             r5, [sp, #108]              @ r5  = linesizeY
    ldr             r6, [sp, #112]              @ r6  = srcU
    ldr             r8, [sp, #128]              @ r8  = table
    ldr             r9, [sp, #132]              @ r9  = y_offset
    ldr             r10,[sp, #136]              @ r10 = y_coeff
    vdup.16         d0, r10                     @ d0  = y_coeff
    vld1.16         {d1}, [r8]                  @ d1  = *table
    add             r11, r2, r3                 @ r11 = dst + linesize (dst2)
    add             r12, r4, r5                 @ r12 = srcY + linesizeY (srcY2)
    lsl             r3, r3, #1
    lsl             r5, r5, #1
    lsl             r8, r0, #2                  @ r8  = width * 4
    sub             r3, r3, r8                  @ r3  = linesize  * 2 - width * 4 (padding)
    sub             r5, r5, r0                  @ r5  = linesizeY * 2 - width     (paddingY)
    ldr             r10,[sp, #120]              @ r10 = srcV
.endm

.macro load_args_yuv422p
    push            {r4-r12, lr}
    vpush           {q4-q7}
    ldr             r4, [sp, #104]              @ r4  = srcY
    ldr             r5, [sp, #108]              @ r5  = linesizeY
    ldr             r6, [sp, #112]              @ r6  = srcU
    ldr             r7, [sp, #116]              @ r7  = linesizeU
    ldr             r12,[sp, #124]              @ r12 = linesizeV
    ldr             r8, [sp, #128]              @ r8  = table
    ldr             r9, [sp, #132]              @ r9  = y_offset
    ldr             r10,[sp, #136]              @ r10 = y_coeff
    vdup.16         d0, r10                     @ d0  = y_coeff
    vld1.16         {d1}, [r8]                  @ d1  = *table
    add             r11, r2, r3                 @ r11 = dst + linesize (dst2)
    lsl             r8, r0, #2                  @ r8  = width * 4
    sub             r3, r3, r8                  @ r3  = linesize  - width * 4 (padding, one line per pass)
    sub             r5, r5, r0                  @ r5  = linesizeY - width     (paddingY)
    sub             r7, r7, r0, lsr #1          @ r7  = linesizeU - width / 2 (paddingU)
    sub             r12,r12,r0, lsr #1          @ r12 = linesizeV - width / 2 (paddingV)
    ldr             r10,[sp, #120]              @ r10 = srcV
.endm

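/*
 * declare_func: emit one ff_\ifmt\()_to_\ofmt\()_neon_\precision function.
 * Label 1: starts a new pair of output rows (a single row for yuv422p);
 * label 2: is the inner loop over 16-pixel blocks. Each inner iteration
 * loads and centres the chroma (U - 128 / V - 128 into q14/q15) and runs the
 * matching process_* macro; the outer loop then steps every pointer to the
 * next row(s) using the padding values (recomputed inline for the yuv420p
 * chroma planes).
 */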
.macro declare_func ifmt ofmt precision
function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1

.ifc \ifmt,nv12
    load_args_nvx
.endif

.ifc \ifmt,nv21
    load_args_nvx
.endif

.ifc \ifmt,yuv420p
    load_args_yuv420p
.endif

.ifc \ifmt,yuv422p
    load_args_yuv422p
.endif

1:
    mov             r8, r0                      @ r8 = width
2:
    pld [r6, #64*3]
    pld [r4, #64*3]

    vmov.i8         d10, #128

.ifc \ifmt,nv12
    pld [r12, #64*3]

    vld2.8          {d2, d3}, [r6]!             @ q1: interleaved chroma line
    vsubl.u8        q14, d2, d10                @ q14 = U - 128
    vsubl.u8        q15, d3, d10                @ q15 = V - 128

    process_2l_16px_\precision \ofmt
.endif

.ifc \ifmt,nv21
    pld [r12, #64*3]

    vld2.8          {d2, d3}, [r6]!             @ q1: interleaved chroma line
    vsubl.u8        q14, d3, d10                @ q14 = U - 128
    vsubl.u8        q15, d2, d10                @ q15 = V - 128

    process_2l_16px_\precision \ofmt
.endif

.ifc \ifmt,yuv420p
    pld [r10, #64*3]
    pld [r12, #64*3]

    vld1.8          d2, [r6]!                   @ d2: chroma U (Cb) line
    vld1.8          d3, [r10]!                  @ d3: chroma V (Cr) line
    vsubl.u8        q14, d2, d10                @ q14 = U - 128
    vsubl.u8        q15, d3, d10                @ q15 = V - 128

    process_2l_16px_\precision \ofmt
.endif

.ifc \ifmt,yuv422p
    pld [r10, #64*3]

    vld1.8          d2, [r6]!                   @ d2: chroma U (Cb) line
    vld1.8          d3, [r10]!                  @ d3: chroma V (Cr) line
    vsubl.u8        q14, d2, d10                @ q14 = U - 128
    vsubl.u8        q15, d3, d10                @ q15 = V - 128

    process_1l_16px_\precision \ofmt
.endif

    subs            r8, r8, #16                 @ width -= 16
    bgt             2b

    add             r2, r2, r3                  @ dst  += padding
    add             r4, r4, r5                  @ srcY += paddingY

.ifc \ifmt,nv12
    add             r11, r11, r3                @ dst2  += padding
    add             r12, r12, r5                @ srcY2 += paddingY

    add             r6, r6, r7                  @ srcC  += paddingC

    subs            r1, r1, #2                  @ height -= 2
.endif

.ifc \ifmt,nv21
    add             r11, r11, r3                @ dst2  += padding
    add             r12, r12, r5                @ srcY2 += paddingY

    add             r6, r6, r7                  @ srcC  += paddingC
    subs            r1, r1, #2                  @ height -= 2
.endif

.ifc \ifmt,yuv420p
    add             r11, r11, r3                @ dst2  += padding
    add             r12, r12, r5                @ srcY2 += paddingY

    ldr             r7, [sp, #116]              @ r7 = linesizeU
    sub             r7, r7, r0, lsr #1          @ r7 = linesizeU - width / 2 (paddingU)
    add             r6, r6, r7                  @ srcU += paddingU

    ldr             r7, [sp, #124]              @ r7 = linesizeV
    sub             r7, r7, r0, lsr #1          @ r7 = linesizeV - width / 2 (paddingV)
    add             r10, r10, r7                @ srcV += paddingV

    subs            r1, r1, #2                  @ height -= 2
.endif

.ifc \ifmt,yuv422p
    add             r6, r6, r7                  @ srcU += paddingU
    add             r10,r10,r12                 @ srcV += paddingV

    subs            r1, r1, #1                  @ height -= 1
.endif

    bgt             1b

    vpop            {q4-q7}
    pop             {r4-r12, lr}
    mov             pc, lr
endfunc
.endm

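/*
 * declare_rgb_funcs: instantiate the converter above for the four supported
 * packed output orders, for one input format and working precision.
 */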
.macro declare_rgb_funcs ifmt precision
    declare_func \ifmt, argb, \precision
    declare_func \ifmt, rgba, \precision
    declare_func \ifmt, abgr, \precision
    declare_func \ifmt, bgra, \precision
.endm

declare_rgb_funcs nv12, 16
declare_rgb_funcs nv21, 16
declare_rgb_funcs nv12, 32
declare_rgb_funcs nv21, 32
declare_rgb_funcs yuv420p, 16
declare_rgb_funcs yuv420p, 32
declare_rgb_funcs yuv422p, 16
declare_rgb_funcs yuv422p, 32