/****************************************************************************
 * quant.S: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2009-2016 x264 project
 *
 * Authors: David Conrad <[email protected]>
 *          Janne Grunau <[email protected]>
 *          Martin Storsjo <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "asm.S"

.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
    add         v18.8h, v18.8h, \bias0
    add         v19.8h, v19.8h, \bias1
    umull       v20.4s, v18.4h, \mf0_1\().4h
    umull2      v21.4s, v18.8h, \mf0_1\().8h
    umull       v22.4s, v19.4h, \mf2_3\().4h
    umull2      v23.4s, v19.8h, \mf2_3\().8h
    sshr        v16.8h, v16.8h, #15
    sshr        v17.8h, v17.8h, #15
    shrn        v18.4h, v20.4s, #16
    shrn2       v18.8h, v21.4s, #16
    shrn        v19.4h, v22.4s, #16
    shrn2       v19.8h, v23.4s, #16
    eor         v18.16b, v18.16b, v16.16b
    eor         v19.16b, v19.16b, v17.16b
    sub         v18.8h, v18.8h, v16.8h
    sub         v19.8h, v19.8h, v17.8h
    orr         \mask, v18.16b, v19.16b
    st1         {v18.8h,v19.8h}, [x0], #32
.endm

.macro QUANT_END d
    fmov        x2, \d
    mov         w0, #0
    tst         x2, x2
    cinc        w0, w0, ne
    ret
.endm
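
/* Rough C model of the quantization implemented by the macros above (a
 * sketch, not the project's reference code; mf/bias follow the function
 * signatures below). Per coefficient:
 *
 *     int level = (abs(dct[i]) + bias[i]) * mf[i] >> 16;
 *     dct[i]    = dct[i] < 0 ? -level : level;
 *     nz       |= level;
 *
 * QUANT_TWO handles 16 coefficients per invocation and accumulates the
 * nonzero mask; QUANT_END collapses that mask into the 0/1 return value.
 */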

// quant_2x2_dc( int16_t dct[4], int mf, int bias )
function x264_quant_2x2_dc_neon, export=1
    ld1         {v0.4h}, [x0]
    dup         v2.4h, w2
    dup         v1.4h, w1
    abs         v3.4h, v0.4h
    add         v3.4h, v3.4h, v2.4h
    umull       v3.4s, v3.4h, v1.4h
    sshr        v0.4h, v0.4h, #15
    shrn        v3.4h, v3.4s, #16
    eor         v3.8b, v3.8b, v0.8b
    sub         v3.4h, v3.4h, v0.4h
    st1         {v3.4h}, [x0]
    QUANT_END   d3
endfunc

// quant_4x4_dc( int16_t dct[16], int mf, int bias )
function x264_quant_4x4_dc_neon, export=1
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    dup         v0.8h, w2
    dup         v2.8h, w1
    QUANT_TWO   v0.8h, v0.8h, v2, v2, v0.16b
    uqxtn       v0.8b, v0.8h
    QUANT_END   d0
endfunc

// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4_neon, export=1
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1         {v0.8h,v1.8h}, [x2]
    ld1         {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h, v1.8h, v2, v3, v0.16b
    uqxtn       v0.8b, v0.8h
    QUANT_END   d0
endfunc

// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4x4_neon, export=1
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1         {v0.8h,v1.8h}, [x2]
    ld1         {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h, v1.8h, v2, v3, v4.16b
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h, v1.8h, v2, v3, v5.16b
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h, v1.8h, v2, v3, v6.16b
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h, v1.8h, v2, v3, v7.16b
    uqxtn       v4.8b, v4.8h
    uqxtn       v7.8b, v7.8h
    uqxtn       v6.8b, v6.8h
    uqxtn       v5.8b, v5.8h
    fmov        x7, d7
    fmov        x6, d6
    fmov        x5, d5
    fmov        x4, d4
    mov         w0, #0
    tst         x7, x7
    cinc        w0, w0, ne
    lsl         w0, w0, #1
    tst         x6, x6
    cinc        w0, w0, ne
    lsl         w0, w0, #1
    tst         x5, x5
    cinc        w0, w0, ne
    lsl         w0, w0, #1
    tst         x4, x4
    cinc        w0, w0, ne
    ret
endfunc

// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
function x264_quant_8x8_neon, export=1
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1         {v0.8h,v1.8h}, [x2], #32
    ld1         {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h, v1.8h, v2, v3, v4.16b
.rept 3
    ld1         {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1         {v0.8h,v1.8h}, [x2], #32
    ld1         {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h, v1.8h, v2, v3, v5.16b
    orr         v4.16b, v4.16b, v5.16b
.endr
    uqxtn       v0.8b, v4.8h
    QUANT_END   d0
endfunc

.macro DEQUANT_START mf_size offset dc=no
    mov         w3, #0x2b
    mul         w3, w3, w2
    lsr         w3, w3, #8          // i_qbits = i_qp / 6
    add         w5, w3, w3, lsl #1
    sub         w2, w2, w5, lsl #1  // i_mf = i_qp % 6
    lsl         w2, w2, #\mf_size
.ifc \dc,no
    add         x1, x1, w2, sxtw    // dequant_mf[i_mf]
.else
    ldr         x1, [x1, w2, sxtw]  // dequant_mf[i_mf][0][0]
.endif
    subs        w3, w3, #\offset    // 6 for 8x8
.endm
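
/* The #0x2b multiply in DEQUANT_START is a reciprocal approximation:
 * 43/256 ~= 1/6, so (i_qp * 43) >> 8 equals i_qp / 6 for the QP values used
 * here (e.g. 43*43 >> 8 = 7, 48*43 >> 8 = 8), and i_qp % 6 follows as
 * i_qp - 6*(i_qp/6) via the add/sub pair. The final subs leaves flags that
 * the callers below test (b.lt) to choose the left- or right-shift path.
 */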

// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
.macro DEQUANT size bits
function x264_dequant_\size\()_neon, export=1
    DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
    mov         w2, #4
.endif
    b.lt        dequant_\size\()_rshift

    dup         v31.8h, w3
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
    subs        w2, w2, #1
.endif
    ld1         {v16.4s}, [x1], #16
    ld1         {v17.4s}, [x1], #16
    sqxtn       v2.4h, v16.4s
    ld1         {v18.4s}, [x1], #16
    sqxtn2      v2.8h, v17.4s
    ld1         {v19.4s}, [x1], #16
    sqxtn       v3.4h, v18.4s
    ld1         {v0.8h,v1.8h}, [x0]
    sqxtn2      v3.8h, v19.4s
    mul         v0.8h, v0.8h, v2.8h
    mul         v1.8h, v1.8h, v3.8h
    sshl        v0.8h, v0.8h, v31.8h
    sshl        v1.8h, v1.8h, v31.8h
    st1         {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_lshift_loop
.endif
    ret

dequant_\size\()_rshift:
    dup         v31.4s, w3
    neg         w3, w3
    mov         w5, #1
    sub         w3, w3, #1
    lsl         w5, w5, w3

.ifc \size, 8x8
dequant_\size\()_rshift_loop:
    subs        w2, w2, #1
.endif
    ld1         {v16.4s}, [x1], #16
    ld1         {v17.4s}, [x1], #16
    sqxtn       v2.4h, v16.4s
    ld1         {v18.4s}, [x1], #16
    dup         v16.4s, w5
    sqxtn2      v2.8h, v17.4s
    ld1         {v19.4s}, [x1], #16
    dup         v17.4s, w5
    sqxtn       v3.4h, v18.4s
    ld1         {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    sqxtn2      v3.8h, v19.4s
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h, v2.4h
    smlal2      v17.4s, v0.8h, v2.8h
    smlal       v18.4s, v1.4h, v3.4h
    smlal2      v19.4s, v1.8h, v3.8h
    sshl        v16.4s, v16.4s, v31.4s
    sshl        v17.4s, v17.4s, v31.4s
    sshl        v18.4s, v18.4s, v31.4s
    sshl        v19.4s, v19.4s, v31.4s

    sqxtn       v0.4h, v16.4s
    sqxtn2      v0.8h, v17.4s
    sqxtn       v1.4h, v18.4s
    sqxtn2      v1.8h, v19.4s
    st1         {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_rshift_loop
.endif
    ret
endfunc
.endm

DEQUANT 4x4, 4
DEQUANT 8x8, 6
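
/* Rough C model of the dequant loops above (a sketch under the usual x264
 * conventions; "mf" stands for the dequant_mf[i_mf] row selected by
 * DEQUANT_START):
 *
 *     int i_qbits = i_qp/6 - 4;                 // -6 for the 8x8 variant
 *     if( i_qbits >= 0 )
 *         dct[i] = (dct[i] * mf[i]) << i_qbits;
 *     else
 *         dct[i] = (dct[i] * mf[i] + (1 << (-i_qbits-1))) >> -i_qbits;
 *
 * The first case is the lshift loop; the second is the rshift loop, where
 * the rounding constant is preloaded into the smlal accumulators and the
 * shift is applied as an sshl by a negative amount.
 */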

// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
function x264_dequant_4x4_dc_neon, export=1
    DEQUANT_START 6, 6, yes
    b.lt        dequant_4x4_dc_rshift

    lsl         w1, w1, w3
    dup         v2.8h, w1
    ld1         {v0.8h,v1.8h}, [x0]

    mul         v0.8h, v0.8h, v2.8h
    mul         v1.8h, v1.8h, v2.8h
    st1         {v0.8h,v1.8h}, [x0]
    ret

dequant_4x4_dc_rshift:
    dup         v4.8h, w1
    dup         v3.4s, w3
    neg         w3, w3
    mov         w5, #1
    sub         w3, w3, #1
    lsl         w5, w5, w3

    dup         v16.4s, w5
    dup         v17.4s, w5
    ld1         {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h, v4.4h
    smlal2      v17.4s, v0.8h, v4.8h
    smlal       v18.4s, v1.4h, v4.4h
    smlal2      v19.4s, v1.8h, v4.8h
    sshl        v16.4s, v16.4s, v3.4s
    sshl        v17.4s, v17.4s, v3.4s
    sshl        v18.4s, v18.4s, v3.4s
    sshl        v19.4s, v19.4s, v3.4s

    sqxtn       v0.4h, v16.4s
    sqxtn2      v0.8h, v17.4s
    sqxtn       v1.4h, v18.4s
    sqxtn2      v1.8h, v19.4s
    st1         {v0.8h,v1.8h}, [x0]
    ret
endfunc

.macro decimate_score_1x size
function x264_decimate_score\size\()_neon, export=1
    ld1         {v0.8h,v1.8h}, [x0]
    movrel      x5, X(x264_decimate_table4)
    movi        v3.16b, #0x01
    sqxtn       v0.8b, v0.8h
    sqxtn2      v0.16b, v1.8h
    abs         v2.16b, v0.16b
    cmeq        v1.16b, v0.16b, #0
    cmhi        v2.16b, v2.16b, v3.16b
    shrn        v1.8b, v1.8h, #4
    shrn        v2.8b, v2.8h, #4
    fmov        x2, d2
    fmov        x1, d1
    cbnz        x2, 9f
    mvn         x1, x1
    mov         w0, #0
    cbz         x1, 0f
.ifc \size, 15
    lsr         x1, x1, #1
.endif
    rbit        x1, x1
1:
    clz         x3, x1
    lsr         x6, x3, #2
    lsl         x1, x1, x3
    ldrb        w7, [x5, x6]
    lsl         x1, x1, #4
    add         w0, w0, w7
    cbnz        x1, 1b
    ret
9:
    mov         w0, #9
0:
    ret
endfunc
.endm

decimate_score_1x 15
decimate_score_1x 16
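
/* Sketch of the decimate score's behaviour (hedged; see the scalar
 * reference in x264 for the exact rules): if any |coef| > 1 the score is 9,
 * otherwise each nonzero coefficient adds a small table cost indexed by the
 * run of zeros next to it, x264_decimate_table4 for the 15/16-coefficient
 * sizes and x264_decimate_table8 for the 64-coefficient version below. The
 * NEON code packs a 4-bit-per-coefficient zero mask into a GPR, takes the
 * early exit via the cmhi test for |coef| > 1, and then walks the mask with
 * clz, one table lookup per nonzero coefficient.
 */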

const mask64, align=6
    .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
    .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
endconst

function x264_decimate_score64_neon, export=1
    ld1         {v0.8h,v1.8h}, [x0], #32
    ld1         {v2.8h,v3.8h}, [x0], #32
    ld1         {v4.8h,v5.8h}, [x0], #32
    ld1         {v6.8h,v7.8h}, [x0]
    movrel      x6, mask64
    movi        v31.16b, #0x01
    sqxtn       v16.8b, v1.8h
    sqxtn2      v16.16b, v0.8h
    sqxtn       v17.8b, v3.8h
    sqxtn2      v17.16b, v2.8h
    sqxtn       v18.8b, v5.8h
    sqxtn2      v18.16b, v4.8h
    sqxtn       v19.8b, v7.8h
    sqxtn2      v19.16b, v6.8h
    abs         v4.16b, v16.16b
    abs         v5.16b, v17.16b
    abs         v6.16b, v18.16b
    abs         v7.16b, v19.16b
    ld1         {v30.16b}, [x6]
    cmeq        v0.16b, v16.16b, #0
    cmeq        v1.16b, v17.16b, #0
    cmeq        v2.16b, v18.16b, #0
    cmeq        v3.16b, v19.16b, #0
    umax        v4.16b, v4.16b, v5.16b
    umax        v6.16b, v6.16b, v7.16b
    and         v0.16b, v0.16b, v30.16b
    and         v1.16b, v1.16b, v30.16b
    and         v2.16b, v2.16b, v30.16b
    and         v3.16b, v3.16b, v30.16b
    umax        v4.16b, v4.16b, v6.16b
    addp        v0.16b, v1.16b, v0.16b
    addp        v2.16b, v3.16b, v2.16b
    cmhi        v4.16b, v4.16b, v31.16b
    addp        v0.16b, v2.16b, v0.16b
    shrn        v4.8b, v4.8h, #4
    addp        v0.16b, v0.16b, v0.16b
    fmov        x2, d4
    fmov        x1, d0
    cbnz        x2, 9f
    mvn         x1, x1
    mov         w0, #0
    cbz         x1, 0f
    movrel      x5, X(x264_decimate_table8)
1:
    clz         x3, x1
    lsl         x1, x1, x3
    ldrb        w7, [x5, x3]
    lsl         x1, x1, #1
    add         w0, w0, w7
    cbnz        x1, 1b
    ret
9:
    mov         w0, #9
0:
    ret
endfunc

// int coeff_last( int16_t *l )
function x264_coeff_last4_aarch64, export=1
    ldr         x2, [x0]
    mov         w4, #3
    clz         x0, x2
    sub         w0, w4, w0, lsr #4
    ret
endfunc

function x264_coeff_last8_aarch64, export=1
    ldr         x3, [x0, #8]
    mov         w4, #7
    clz         x2, x3
    cmp         w2, #64
    b.ne        1f
    ldr         x3, [x0]
    sub         w4, w4, #4
    clz         x2, x3
1:
    sub         w0, w4, w2, lsr #4
    ret
endfunc

.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon, export=1
.if \size == 15
    sub         x0, x0, #2
.endif
    ld1         {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b, v0.8h
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b
    shrn        v0.8b, v0.8h, #4
    fmov        x1, d0
    mov         w3, #\size - 1
    clz         x2, x1
    sub         w0, w3, w2, lsr #2
    ret
endfunc
.endm

COEFF_LAST_1x 15
COEFF_LAST_1x 16

function x264_coeff_last64_neon, export=1
    ld1         {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
    movi        v31.8h, #8
    movi        v30.8h, #1
    uqxtn       v0.8b, v0.8h
    uqxtn2      v0.16b, v1.8h
    ld1         {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
    uqxtn       v1.8b, v2.8h
    uqxtn2      v1.16b, v3.8h
    uqxtn       v2.8b, v4.8h
    uqxtn2      v2.16b, v5.8h
    uqxtn       v3.8b, v6.8h
    uqxtn2      v3.16b, v7.8h

    cmtst       v0.16b, v0.16b, v0.16b
    cmtst       v1.16b, v1.16b, v1.16b
    cmtst       v2.16b, v2.16b, v2.16b
    cmtst       v3.16b, v3.16b, v3.16b

    shrn        v0.8b, v0.8h, #4
    shrn2       v0.16b, v1.8h, #4
    shrn        v1.8b, v2.8h, #4
    shrn2       v1.16b, v3.8h, #4

    clz         v0.4s, v0.4s
    clz         v1.4s, v1.4s

    shrn        v0.4h, v0.4s, #2
    shrn2       v0.8h, v1.4s, #2

    sub         v0.8h, v31.8h, v0.8h
    sshl        v0.8h, v30.8h, v0.8h
    shrn        v0.8b, v0.8h, #1

    fmov        x2, d0
    mov         w3, #63
    clz         x2, x2
    sub         w0, w3, w2
    ret
endfunc
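
/* All coeff_last variants compute, in effect (sketch only; the element
 * count is implied by the function name rather than passed):
 *
 *     int i = n - 1;
 *     while( i >= 0 && l[i] == 0 )
 *         i--;
 *     return i;
 *
 * The vector versions reduce each coefficient to a nonzero flag, move the
 * packed flags into a general register and use clz to locate the highest
 * set position without a loop.
 */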

.macro coeff_level_run_start size
    add         x6, x1, #23         // runlevel->mask
    mov         w7, #0
    mov         w8, #0
    mov         w9, #1
    and         x6, x6, #~15
    mov         w4, #\size - 1
.endm

.macro coeff_level_run shift
    clz         x3, x2
    subs        w4, w4, w3, lsr #\shift
    str         w4, [x1], #4
1:
    ldrh        w5, [x0, x4, lsl #1]
    strh        w5, [x6], #2
    add         w7, w7, #1
    lsl         w10, w9, w4
    orr         w8, w8, w10
    b.le        2f
    add         w3, w3, #1 << \shift
    sub         w4, w4, #1
    and         x3, x3, #~((1 << \shift) - 1)
    lsl         x2, x2, x3
    clz         x3, x2
    subs        w4, w4, w3, lsr #\shift
    b.ge        1b
2:
    str         w8, [x1]
    mov         w0, w7
.endm

function x264_coeff_level_run4_aarch64, export=1
    ldr         x2, [x0]

    coeff_level_run_start 4

    coeff_level_run 4

    ret
endfunc

.macro X264_COEFF_LEVEL_RUN size
function x264_coeff_level_run\size\()_neon, export=1
.if \size == 15
    sub         x0, x0, #2
.endif
.if \size < 15
    ld1         {v0.8h}, [x0]
    uqxtn       v0.8b, v0.8h
    cmtst       v0.8b, v0.8b, v0.8b
.else
    ld1         {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b, v0.8h
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b
    shrn        v0.8b, v0.8h, #4
.endif
    fmov        x2, d0
.if \size == 15
    add         x0, x0, #2
.endif

    coeff_level_run_start \size

    coeff_level_run (4 - (\size + 1) / 8)

    ret
endfunc
.endm

X264_COEFF_LEVEL_RUN 8
X264_COEFF_LEVEL_RUN 15
X264_COEFF_LEVEL_RUN 16
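
/* What coeff_level_run produces, roughly (the runlevel field offsets are
 * inferred from the code above, not a definitive struct definition): it
 * stores the index of the last nonzero coefficient, copies each nonzero
 * level in descending scan order into the 16-byte-aligned level array
 * inside the runlevel buffer, writes a bitmask of the nonzero positions,
 * and returns the number of nonzero coefficients. The shift argument is
 * log2 of how many bits each coefficient occupies in the x2 mask register
 * (16, 8 or 4 bits for the 4-, 8- and 15/16-coefficient variants).
 */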

function x264_denoise_dct_neon, export=1
1:  subs        w3, w3, #16
    ld1         {v0.8h,v1.8h}, [x0]
    ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
    abs         v16.8h, v0.8h
    abs         v17.8h, v1.8h
    ld1         {v2.8h,v3.8h}, [x2], #32
    cmlt        v18.8h, v0.8h, #0
    cmlt        v19.8h, v1.8h, #0
    uaddw       v4.4s, v4.4s, v16.4h
    uaddw2      v5.4s, v5.4s, v16.8h
    uqsub       v20.8h, v16.8h, v2.8h
    uqsub       v21.8h, v17.8h, v3.8h
    uaddw       v6.4s, v6.4s, v17.4h
    uaddw2      v7.4s, v7.4s, v17.8h
    neg         v22.8h, v20.8h
    neg         v23.8h, v21.8h
    bsl         v18.16b, v22.16b, v20.16b
    bsl         v19.16b, v23.16b, v21.16b
    st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
    st1         {v18.8h,v19.8h}, [x0], #32
    b.gt        1b
    ret
endfunc
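
/* Rough C model of denoise_dct (parameter names are assumptions; the loop
 * above handles 16 coefficients per iteration):
 *
 *     for( int i = 0; i < size; i++ )
 *     {
 *         int level = abs(dct[i]);
 *         sum[i]   += level;
 *         level     = level > offset[i] ? level - offset[i] : 0;  // uqsub
 *         dct[i]    = dct[i] < 0 ? -level : level;                // cmlt+bsl
 *     }
 */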