// CoCalc -- quant-a.S (05. Matplotlib / ffmpeg-3.0 / libx264 / common / aarch64 / quant-a.S)
/****************************************************************************
 * quant.S: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2009-2016 x264 project
 *
 * Authors: David Conrad <[email protected]>
 *          Janne Grunau <[email protected]>
 *          Martin Storsjo <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "asm.S"

// QUANT_TWO bias0, bias1, mf0_1, mf2_3, mask
//
// Quantizes 16 coefficients held in two vector registers and stores them.
// Per 16-bit lane:  out = sign(coef) * (((|coef| + bias) * mf) >> 16)
//
// Inputs:
//   v16, v17        original signed coefficients (only their sign is used)
//   v18, v19        absolute values of v16 / v17
//   bias0, bias1    .8h operands added to v18 / v19
//   mf0_1, mf2_3    bare register names (no arrangement suffix) holding the
//                   unsigned 16-bit multipliers for v18 / v19
//   x0              destination pointer
// Outputs:
//   mask            v18 | v19 after quantization (nonzero iff any result != 0)
//   [x0]            32 bytes of results; x0 is post-incremented by 32
// Clobbers: v16-v23
.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
    add         v18.8h, v18.8h, \bias0
    add         v19.8h, v19.8h, \bias1
    umull       v20.4s, v18.4h, \mf0_1\().4h
    umull2      v21.4s, v18.8h, \mf0_1\().8h
    umull       v22.4s, v19.4h, \mf2_3\().4h
    umull2      v23.4s, v19.8h, \mf2_3\().8h
    sshr        v16.8h, v16.8h, #15     // sign mask: 0 or -1 per lane
    sshr        v17.8h, v17.8h, #15
    shrn        v18.4h, v20.4s, #16     // keep the high half of each product
    shrn2       v18.8h, v21.4s, #16
    shrn        v19.4h, v22.4s, #16
    shrn2       v19.8h, v23.4s, #16
    eor         v18.16b, v18.16b, v16.16b // (x ^ s) - s negates where s == -1
    eor         v19.16b, v19.16b, v17.16b
    sub         v18.8h, v18.8h, v16.8h
    sub         v19.8h, v19.8h, v17.8h
    orr         \mask,  v18.16b, v19.16b
    st1        {v18.8h,v19.8h}, [x0], #32
.endm

// QUANT_END d
//
// Function epilogue: returns w0 = 1 if the 64-bit SIMD register given as the
// argument is nonzero, else 0.
// Clobbers: x2, flags.
.macro QUANT_END d
    fmov        x2,  \d
    tst         x2,  x2
    cset        w0,  ne
    ret
.endm

// quant_2x2_dc( int16_t dct[4], int mf, int bias )
//
// In:   x0 = dct, w1 = mf, w2 = bias
// Out:  w0 = 1 if any quantized coefficient is nonzero, else 0
// dct[] is quantized in place:
//   dct[i] = sign(dct[i]) * (((|dct[i]| + bias) * mf) >> 16)
function x264_quant_2x2_dc_neon, export=1
    ld1        {v0.4h}, [x0]
    dup         v2.4h,  w2
    dup         v1.4h,  w1
    abs         v3.4h,  v0.4h
    add         v3.4h,  v3.4h,  v2.4h
    umull       v3.4s,  v3.4h,  v1.4h
    sshr        v0.4h,  v0.4h,  #15     // sign mask: 0 or -1 per lane
    shrn        v3.4h,  v3.4s,  #16
    eor         v3.8b,  v3.8b,  v0.8b   // (x ^ s) - s restores the sign
    sub         v3.4h,  v3.4h,  v0.4h
    st1        {v3.4h}, [x0]
    QUANT_END   d3
endfunc

// quant_4x4_dc( int16_t dct[16], int mf, int bias )
//
// In:   x0 = dct, w1 = mf, w2 = bias (one scalar mf/bias for all 16 lanes)
// Out:  w0 = 1 if any quantized coefficient is nonzero, else 0
// dct[] is quantized in place by QUANT_TWO.
function x264_quant_4x4_dc_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    dup         v0.8h,  w2
    dup         v2.8h,  w1
    QUANT_TWO   v0.8h,  v0.8h,  v2,  v2,  v0.16b
    // Saturating narrow keeps every nonzero halfword nonzero, so the
    // 128-bit mask fits the 64-bit test in QUANT_END.
    uqxtn       v0.8b,  v0.8h
    QUANT_END   d0
endfunc

// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
//
// In:   x0 = dct, x1 = mf, x2 = bias (per-coefficient tables)
// Out:  w0 = 1 if any quantized coefficient is nonzero, else 0
// dct[] is quantized in place by QUANT_TWO.
function x264_quant_4x4_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    ld1        {v0.8h,v1.8h}, [x2]
    ld1        {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v0.16b
    // Saturating narrow keeps every nonzero halfword nonzero, so the
    // 128-bit mask fits the 64-bit test in QUANT_END.
    uqxtn       v0.8b,  v0.8h
    QUANT_END   d0
endfunc

// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
//
// In:   x0 = dct (four consecutive 16-coefficient blocks), x1 = mf, x2 = bias
// Out:  w0 = 4-bit mask; bit n is set if block n has a nonzero coefficient
// The same mf/bias tables are applied to all four blocks. x0 advances by
// 32 bytes per block through the post-incrementing store in QUANT_TWO.
function x264_quant_4x4x4_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2]
    ld1        {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b   // block 0 -> v4
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b   // block 1 -> v5
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v6.16b   // block 2 -> v6
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v7.16b   // block 3 -> v7
    // Compress each 128-bit mask to 64 bits; saturation preserves nonzero.
    uqxtn       v4.8b,  v4.8h
    uqxtn       v7.8b,  v7.8h
    uqxtn       v6.8b,  v6.8h
    uqxtn       v5.8b,  v5.8h
    fmov        x7,  d7
    fmov        x6,  d6
    fmov        x5,  d5
    fmov        x4,  d4
    // Assemble the result from the highest block down, shifting as we go.
    mov         w0,  #0
    tst         x7,  x7
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x6,  x6
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x5,  x5
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x4,  x4
    cinc        w0,  w0,  ne
    ret
endfunc

// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
//
// In:   x0 = dct, x1 = mf, x2 = bias
// Out:  w0 = 1 if any quantized coefficient is nonzero, else 0
// Processes 64 coefficients as four groups of 16. x1 and x2 advance by
// 32 bytes per group; x0 advances through the store in QUANT_TWO.
function x264_quant_8x8_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
.rept 3
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
    orr         v4.16b, v4.16b, v5.16b  // accumulate the nonzero mask
.endr
    uqxtn       v0.8b,  v4.8h           // saturating narrow preserves nonzero
    QUANT_END   d0
endfunc

// DEQUANT_START mf_size, offset, dc=no
//
// Common prologue of the dequant functions.
//
// In:   x1 = dequant_mf table, w2 = i_qp
// Args: mf_size  log2 of the byte size of one dequant_mf[] row
//       offset   value subtracted from i_qp / 6 to form the shift amount
//       dc       "no": x1 is advanced to dequant_mf[i_mf]
//                otherwise: x1 is loaded (64-bit load) from dequant_mf[i_mf]
// Out:  w3 = i_qp / 6 - offset, with N/Z/C/V set by that subtraction so the
//            caller can branch on its sign
//       x1 = row pointer or loaded value (see dc)
// Clobbers: w2 (becomes the row byte offset), w5
.macro DEQUANT_START mf_size offset dc=no
    mov         w3,  #0x2b
    mul         w3,  w3,  w2
    lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
    add         w5,  w3,  w3,  lsl #1
    sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
    lsl         w2,  w2,  #\mf_size
.ifc \dc,no
    add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
.else
    ldr         x1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
.endif
    subs        w3,  w3,  #\offset      // 6 for 8x8
.endm

// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
//
// DEQUANT size, bits
//
// Emits x264_dequant_<size>_neon. Instantiated below for 4x4 (bits = 4) and
// 8x8 (bits = 6); the 8x8 variant runs the 16-coefficient body four times
// with w2 as the loop counter.
//
// In:   x0 = dct, x1 = dequant_mf, w2 = i_qp
// With s = i_qp / 6 - bits (computed by DEQUANT_START) and mf narrowed to
// 16 bits with signed saturation:
//   s >= 0:  dct[i] = (dct[i] * mf[i]) << s              (16-bit arithmetic)
//   s <  0:  dct[i] = (dct[i] * mf[i] + (1 << (-s - 1))) >> -s
//            (32-bit arithmetic, result narrowed with signed saturation)
.macro DEQUANT size bits
function x264_dequant_\size\()_neon, export=1
    DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
    mov         w2,  #4                 // does not touch the flags tested next
.endif
    b.lt        dequant_\size\()_rshift

    dup         v31.8h, w3
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s}, [x1], #16
    ld1        {v17.4s}, [x1], #16
    sqxtn       v2.4h,  v16.4s
    ld1        {v18.4s}, [x1], #16
    sqxtn2      v2.8h,  v17.4s
    ld1        {v19.4s}, [x1], #16
    sqxtn       v3.4h,  v18.4s
    ld1        {v0.8h,v1.8h}, [x0]
    sqxtn2      v3.8h,  v19.4s
    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v3.8h
    sshl        v0.8h,  v0.8h,  v31.8h
    sshl        v1.8h,  v1.8h,  v31.8h
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_lshift_loop
.endif
    ret

dequant_\size\()_rshift:
    dup         v31.4s, w3              // negative count: sshl shifts right
    neg         w3,  w3
    mov         w5,  #1
    sub         w3,  w3,  #1
    lsl         w5,  w5,  w3            // rounding term 1 << (-s - 1)

.ifc \size, 8x8
dequant_\size\()_rshift_loop:
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s}, [x1], #16
    ld1        {v17.4s}, [x1], #16
    sqxtn       v2.4h,  v16.4s
    ld1        {v18.4s}, [x1], #16
    dup         v16.4s, w5              // accumulators start at the rounding term
    sqxtn2      v2.8h,  v17.4s
    ld1        {v19.4s}, [x1], #16
    dup         v17.4s, w5
    sqxtn       v3.4h,  v18.4s
    ld1        {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    sqxtn2      v3.8h,  v19.4s
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v2.4h
    smlal2      v17.4s, v0.8h,  v2.8h
    smlal       v18.4s, v1.4h,  v3.4h
    smlal2      v19.4s, v1.8h,  v3.8h
    sshl        v16.4s, v16.4s, v31.4s
    sshl        v17.4s, v17.4s, v31.4s
    sshl        v18.4s, v18.4s, v31.4s
    sshl        v19.4s, v19.4s, v31.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_rshift_loop
.endif
    ret
endfunc
.endm

DEQUANT 4x4, 4
DEQUANT 8x8, 6

// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
//
// In:   x0 = dct, x1 = dequant_mf, w2 = i_qp
// Uses the single multiplier mf = dequant_mf[i_qp % 6][0] for all 16
// coefficients. With s = i_qp / 6 - 6:
//   s >= 0:  dct[i] = dct[i] * (mf << s)                 (16-bit arithmetic)
//   s <  0:  dct[i] = (dct[i] * mf + (1 << (-s - 1))) >> -s
//            (32-bit arithmetic, result narrowed with signed saturation)
function x264_dequant_4x4_dc_neon, export=1
    DEQUANT_START 6, 6, yes
    b.lt        dequant_4x4_dc_rshift

    lsl         w1,  w1,  w3
    dup         v2.8h,  w1
    ld1        {v0.8h,v1.8h},   [x0]

    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v2.8h
    st1        {v0.8h,v1.8h},   [x0]
    ret

dequant_4x4_dc_rshift:
    dup         v4.8h,  w1
    dup         v3.4s, w3               // negative count: sshl shifts right
    neg         w3,  w3
    mov         w5,  #1
    sub         w3,  w3,  #1
    lsl         w5,  w5,  w3            // rounding term 1 << (-s - 1)

    dup         v16.4s, w5              // accumulators start at the rounding term
    dup         v17.4s, w5
    ld1        {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v4.4h
    smlal2      v17.4s, v0.8h,  v4.8h
    smlal       v18.4s, v1.4h,  v4.4h
    smlal2      v19.4s, v1.8h,  v4.8h
    sshl        v16.4s, v16.4s, v3.4s
    sshl        v17.4s, v17.4s, v3.4s
    sshl        v18.4s, v18.4s, v3.4s
    sshl        v19.4s, v19.4s, v3.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0]
    ret
endfunc

// decimate_score_1x size
//
// Emits x264_decimate_score<size>_neon (instantiated below for 15 and 16).
//
// In:   x0 = pointer to 16-bit coefficients (16 are loaded in both variants)
// Out:  w0 = 9 if any coefficient has magnitude greater than 1,
//            0 if all coefficients are zero,
//            otherwise the sum of x264_decimate_table4[run] over the nonzero
//            coefficients, where run is the number of zero coefficients
//            directly below each one.
// NOTE(review): the 15 variant shifts the mask right by one bit before the
// scan; presumably this accounts for the unused first position - confirm
// against the C reference.
.macro decimate_score_1x size
function x264_decimate_score\size\()_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0]
    movrel      x5,  X(x264_decimate_table4)
    movi        v3.16b, #0x01
    sqxtn       v0.8b,  v0.8h
    sqxtn2      v0.16b, v1.8h
    abs         v2.16b, v0.16b
    cmeq        v1.16b, v0.16b, #0      // 0xff where the coefficient is zero
    cmhi        v2.16b, v2.16b, v3.16b  // 0xff where the magnitude exceeds 1
    shrn        v1.8b,  v1.8h,  #4      // pack both masks to 4 bits per coefficient
    shrn        v2.8b,  v2.8h,  #4
    fmov        x2,  d2
    fmov        x1,  d1
    cbnz        x2,  9f
    mvn         x1,  x1                 // now set where the coefficient is nonzero
    mov         w0,  #0
    cbz         x1,  0f
.ifc \size, 15
    lsr         x1,  x1,  #1
.endif
    rbit        x1,  x1                 // lowest-index coefficient moves to the top
1:
    clz         x3,  x1
    lsr         x6,  x3,  #2            // run length = leading zero bits / 4
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x6]
    lsl         x1,  x1,  #4            // drop the coefficient just scored
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc
.endm

decimate_score_1x 15
decimate_score_1x 16

// Per-byte bit selectors (0x80 down to 0x01, repeated for both vector halves)
// used by x264_decimate_score64_neon to give each coefficient in a group of
// eight its own bit before the pairwise-add reduction.
const mask64, align=6
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
endconst

// x264_decimate_score64_neon
//
// In:   x0 = pointer to 64 16-bit coefficients
// Out:  w0 = 9 if any coefficient has magnitude greater than 1,
//            0 if all coefficients are zero,
//            otherwise the sum of x264_decimate_table8[run], where run is
//            the leading-zero count of the remaining nonzero bitmask at
//            each step of the scan loop.
function x264_decimate_score64_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0], #32
    ld1        {v2.8h,v3.8h}, [x0], #32
    ld1        {v4.8h,v5.8h}, [x0], #32
    ld1        {v6.8h,v7.8h}, [x0]
    movrel      x6,  mask64
    movi        v31.16b, #0x01
    // Narrow to signed bytes; note each pair is packed odd register first.
    sqxtn       v16.8b,  v1.8h
    sqxtn2      v16.16b, v0.8h
    sqxtn       v17.8b,  v3.8h
    sqxtn2      v17.16b, v2.8h
    sqxtn       v18.8b,  v5.8h
    sqxtn2      v18.16b, v4.8h
    sqxtn       v19.8b,  v7.8h
    sqxtn2      v19.16b, v6.8h
    abs         v4.16b, v16.16b
    abs         v5.16b, v17.16b
    abs         v6.16b, v18.16b
    abs         v7.16b, v19.16b
    ld1        {v30.16b}, [x6]
    cmeq        v0.16b, v16.16b, #0     // 0xff where the coefficient is zero
    cmeq        v1.16b, v17.16b, #0
    cmeq        v2.16b, v18.16b, #0
    cmeq        v3.16b, v19.16b, #0
    umax        v4.16b, v4.16b, v5.16b  // running maximum magnitude
    umax        v6.16b, v6.16b, v7.16b
    and         v0.16b, v0.16b, v30.16b // one distinct bit per coefficient
    and         v1.16b, v1.16b, v30.16b
    and         v2.16b, v2.16b, v30.16b
    and         v3.16b, v3.16b, v30.16b
    umax        v4.16b, v4.16b, v6.16b
    // The bits are disjoint, so pairwise adds act as ORs: three rounds fold
    // each group of eight bytes into one byte.
    addp        v0.16b, v1.16b, v0.16b
    addp        v2.16b, v3.16b, v2.16b
    cmhi        v4.16b, v4.16b, v31.16b // 0xff where the magnitude exceeds 1
    addp        v0.16b, v2.16b, v0.16b
    shrn        v4.8b,  v4.8h,  #4
    addp        v0.16b, v0.16b, v0.16b
    fmov        x2,  d4
    fmov        x1,  d0
    cbnz        x2,  9f
    mvn         x1,  x1                 // now set where the coefficient is nonzero
    mov         w0,  #0
    cbz         x1,  0f
    movrel      x5,  X(x264_decimate_table8)
1:
    clz         x3,  x1                 // run length
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x3]
    lsl         x1,  x1,  #1            // drop the coefficient just scored
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc

// int coeff_last( int16_t *l )
//
// In:   x0 = l (4 coefficients, read with a single 64-bit load)
// Out:  w0 = index of the last nonzero coefficient, or -1 if all are zero
function x264_coeff_last4_aarch64, export=1
    ldr         x2,  [x0]
    mov         w4,  #3
    clz         x0,  x2
    sub         w0,  w4,  w0, lsr #4    // 16 bits per coefficient
    ret
endfunc

// x264_coeff_last8_aarch64
//
// In:   x0 = pointer to 8 16-bit coefficients
// Out:  w0 = index of the last nonzero coefficient, or -1 if all are zero
// The upper four coefficients are checked first. Only if they are all zero
// is the lower half loaded, with the base index reduced from 7 to 3.
function x264_coeff_last8_aarch64, export=1
    ldr         x3,  [x0, #8]
    mov         w4,  #7
    clz         x2,  x3
    cmp         w2,  #64
    b.ne        1f
    ldr         x3,  [x0]
    sub         w4,  w4,  #4
    clz         x2,  x3
1:
    sub         w0,  w4,  w2, lsr #4    // 16 bits per coefficient
    ret
endfunc

// COEFF_LAST_1x size
//
// Emits x264_coeff_last<size>_neon (instantiated below for 15 and 16).
//
// In:   x0 = pointer to 16-bit coefficients
// Out:  w0 = size - 1 - (number of all-zero lanes at the top of the 16
//            loaded lanes); for the 16 variant this is the index of the
//            last nonzero coefficient, or -1 if all are zero
// The 15 variant backs the pointer up by one element so that a 16-lane load
// can be used; the lane below the array start is therefore read as well.
.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon, export=1
.if \size == 15
    sub         x0,  x0,  #2
.endif
    ld1        {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b,  v0.8h           // saturating narrow preserves nonzero
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b  // 0xff where the coefficient is nonzero
    shrn        v0.8b,  v0.8h,  #4      // pack to 4 bits per coefficient
    fmov        x1,  d0
    mov         w3,  #\size - 1
    clz         x2,  x1
    sub         w0,  w3,  w2, lsr #2
    ret
endfunc
.endm

COEFF_LAST_1x 15
COEFF_LAST_1x 16

// x264_coeff_last64_neon
//
// In:   x0 = pointer to 64 16-bit coefficients
// Out:  w0 = index of the last nonzero coefficient, or -1 if all are zero
function x264_coeff_last64_neon, export=1
    ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
    movi        v31.8h,  #8
    movi        v30.8h,  #1
    uqxtn       v0.8b,  v0.8h           // saturating narrow preserves nonzero
    uqxtn2      v0.16b, v1.8h
    ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
    uqxtn       v1.8b,  v2.8h
    uqxtn2      v1.16b, v3.8h
    uqxtn       v2.8b,  v4.8h
    uqxtn2      v2.16b, v5.8h
    uqxtn       v3.8b,  v6.8h
    uqxtn2      v3.16b, v7.8h

    cmtst       v0.16b, v0.16b, v0.16b  // 0xff where the coefficient is nonzero
    cmtst       v1.16b, v1.16b, v1.16b
    cmtst       v2.16b, v2.16b, v2.16b
    cmtst       v3.16b, v3.16b, v3.16b

    shrn        v0.8b,  v0.8h,  #4      // 4 bits per coefficient:
    shrn2       v0.16b, v1.8h,  #4      // each 32-bit word covers 8 coefficients
    shrn        v1.8b,  v2.8h,  #4
    shrn2       v1.16b, v3.8h,  #4

    clz         v0.4s,  v0.4s
    clz         v1.4s,  v1.4s

    shrn        v0.4h,  v0.4s,  #2      // zero coefficients at the top of each group (0..8)
    shrn2       v0.8h,  v1.4s,  #2

    sub         v0.8h,  v31.8h,  v0.8h  // 8 - zeros = last nonzero position + 1
    sshl        v0.8h,  v30.8h,  v0.8h  // 1 << (position + 1)
    shrn        v0.8b,  v0.8h,  #1      // one byte per group: bit at position, or 0

    fmov        x2,  d0
    mov         w3,  #63
    clz         x2,  x2
    sub         w0,  w3,  w2
    ret
endfunc

// coeff_level_run_start size
//
// Initializes the working registers consumed by coeff_level_run.
//
// In:   x1 = output structure pointer
// Out:  x6 = (x1 + 23) & ~15, the pointer the level values are written to
//            NOTE(review): presumably the 16-byte-aligned level[] array
//            following the last/mask words of x264_run_level_t - confirm.
//       w7 = 0        number of levels written so far
//       w8 = 0        accumulated mask of nonzero coefficient positions
//       w9 = 1        constant used to build mask bits
//       w4 = size - 1 index of the highest coefficient
.macro coeff_level_run_start size
    add         x6,  x1,  #23
    mov         w7,  #0
    mov         w8,  #0
    mov         w9,  #1
    and         x6,  x6,  #~15
    mov         w4,  #\size - 1
.endm

// coeff_level_run shift
//
// Walks the nonzero coefficients from the highest index down, copying each
// value to the level output and recording its position in a bitmask.
//
// In:   x0 = coefficient array
//       x1 = output structure pointer
//       x2 = nonzero map holding (1 << shift) bits per coefficient, with
//            the highest-index coefficient in the most significant bits
//       x4, x6, w7, w8, w9 as set up by coeff_level_run_start
// Out:  [x1]     = index of the last nonzero coefficient (32-bit store)
//       [x1 + 4] = position bitmask (32-bit store)
//       [x6 ...] = 16-bit values of the nonzero coefficients, highest first
//       w0       = number of values written
// Clobbers: x2-x8, w10, flags; x1 is advanced by 4.
// NOTE(review): the loop body runs before any bounds check, so this appears
// to assume at least one nonzero coefficient - confirm with the callers.
.macro coeff_level_run shift
    clz         x3,  x2
    subs        w4,  w4,  w3, lsr #\shift
    str         w4,  [x1], #4
1:
    ldrh        w5,  [x0, x4, lsl #1]
    strh        w5,  [x6], #2
    add         w7,  w7,  #1
    lsl         w10, w9, w4
    orr         w8,  w8,  w10
    b.le        2f                      // flags still from the last subs: index <= 0
    add         w3,  w3,  #1 << \shift
    sub         w4,  w4,  #1
    and         x3,  x3,  #~((1 << \shift) - 1) // round up to the next coefficient boundary
    lsl         x2,  x2,  x3            // discard the coefficient just handled
    clz         x3,  x2
    subs        w4,  w4,  w3, lsr #\shift
    b.ge        1b
2:
    str         w8,  [x1]
    mov         w0,  w7
.endm

// x264_coeff_level_run4_aarch64
//
// In:   x0 = pointer to 4 16-bit coefficients, x1 = output structure
// Out:  w0 = number of nonzero coefficients written (see coeff_level_run)
// The raw 64-bit load serves directly as the nonzero map, with 16 bits per
// coefficient (shift = 4).
function x264_coeff_level_run4_aarch64, export=1
    ldr         x2,  [x0]

    coeff_level_run_start 4

    coeff_level_run 4

    ret
endfunc

// X264_COEFF_LEVEL_RUN size
//
// Emits x264_coeff_level_run<size>_neon (instantiated below for 8, 15, 16).
//
// In:   x0 = pointer to 16-bit coefficients, x1 = output structure
// Out:  w0 = number of nonzero coefficients written (see coeff_level_run)
// Builds the nonzero map in x2:
//   size 8:     8 lanes narrowed to bytes -> 8 bits per coefficient
//   size 15/16: 16 lanes narrowed and packed -> 4 bits per coefficient
// The shift argument passed on, 4 - (size + 1) / 8, evaluates to 3 and 2
// respectively to match those widths. The 15 variant loads from one element
// before x0 and restores x0 afterwards.
.macro X264_COEFF_LEVEL_RUN size
function x264_coeff_level_run\size\()_neon, export=1
.if \size == 15
    sub         x0,  x0,  #2
.endif
.if         \size < 15
    ld1         {v0.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    cmtst       v0.8b,  v0.8b,  v0.8b
.else
    ld1         {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b
    shrn        v0.8b,  v0.8h,  #4
.endif
    fmov        x2,  d0
.if \size == 15
    add         x0,  x0,  #2
.endif

    coeff_level_run_start \size

    coeff_level_run (4 - (\size + 1) / 8)

    ret
endfunc
.endm

X264_COEFF_LEVEL_RUN 8
X264_COEFF_LEVEL_RUN 15
X264_COEFF_LEVEL_RUN 16

// x264_denoise_dct_neon
//
// In:   x0 = 16-bit signed coefficients (updated in place)
//       x1 = 32-bit accumulators, one per coefficient (updated in place)
//       x2 = 16-bit unsigned offsets, one per coefficient
//       w3 = coefficient count; 16 are processed per iteration and the loop
//            repeats while the remaining count is greater than zero
// Per coefficient:
//   sum[i] += |dct[i]|
//   dct[i]  = sign(dct[i]) * max(|dct[i]| - offset[i], 0)
function x264_denoise_dct_neon, export=1
1:  subs        w3,  w3,  #16
    ld1         {v0.8h,v1.8h}, [x0]
    ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
    abs         v16.8h,  v0.8h
    abs         v17.8h,  v1.8h
    ld1         {v2.8h,v3.8h}, [x2], #32
    cmlt        v18.8h,  v0.8h,   #0    // all-ones where the coefficient is negative
    cmlt        v19.8h,  v1.8h,   #0
    uaddw       v4.4s,   v4.4s,   v16.4h
    uaddw2      v5.4s,   v5.4s,   v16.8h
    uqsub       v20.8h,  v16.8h,  v2.8h // saturates at zero
    uqsub       v21.8h,  v17.8h,  v3.8h
    uaddw       v6.4s,   v6.4s,   v17.4h
    uaddw2      v7.4s,   v7.4s,   v17.8h
    neg         v22.8h,  v20.8h
    neg         v23.8h,  v21.8h
    bsl         v18.16b, v22.16b, v20.16b // pick the negated value where negative
    bsl         v19.16b, v23.16b, v21.16b
    st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
    st1         {v18.8h,v19.8h}, [x0], #32
    b.gt        1b                      // flags from the subs at the loop head
    ret
endfunc

// (web page footer navigation: Product / Resources / Company)