1
/*****************************************************************************
2
* mc.S: aarch64 motion compensation
3
*****************************************************************************
4
* Copyright (C) 2009-2016 x264 project
5
*
6
* Authors: David Conrad <[email protected]>
7
* Janne Grunau <[email protected]>
8
* Mans Rullgard <[email protected]>
9
* Stefan Groenroos <[email protected]>
10
*
11
* This program is free software; you can redistribute it and/or modify
12
* it under the terms of the GNU General Public License as published by
13
* the Free Software Foundation; either version 2 of the License, or
14
* (at your option) any later version.
15
*
16
* This program is distributed in the hope that it will be useful,
17
* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
* GNU General Public License for more details.
20
*
21
* You should have received a copy of the GNU General Public License
22
* along with this program; if not, write to the Free Software
23
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24
*
25
* This program is also available under a commercial proprietary license.
26
* For more information, contact us at [email protected].
27
*****************************************************************************/
28
29
#include "asm.S"
30
31
// note: prefetch stuff assumes 64-byte cacheline
32
33
// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
34
function x264_prefetch_ref_aarch64, export=1
35
cmp w2, #1
36
csel x2, xzr, x1, eq
37
add x0, x0, #64
38
add x0, x0, x2, lsl #3
39
40
lsl x2, x1, #1
41
add x3, x1, x1, lsl #1
42
add x4, x0, x1, lsl #2
43
44
prfm pldl1strm, [x0]
45
prfm pldl1strm, [x0, x1]
46
prfm pldl1strm, [x0, x2]
47
prfm pldl1strm, [x0, x3]
48
prfm pldl1strm, [x4]
49
prfm pldl1strm, [x4, x1]
50
prfm pldl1strm, [x4, x2]
51
prfm pldl1strm, [x4, x3]
52
ret
53
endfunc
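// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): prefetch eight rows of the reference picture, starting
// 64 bytes into the row and 8 rows further down when parity != 1.
// static void prefetch_ref_c( uint8_t *pix, intptr_t stride, int parity )
// {
//     pix += 64 + (parity == 1 ? 0 : 8 * stride);
//     for( int y = 0; y < 8; y++ )
//         __builtin_prefetch( pix + y * stride, 0 /* read */, 0 /* streaming */ );
// }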
54
55
// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
56
// uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
57
.macro x264_prefetch_fenc sub
58
function x264_prefetch_fenc_\sub\()_aarch64, export=1
59
and w6, w5, #3
60
and w7, w5, #3
61
mul x6, x6, x1
62
mul x7, x7, x3
63
add x0, x0, #64
64
add x2, x2, #64
65
66
add x0, x0, x6, lsl #2
67
add x6, x0, x1, lsl #1
68
prfm pldl1strm, [x0]
69
prfm pldl1strm, [x0, x1]
70
prfm pldl1strm, [x6]
71
prfm pldl1strm, [x6, x1]
72
73
add x2, x2, x7, lsl #1
74
prfm pldl1strm, [x2]
75
prfm pldl1strm, [x2, x3]
76
.ifc \sub, 422
77
add x7, x2, x3, lsl #1
78
prfm pldl1strm, [x7]
79
prfm pldl1strm, [x7, x3]
80
.endif
81
ret
82
endfunc
83
.endm
84
85
x264_prefetch_fenc 420
86
x264_prefetch_fenc 422
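// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): prefetch four luma rows and two (4:2:0) or four (4:2:2)
// chroma rows around the current macroblock column, offset by mb_x & 3 rows.
// static void prefetch_fenc_c( uint8_t *pix_y, intptr_t stride_y,
//                              uint8_t *pix_uv, intptr_t stride_uv, int mb_x, int chroma422 )
// {
//     pix_y  += 64 + (mb_x & 3) * stride_y  * 4;
//     pix_uv += 64 + (mb_x & 3) * stride_uv * 2;
//     for( int y = 0; y < 4; y++ )
//         __builtin_prefetch( pix_y + y * stride_y, 0, 0 );
//     for( int y = 0; y < (chroma422 ? 4 : 2); y++ )
//         __builtin_prefetch( pix_uv + y * stride_uv, 0, 0 );
// }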
87
88
// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
89
// uint8_t *src1, intptr_t src1_stride,
90
// uint8_t *src2, intptr_t src2_stride, int weight );
91
.macro AVGH w h
92
function x264_pixel_avg_\w\()x\h\()_neon, export=1
93
mov w10, #64
94
cmp w6, #32
95
mov w9, #\h
96
b.eq pixel_avg_w\w\()_neon
97
subs w7, w10, w6
98
b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
99
cmp w6, #0
100
b.ge pixel_avg_weight_w\w\()_add_add_neon
101
b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
102
endfunc
103
.endm
104
105
AVGH 4, 2
106
AVGH 4, 4
107
AVGH 4, 8
108
AVGH 4, 16
109
AVGH 8, 4
110
AVGH 8, 8
111
AVGH 8, 16
112
AVGH 16, 8
113
AVGH 16, 16
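// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): every weighted path computes the same expression with the
// implicit complementary weight 64-weight; weight == 32 degenerates to the plain
// rounded average handled by pixel_avg_wN_neon below.
// static void pixel_avg_c( uint8_t *dst, intptr_t dst_stride,
//                          uint8_t *src1, intptr_t src1_stride,
//                          uint8_t *src2, intptr_t src2_stride,
//                          int width, int height, int weight )
// {
//     for( int y = 0; y < height; y++, dst += dst_stride, src1 += src1_stride, src2 += src2_stride )
//         for( int x = 0; x < width; x++ )
//         {
//             int v = ( src1[x] * weight + src2[x] * (64 - weight) + 32 ) >> 6;
//             dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
//         }
// }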
114
115
// 0 < weight < 64
116
.macro load_weights_add_add
117
mov w6, w6
118
.endm
119
.macro weight_add_add dst, s1, s2, h=
120
.ifc \h, 2
121
umull2 \dst, \s1, v30.16b
122
umlal2 \dst, \s2, v31.16b
123
.else
124
umull \dst, \s1, v30.8b
125
umlal \dst, \s2, v31.8b
126
.endif
127
.endm
128
129
// weight > 64
130
.macro load_weights_add_sub
131
neg w7, w7
132
.endm
133
.macro weight_add_sub dst, s1, s2, h=
134
.ifc \h, 2
135
umull2 \dst, \s1, v30.16b
136
umlsl2 \dst, \s2, v31.16b
137
.else
138
umull \dst, \s1, v30.8b
139
umlsl \dst, \s2, v31.8b
140
.endif
141
.endm
142
143
// weight < 0
144
.macro load_weights_sub_add
145
neg w6, w6
146
.endm
147
.macro weight_sub_add dst, s1, s2, h=
148
.ifc \h, 2
149
umull2 \dst, \s2, v31.16b
150
umlsl2 \dst, \s1, v30.16b
151
.else
152
umull \dst, \s2, v31.8b
153
umlsl \dst, \s1, v30.8b
154
.endif
155
.endm
156
157
.macro AVG_WEIGHT ext
158
function pixel_avg_weight_w4_\ext\()_neon
159
load_weights_\ext
160
dup v30.8b, w6
161
dup v31.8b, w7
162
1: // height loop
163
subs w9, w9, #2
164
ld1 {v0.s}[0], [x2], x3
165
ld1 {v1.s}[0], [x4], x5
166
weight_\ext v4.8h, v0.8b, v1.8b
167
ld1 {v2.s}[0], [x2], x3
168
ld1 {v3.s}[0], [x4], x5
169
sqrshrun v0.8b, v4.8h, #6
170
weight_\ext v5.8h, v2.8b, v3.8b
171
st1 {v0.s}[0], [x0], x1
172
sqrshrun v1.8b, v5.8h, #6
173
st1 {v1.s}[0], [x0], x1
174
b.gt 1b
175
ret
176
endfunc
177
178
function pixel_avg_weight_w8_\ext\()_neon
179
load_weights_\ext
180
dup v30.8b, w6
181
dup v31.8b, w7
182
1: // height loop
183
subs w9, w9, #4
184
ld1 {v0.8b}, [x2], x3
185
ld1 {v1.8b}, [x4], x5
186
weight_\ext v16.8h, v0.8b, v1.8b
187
ld1 {v2.8b}, [x2], x3
188
ld1 {v3.8b}, [x4], x5
189
weight_\ext v17.8h, v2.8b, v3.8b
190
ld1 {v4.8b}, [x2], x3
191
ld1 {v5.8b}, [x4], x5
192
weight_\ext v18.8h, v4.8b, v5.8b
193
ld1 {v6.8b}, [x2], x3
194
ld1 {v7.8b}, [x4], x5
195
weight_\ext v19.8h, v6.8b, v7.8b
196
sqrshrun v0.8b, v16.8h, #6
197
sqrshrun v1.8b, v17.8h, #6
198
sqrshrun v2.8b, v18.8h, #6
199
sqrshrun v3.8b, v19.8h, #6
200
st1 {v0.8b}, [x0], x1
201
st1 {v1.8b}, [x0], x1
202
st1 {v2.8b}, [x0], x1
203
st1 {v3.8b}, [x0], x1
204
b.gt 1b
205
ret
206
endfunc
207
208
function pixel_avg_weight_w16_\ext\()_neon
209
load_weights_\ext
210
dup v30.16b, w6
211
dup v31.16b, w7
212
1: // height loop
213
subs w9, w9, #2
214
ld1 {v0.16b}, [x2], x3
215
ld1 {v1.16b}, [x4], x5
216
weight_\ext v16.8h, v0.8b, v1.8b
217
weight_\ext v17.8h, v0.16b, v1.16b, 2
218
ld1 {v2.16b}, [x2], x3
219
ld1 {v3.16b}, [x4], x5
220
weight_\ext v18.8h, v2.8b, v3.8b
221
weight_\ext v19.8h, v2.16b, v3.16b, 2
222
sqrshrun v0.8b, v16.8h, #6
223
sqrshrun v1.8b, v18.8h, #6
224
sqrshrun2 v0.16b, v17.8h, #6
225
sqrshrun2 v1.16b, v19.8h, #6
226
st1 {v0.16b}, [x0], x1
227
st1 {v1.16b}, [x0], x1
228
b.gt 1b
229
ret
230
endfunc
231
.endm
232
233
AVG_WEIGHT add_add
234
AVG_WEIGHT add_sub
235
AVG_WEIGHT sub_add
236
237
function pixel_avg_w4_neon
238
1: subs w9, w9, #2
239
ld1 {v0.s}[0], [x2], x3
240
ld1 {v2.s}[0], [x4], x5
241
urhadd v0.8b, v0.8b, v2.8b
242
ld1 {v1.s}[0], [x2], x3
243
ld1 {v3.s}[0], [x4], x5
244
urhadd v1.8b, v1.8b, v3.8b
245
st1 {v0.s}[0], [x0], x1
246
st1 {v1.s}[0], [x0], x1
247
b.gt 1b
248
ret
249
endfunc
250
251
function pixel_avg_w8_neon
252
1: subs w9, w9, #4
253
ld1 {v0.8b}, [x2], x3
254
ld1 {v1.8b}, [x4], x5
255
ld1 {v2.8b}, [x2], x3
256
urhadd v0.8b, v0.8b, v1.8b
257
ld1 {v3.8b}, [x4], x5
258
st1 {v0.8b}, [x0], x1
259
ld1 {v4.8b}, [x2], x3
260
urhadd v1.8b, v2.8b, v3.8b
261
ld1 {v5.8b}, [x4], x5
262
st1 {v1.8b}, [x0], x1
263
ld1 {v6.8b}, [x2], x3
264
ld1 {v7.8b}, [x4], x5
265
urhadd v2.8b, v4.8b, v5.8b
266
urhadd v3.8b, v6.8b, v7.8b
267
st1 {v2.8b}, [x0], x1
268
st1 {v3.8b}, [x0], x1
269
b.gt 1b
270
ret
271
endfunc
272
273
function pixel_avg_w16_neon
274
1: subs w9, w9, #4
275
ld1 {v0.16b}, [x2], x3
276
ld1 {v1.16b}, [x4], x5
277
ld1 {v2.16b}, [x2], x3
278
urhadd v0.16b, v0.16b, v1.16b
279
ld1 {v3.16b}, [x4], x5
280
st1 {v0.16b}, [x0], x1
281
ld1 {v4.16b}, [x2], x3
282
urhadd v1.16b, v2.16b, v3.16b
283
ld1 {v5.16b}, [x4], x5
284
st1 {v1.16b}, [x0], x1
285
ld1 {v6.16b}, [x2], x3
286
ld1 {v7.16b}, [x4], x5
287
urhadd v2.16b, v4.16b, v5.16b
288
urhadd v3.16b, v6.16b, v7.16b
289
st1 {v2.16b}, [x0], x1
290
st1 {v3.16b}, [x0], x1
291
b.gt 1b
292
ret
293
endfunc
294
295
function x264_pixel_avg2_w4_neon, export=1
296
1:
297
subs w5, w5, #2
298
ld1 {v0.s}[0], [x2], x3
299
ld1 {v2.s}[0], [x4], x3
300
urhadd v0.8b, v0.8b, v2.8b
301
ld1 {v1.s}[0], [x2], x3
302
ld1 {v3.s}[0], [x4], x3
303
urhadd v1.8b, v1.8b, v3.8b
304
st1 {v0.s}[0], [x0], x1
305
st1 {v1.s}[0], [x0], x1
306
b.gt 1b
307
ret
308
endfunc
309
310
function x264_pixel_avg2_w8_neon, export=1
311
1:
312
subs w5, w5, #2
313
ld1 {v0.8b}, [x2], x3
314
ld1 {v2.8b}, [x4], x3
315
urhadd v0.8b, v0.8b, v2.8b
316
ld1 {v1.8b}, [x2], x3
317
ld1 {v3.8b}, [x4], x3
318
urhadd v1.8b, v1.8b, v3.8b
319
st1 {v0.8b}, [x0], x1
320
st1 {v1.8b}, [x0], x1
321
b.gt 1b
322
ret
323
endfunc
324
325
function x264_pixel_avg2_w16_neon, export=1
326
1:
327
subs w5, w5, #2
328
ld1 {v0.16b}, [x2], x3
329
ld1 {v2.16b}, [x4], x3
330
urhadd v0.16b, v0.16b, v2.16b
331
ld1 {v1.16b}, [x2], x3
332
ld1 {v3.16b}, [x4], x3
333
urhadd v1.16b, v1.16b, v3.16b
334
st1 {v0.16b}, [x0], x1
335
st1 {v1.16b}, [x0], x1
336
b.gt 1b
337
ret
338
endfunc
339
340
function x264_pixel_avg2_w20_neon, export=1
341
sub x1, x1, #16
342
1:
343
subs w5, w5, #2
344
ld1 {v0.16b,v1.16b}, [x2], x3
345
ld1 {v2.16b,v3.16b}, [x4], x3
346
urhadd v0.16b, v0.16b, v2.16b
347
urhadd v1.8b, v1.8b, v3.8b
348
ld1 {v4.16b,v5.16b}, [x2], x3
349
ld1 {v6.16b,v7.16b}, [x4], x3
350
urhadd v4.16b, v4.16b, v6.16b
351
urhadd v5.8b, v5.8b, v7.8b
352
st1 {v0.16b}, [x0], #16
353
st1 {v1.s}[0], [x0], x1
354
st1 {v4.16b}, [x0], #16
355
st1 {v5.s}[0], [x0], x1
356
b.gt 1b
357
ret
358
endfunc
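// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): pixel_avg2 is the unweighted case, one urhadd per vector.
// static void pixel_avg2_c( uint8_t *dst, intptr_t dst_stride,
//                           uint8_t *src1, intptr_t src_stride, uint8_t *src2,
//                           int width, int height )
// {
//     for( int y = 0; y < height; y++, dst += dst_stride, src1 += src_stride, src2 += src_stride )
//         for( int x = 0; x < width; x++ )
//             dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
// }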
359
360
.macro weight_prologue type
361
mov w9, w5 // height
362
.ifc \type, full
363
ldr w12, [x4, #32] // denom
364
.endif
365
ldp w4, w5, [x4, #32+4] // scale, offset
366
dup v0.16b, w4
367
dup v1.8h, w5
368
.ifc \type, full
369
neg w12, w12
370
dup v2.8h, w12
371
.endif
372
.endm
373
374
// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
375
// intptr_t dst_stride, const x264_weight_t *weight, int h )
376
function x264_mc_weight_w20_neon, export=1
377
weight_prologue full
378
sub x1, x1, #16
379
1:
380
subs w9, w9, #2
381
ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
382
ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
383
umull v22.8h, v16.8b, v0.8b
384
umull v23.8h, v17.8b, v0.8b
385
zip1 v18.2s, v18.2s, v21.2s
386
umull v25.8h, v19.8b, v0.8b
387
umull v26.8h, v20.8b, v0.8b
388
umull v24.8h, v18.8b, v0.8b
389
srshl v22.8h, v22.8h, v2.8h
390
srshl v23.8h, v23.8h, v2.8h
391
srshl v24.8h, v24.8h, v2.8h
392
srshl v25.8h, v25.8h, v2.8h
393
srshl v26.8h, v26.8h, v2.8h
394
add v22.8h, v22.8h, v1.8h
395
add v23.8h, v23.8h, v1.8h
396
add v24.8h, v24.8h, v1.8h
397
add v25.8h, v25.8h, v1.8h
398
add v26.8h, v26.8h, v1.8h
399
sqxtun v4.8b, v22.8h
400
sqxtun2 v4.16b, v23.8h
401
sqxtun v6.8b, v24.8h
402
sqxtun v5.8b, v25.8h
403
sqxtun2 v5.16b, v26.8h
404
st1 {v4.16b}, [x0], #16
405
st1 {v6.s}[0], [x0], x1
406
st1 {v5.16b}, [x0], #16
407
st1 {v6.s}[1], [x0], x1
408
b.gt 1b
409
ret
410
endfunc
411
412
function x264_mc_weight_w16_neon, export=1
413
weight_prologue full
414
weight16_loop:
415
1:
416
subs w9, w9, #2
417
ld1 {v4.16b}, [x2], x3
418
ld1 {v5.16b}, [x2], x3
419
umull v22.8h, v4.8b, v0.8b
420
umull2 v23.8h, v4.16b, v0.16b
421
umull v24.8h, v5.8b, v0.8b
422
umull2 v25.8h, v5.16b, v0.16b
423
srshl v22.8h, v22.8h, v2.8h
424
srshl v23.8h, v23.8h, v2.8h
425
srshl v24.8h, v24.8h, v2.8h
426
srshl v25.8h, v25.8h, v2.8h
427
add v22.8h, v22.8h, v1.8h
428
add v23.8h, v23.8h, v1.8h
429
add v24.8h, v24.8h, v1.8h
430
add v25.8h, v25.8h, v1.8h
431
sqxtun v4.8b, v22.8h
432
sqxtun2 v4.16b, v23.8h
433
sqxtun v5.8b, v24.8h
434
sqxtun2 v5.16b, v25.8h
435
st1 {v4.16b}, [x0], x1
436
st1 {v5.16b}, [x0], x1
437
b.gt 1b
438
ret
439
endfunc
440
441
function x264_mc_weight_w8_neon, export=1
442
weight_prologue full
443
1:
444
subs w9, w9, #2
445
ld1 {v16.8b}, [x2], x3
446
ld1 {v17.8b}, [x2], x3
447
umull v4.8h, v16.8b, v0.8b
448
umull v5.8h, v17.8b, v0.8b
449
srshl v4.8h, v4.8h, v2.8h
450
srshl v5.8h, v5.8h, v2.8h
451
add v4.8h, v4.8h, v1.8h
452
add v5.8h, v5.8h, v1.8h
453
sqxtun v16.8b, v4.8h
454
sqxtun v17.8b, v5.8h
455
st1 {v16.8b}, [x0], x1
456
st1 {v17.8b}, [x0], x1
457
b.gt 1b
458
ret
459
endfunc
460
461
function x264_mc_weight_w4_neon, export=1
462
weight_prologue full
463
1:
464
subs w9, w9, #2
465
ld1 {v16.s}[0], [x2], x3
466
ld1 {v16.s}[1], [x2], x3
467
umull v4.8h, v16.8b, v0.8b
468
srshl v4.8h, v4.8h, v2.8h
469
add v4.8h, v4.8h, v1.8h
470
sqxtun v16.8b, v4.8h
471
st1 {v16.s}[0], [x0], x1
472
st1 {v16.s}[1], [x0], x1
473
b.gt 1b
474
ret
475
endfunc
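// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): explicit weighted prediction with a denominator; srshl by
// -denom performs the rounding shift, sqxtun the final clip to 8 bits.
// static void mc_weight_c( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride,
//                          int scale, int denom, int offset, int width, int height )
// {
//     int round = denom ? 1 << (denom - 1) : 0;
//     for( int y = 0; y < height; y++, dst += dst_stride, src += src_stride )
//         for( int x = 0; x < width; x++ )
//         {
//             int v = ( (src[x] * scale + round) >> denom ) + offset;
//             dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
//         }
// }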
476
477
function x264_mc_weight_w20_nodenom_neon, export=1
478
weight_prologue nodenom
479
sub x1, x1, #16
480
1:
481
subs w9, w9, #2
482
ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
483
mov v27.16b, v1.16b
484
mov v28.16b, v1.16b
485
ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
486
mov v31.16b, v1.16b
487
mov v29.16b, v1.16b
488
mov v30.16b, v1.16b
489
zip1 v18.2s, v18.2s, v21.2s
490
umlal v27.8h, v16.8b, v0.8b
491
umlal v28.8h, v17.8b, v0.8b
492
umlal v31.8h, v18.8b, v0.8b
493
umlal v29.8h, v19.8b, v0.8b
494
umlal v30.8h, v20.8b, v0.8b
495
sqxtun v4.8b, v27.8h
496
sqxtun2 v4.16b, v28.8h
497
sqxtun v5.8b, v29.8h
498
sqxtun2 v5.16b, v30.8h
499
sqxtun v6.8b, v31.8h
500
st1 {v4.16b}, [x0], #16
501
st1 {v6.s}[0], [x0], x1
502
st1 {v5.16b}, [x0], #16
503
st1 {v6.s}[1], [x0], x1
504
b.gt 1b
505
ret
506
endfunc
507
508
function x264_mc_weight_w16_nodenom_neon, export=1
509
weight_prologue nodenom
510
1:
511
subs w9, w9, #2
512
ld1 {v6.16b}, [x2], x3
513
mov v27.16b, v1.16b
514
mov v28.16b, v1.16b
515
ld1 {v7.16b}, [x2], x3
516
mov v29.16b, v1.16b
517
mov v30.16b, v1.16b
518
umlal v27.8h, v6.8b, v0.8b
519
umlal2 v28.8h, v6.16b, v0.16b
520
umlal v29.8h, v7.8b, v0.8b
521
umlal2 v30.8h, v7.16b, v0.16b
522
sqxtun v4.8b, v27.8h
523
sqxtun2 v4.16b, v28.8h
524
sqxtun v5.8b, v29.8h
525
sqxtun2 v5.16b, v30.8h
526
st1 {v4.16b}, [x0], x1
527
st1 {v5.16b}, [x0], x1
528
b.gt 1b
529
ret
530
endfunc
531
532
function x264_mc_weight_w8_nodenom_neon, export=1
533
weight_prologue nodenom
534
1:
535
subs w9, w9, #2
536
ld1 {v16.8b}, [x2], x3
537
mov v27.16b, v1.16b
538
ld1 {v17.8b}, [x2], x3
539
mov v29.16b, v1.16b
540
umlal v27.8h, v16.8b, v0.8b
541
umlal v29.8h, v17.8b, v0.8b
542
sqxtun v4.8b, v27.8h
543
sqxtun v5.8b, v29.8h
544
st1 {v4.8b}, [x0], x1
545
st1 {v5.8b}, [x0], x1
546
b.gt 1b
547
ret
548
endfunc
549
550
function x264_mc_weight_w4_nodenom_neon, export=1
551
weight_prologue nodenom
552
1:
553
subs w9, w9, #2
554
ld1 {v16.s}[0], [x2], x3
555
ld1 {v16.s}[1], [x2], x3
556
mov v27.16b, v1.16b
557
umlal v27.8h, v16.8b, v0.8b
558
sqxtun v4.8b, v27.8h
559
st1 {v4.s}[0], [x0], x1
560
st1 {v4.s}[1], [x0], x1
561
b.gt 1b
562
ret
563
endfunc
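// The nodenom variants above drop the rounding shift entirely (denom == 0), i.e.
// dst = clip( src*scale + offset ), which is why they can umlal straight into a
// vector preloaded with the offset.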
564
565
.macro weight_simple_prologue
566
ldr w6, [x4] // offset
567
dup v1.16b, w6
568
.endm
569
570
.macro weight_simple name op
571
function x264_mc_weight_w20_\name\()_neon, export=1
572
weight_simple_prologue
573
1:
574
subs w5, w5, #2
575
ldr s18, [x2, #16]
576
ld1 {v16.16b}, [x2], x3
577
ldr s19, [x2, #16]
578
ld1 {v17.16b}, [x2], x3
579
\op v18.8b, v18.8b, v1.8b
580
\op v16.16b, v16.16b, v1.16b
581
\op v19.8b, v19.8b, v1.8b
582
\op v17.16b, v17.16b, v1.16b
583
str s18, [x0, #16]
584
st1 {v16.16b}, [x0], x1
585
str s19, [x0, #16]
586
st1 {v17.16b}, [x0], x1
587
b.gt 1b
588
ret
589
endfunc
590
591
function x264_mc_weight_w16_\name\()_neon, export=1
592
weight_simple_prologue
593
1:
594
subs w5, w5, #2
595
ld1 {v16.16b}, [x2], x3
596
ld1 {v17.16b}, [x2], x3
597
\op v16.16b, v16.16b, v1.16b
598
\op v17.16b, v17.16b, v1.16b
599
st1 {v16.16b}, [x0], x1
600
st1 {v17.16b}, [x0], x1
601
b.gt 1b
602
ret
603
endfunc
604
605
function x264_mc_weight_w8_\name\()_neon, export=1
606
weight_simple_prologue
607
1:
608
subs w5, w5, #2
609
ld1 {v16.8b}, [x2], x3
610
ld1 {v17.8b}, [x2], x3
611
\op v16.8b, v16.8b, v1.8b
612
\op v17.8b, v17.8b, v1.8b
613
st1 {v16.8b}, [x0], x1
614
st1 {v17.8b}, [x0], x1
615
b.gt 1b
616
ret
617
endfunc
618
619
function x264_mc_weight_w4_\name\()_neon, export=1
620
weight_simple_prologue
621
1:
622
subs w5, w5, #2
623
ld1 {v16.s}[0], [x2], x3
624
ld1 {v16.s}[1], [x2], x3
625
\op v16.8b, v16.8b, v1.8b
626
st1 {v16.s}[0], [x0], x1
627
st1 {v16.s}[1], [x0], x1
628
b.gt 1b
629
ret
630
endfunc
631
.endm
632
633
weight_simple offsetadd, uqadd
634
weight_simple offsetsub, uqsub
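// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): the offsetadd/offsetsub shortcuts only apply a saturating
// +/- offset per pixel, matching uqadd/uqsub.
// static void mc_offset_c( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride,
//                          int offset, int width, int height, int sub )
// {
//     for( int y = 0; y < height; y++, dst += dst_stride, src += src_stride )
//         for( int x = 0; x < width; x++ )
//         {
//             int v = sub ? src[x] - offset : src[x] + offset;
//             dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
//         }
// }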
635
636
637
// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
638
function x264_mc_copy_w4_neon, export=1
639
1:
640
subs w4, w4, #4
641
ld1 {v0.s}[0], [x2], x3
642
ld1 {v1.s}[0], [x2], x3
643
ld1 {v2.s}[0], [x2], x3
644
ld1 {v3.s}[0], [x2], x3
645
st1 {v0.s}[0], [x0], x1
646
st1 {v1.s}[0], [x0], x1
647
st1 {v2.s}[0], [x0], x1
648
st1 {v3.s}[0], [x0], x1
649
b.gt 1b
650
ret
651
endfunc
652
653
function x264_mc_copy_w8_neon, export=1
654
1: subs w4, w4, #4
655
ld1 {v0.8b}, [x2], x3
656
ld1 {v1.8b}, [x2], x3
657
ld1 {v2.8b}, [x2], x3
658
ld1 {v3.8b}, [x2], x3
659
st1 {v0.8b}, [x0], x1
660
st1 {v1.8b}, [x0], x1
661
st1 {v2.8b}, [x0], x1
662
st1 {v3.8b}, [x0], x1
663
b.gt 1b
664
ret
665
endfunc
666
667
function x264_mc_copy_w16_neon, export=1
668
1: subs w4, w4, #4
669
ld1 {v0.16b}, [x2], x3
670
ld1 {v1.16b}, [x2], x3
671
ld1 {v2.16b}, [x2], x3
672
ld1 {v3.16b}, [x2], x3
673
st1 {v0.16b}, [x0], x1
674
st1 {v1.16b}, [x0], x1
675
st1 {v2.16b}, [x0], x1
676
st1 {v3.16b}, [x0], x1
677
b.gt 1b
678
ret
679
endfunc
680
681
// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
682
// intptr_t i_dst_stride,
683
// uint8_t *src, intptr_t i_src_stride,
684
// int dx, int dy, int i_width, int i_height );
685
function x264_mc_chroma_neon, export=1
686
ldr w15, [sp] // height
687
sbfx x12, x6, #3, #29 // asr(3) and sign extend
688
sbfx x11, x5, #3, #29 // asr(3) and sign extend
689
cmp w7, #4
690
mul x12, x12, x4
691
add x3, x3, x11, lsl #1
692
693
and w5, w5, #7
694
and w6, w6, #7
695
696
add x3, x3, x12
697
698
//pld [x3]
699
//pld [x3, x4]
700
701
b.gt mc_chroma_w8_neon
702
b.eq mc_chroma_w4_neon
703
endfunc
704
705
.macro CHROMA_MC_START r00, r01, r10, r11
706
mul w12, w5, w6 // cD = d8x *d8y
707
lsl w13, w5, #3
708
add w9, w12, #64
709
lsl w14, w6, #3
710
tst w12, w12
711
sub w9, w9, w13
712
sub w10, w13, w12 // cB = d8x *(8-d8y);
713
sub w11, w14, w12 // cC = (8-d8x)*d8y
714
sub w9, w9, w14 // cA = (8-d8x)*(8-d8y);
715
.endm
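// The four weights drive a 1/8-pel bilinear filter: every output sample is
// (cA*a + cB*b + cC*c + cD*d + 32) >> 6, where a..d are the four neighbouring
// source samples and cA+cB+cC+cD == 64.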
716
717
.macro CHROMA_MC width, vsize
718
function mc_chroma_w\width\()_neon
719
// since the element size varies, there's a different index for the 2nd store
720
.if \width == 4
721
.set st2, 1
722
.else
723
.set st2, 2
724
.endif
725
CHROMA_MC_START
726
b.eq 2f
727
728
ld2 {v28.8b,v29.8b}, [x3], x4
729
dup v0.8b, w9 // cA
730
dup v1.8b, w10 // cB
731
732
ext v6.8b, v28.8b, v6.8b, #1
733
ext v7.8b, v29.8b, v7.8b, #1
734
735
ld2 {v30.8b,v31.8b}, [x3], x4
736
dup v2.8b, w11 // cC
737
dup v3.8b, w12 // cD
738
739
ext v22.8b, v30.8b, v22.8b, #1
740
ext v23.8b, v31.8b, v23.8b, #1
741
742
trn1 v0.2s, v0.2s, v1.2s
743
trn1 v2.2s, v2.2s, v3.2s
744
745
trn1 v4.2s, v28.2s, v6.2s
746
trn1 v5.2s, v29.2s, v7.2s
747
trn1 v20.2s, v30.2s, v22.2s
748
trn1 v21.2s, v31.2s, v23.2s
749
1: // height loop, interpolate xy
750
subs w15, w15, #2
751
umull v16.8h, v4.8b, v0.8b
752
umlal v16.8h, v20.8b, v2.8b
753
umull v17.8h, v5.8b, v0.8b
754
umlal v17.8h, v21.8b, v2.8b
755
756
ld2 {v28.8b,v29.8b}, [x3], x4
757
transpose v24.2d, v25.2d, v16.2d, v17.2d
758
759
ext v6.8b, v28.8b, v6.8b, #1
760
ext v7.8b, v29.8b, v7.8b, #1
761
762
trn1 v4.2s, v28.2s, v6.2s
763
trn1 v5.2s, v29.2s, v7.2s
764
765
add v16.8h, v24.8h, v25.8h
766
767
umull v18.8h, v20.8b, v0.8b
768
umlal v18.8h, v4.8b, v2.8b
769
umull v19.8h, v21.8b, v0.8b
770
umlal v19.8h, v5.8b, v2.8b
771
772
ld2 {v30.8b,v31.8b}, [x3], x4
773
transpose v26.2d, v27.2d, v18.2d, v19.2d
774
775
ext v22.8b, v30.8b, v22.8b, #1
776
ext v23.8b, v31.8b, v23.8b, #1
777
trn1 v20.2s, v30.2s, v22.2s
778
trn1 v21.2s, v31.2s, v23.2s
779
780
add v17.8h, v26.8h, v27.8h
781
782
rshrn v16.8b, v16.8h, #6
783
rshrn v17.8b, v17.8h, #6
784
785
//pld [x3]
786
//pld [x3, x4]
787
788
st1 {v16.\vsize}[0], [x0], x2
789
st1 {v16.\vsize}[st2], [x1], x2
790
st1 {v17.\vsize}[0], [x0], x2
791
st1 {v17.\vsize}[st2], [x1], x2
792
b.gt 1b
793
794
ret
795
2: // dx or dy are 0
796
tst w11, w11
797
add w10, w10, w11
798
dup v0.8b, w9
799
dup v1.8b, w10
800
801
b.eq 4f
802
803
ld1 {v4.8b}, [x3], x4
804
ld1 {v6.8b}, [x3], x4
805
3: // vertical interpolation loop
806
subs w15, w15, #2
807
umull v16.8h, v4.8b, v0.8b
808
ld1 {v4.8b}, [x3], x4
809
umlal v16.8h, v6.8b, v1.8b
810
umull v17.8h, v6.8b, v0.8b
811
ld1 {v6.8b}, [x3], x4
812
umlal v17.8h, v4.8b, v1.8b
813
814
rshrn v20.8b, v16.8h, #6 // uvuvuvuv
815
rshrn v21.8b, v17.8h, #6 // uvuvuvuv
816
817
uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
818
uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
819
820
//pld [x3]
821
//pld [x3, x4]
822
823
st1 {v16.\vsize}[0], [x0], x2
824
st1 {v16.\vsize}[st2], [x0], x2
825
st1 {v17.\vsize}[0], [x1], x2
826
st1 {v17.\vsize}[st2], [x1], x2
827
b.gt 3b
828
829
ret
830
831
4: // dy is 0
832
ld1 {v4.8b,v5.8b}, [x3], x4
833
ld1 {v6.8b,v7.8b}, [x3], x4
834
835
ext v5.8b, v4.8b, v5.8b, #2
836
ext v7.8b, v6.8b, v7.8b, #2
837
5: // horizontal interpolation loop
838
subs w15, w15, #2
839
umull v16.8h, v4.8b, v0.8b
840
umlal v16.8h, v5.8b, v1.8b
841
umull v17.8h, v6.8b, v0.8b
842
umlal v17.8h, v7.8b, v1.8b
843
844
ld1 {v4.8b,v5.8b}, [x3], x4
845
ld1 {v6.8b,v7.8b}, [x3], x4
846
rshrn v20.8b, v16.8h, #6
847
rshrn v21.8b, v17.8h, #6
848
ext v5.8b, v4.8b, v5.8b, #2
849
ext v7.8b, v6.8b, v7.8b, #2
850
uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
851
uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
852
853
//pld [x3]
854
//pld [x3, x4]
855
856
st1 {v16.\vsize}[0], [x0], x2
857
st1 {v16.\vsize}[st2], [x0], x2
858
st1 {v17.\vsize}[0], [x1], x2
859
st1 {v17.\vsize}[st2], [x1], x2
860
b.gt 5b
861
862
ret
863
endfunc
864
.endm
865
866
CHROMA_MC 2, h
867
CHROMA_MC 4, s
868
869
function mc_chroma_w8_neon
870
CHROMA_MC_START
871
b.eq 2f
872
ld2 {v4.16b,v5.16b}, [x3], x4
873
ld2 {v20.16b,v21.16b}, [x3], x4
874
dup v0.8b, w9 // cA
875
dup v1.8b, w10 // cB
876
877
ext v6.16b, v4.16b, v4.16b, #1
878
ext v7.16b, v5.16b, v5.16b, #1
879
880
dup v2.8b, w11 // cC
881
dup v3.8b, w12 // cD
882
883
ext v22.16b, v20.16b, v20.16b, #1
884
ext v23.16b, v21.16b, v21.16b, #1
885
886
1: // height loop, interpolate xy
887
subs w15, w15, #2
888
umull v16.8h, v4.8b, v0.8b
889
umlal v16.8h, v6.8b, v1.8b
890
umlal v16.8h, v20.8b, v2.8b
891
umlal v16.8h, v22.8b, v3.8b
892
893
umull v17.8h, v5.8b, v0.8b
894
umlal v17.8h, v7.8b, v1.8b
895
umlal v17.8h, v21.8b, v2.8b
896
umlal v17.8h, v23.8b, v3.8b
897
898
ld2 {v4.16b,v5.16b}, [x3], x4
899
900
ext v6.16b, v4.16b, v4.16b, #1
901
ext v7.16b, v5.16b, v5.16b, #1
902
903
umull v18.8h, v20.8b, v0.8b
904
umlal v18.8h, v22.8b, v1.8b
905
umlal v18.8h, v4.8b, v2.8b
906
umlal v18.8h, v6.8b, v3.8b
907
908
umull v19.8h, v21.8b, v0.8b
909
umlal v19.8h, v23.8b, v1.8b
910
umlal v19.8h, v5.8b, v2.8b
911
umlal v19.8h, v7.8b, v3.8b
912
913
ld2 {v20.16b,v21.16b}, [x3], x4
914
915
rshrn v16.8b, v16.8h, #6
916
rshrn v17.8b, v17.8h, #6
917
rshrn v18.8b, v18.8h, #6
918
rshrn v19.8b, v19.8h, #6
919
920
ext v22.16b, v20.16b, v20.16b, #1
921
ext v23.16b, v21.16b, v21.16b, #1
922
923
//pld [x3]
924
//pld [x3, x4]
925
926
st1 {v16.8b}, [x0], x2
927
st1 {v17.8b}, [x1], x2
928
st1 {v18.8b}, [x0], x2
929
st1 {v19.8b}, [x1], x2
930
b.gt 1b
931
932
ret
933
2: // dx or dy are 0
934
tst w11, w11
935
add w10, w10, w11
936
dup v0.8b, w9
937
dup v1.8b, w10
938
939
b.eq 4f
940
941
ld2 {v4.8b,v5.8b}, [x3], x4
942
ld2 {v6.8b,v7.8b}, [x3], x4
943
3: // vertical interpolation loop
944
subs w15, w15, #2
945
umull v16.8h, v4.8b, v0.8b //U
946
umlal v16.8h, v6.8b, v1.8b
947
umull v17.8h, v5.8b, v0.8b //V
948
umlal v17.8h, v7.8b, v1.8b
949
950
ld2 {v4.8b,v5.8b}, [x3], x4
951
952
umull v18.8h, v6.8b, v0.8b
953
umlal v18.8h, v4.8b, v1.8b
954
umull v19.8h, v7.8b, v0.8b
955
umlal v19.8h, v5.8b, v1.8b
956
957
ld2 {v6.8b,v7.8b}, [x3], x4
958
959
rshrn v16.8b, v16.8h, #6
960
rshrn v17.8b, v17.8h, #6
961
rshrn v18.8b, v18.8h, #6
962
rshrn v19.8b, v19.8h, #6
963
964
//pld [x3]
965
//pld [x3, x4]
966
967
st1 {v16.8b}, [x0], x2
968
st1 {v17.8b}, [x1], x2
969
st1 {v18.8b}, [x0], x2
970
st1 {v19.8b}, [x1], x2
971
b.gt 3b
972
973
ret
974
4: // dy is 0
975
ld2 {v4.16b,v5.16b}, [x3], x4
976
ext v6.16b, v4.16b, v4.16b, #1
977
ext v7.16b, v5.16b, v5.16b, #1
978
ld2 {v20.16b,v21.16b}, [x3], x4
979
ext v22.16b, v20.16b, v20.16b, #1
980
ext v23.16b, v21.16b, v21.16b, #1
981
5: // horizontal interpolation loop
982
subs w15, w15, #2
983
umull v16.8h, v4.8b, v0.8b //U
984
umlal v16.8h, v6.8b, v1.8b
985
umull v17.8h, v5.8b, v0.8b //V
986
umlal v17.8h, v7.8b, v1.8b
987
988
ld2 {v4.16b,v5.16b}, [x3], x4
989
990
umull v18.8h, v20.8b, v0.8b
991
umlal v18.8h, v22.8b, v1.8b
992
umull v19.8h, v21.8b, v0.8b
993
umlal v19.8h, v23.8b, v1.8b
994
995
ld2 {v20.16b,v21.16b}, [x3], x4
996
997
rshrn v16.8b, v16.8h, #6
998
rshrn v17.8b, v17.8h, #6
999
rshrn v18.8b, v18.8h, #6
1000
rshrn v19.8b, v19.8h, #6
1001
1002
ext v6.16b, v4.16b, v4.16b, #1
1003
ext v7.16b, v5.16b, v5.16b, #1
1004
ext v22.16b, v20.16b, v20.16b, #1
1005
ext v23.16b, v21.16b, v21.16b, #1
1006
1007
//pld [x3]
1008
//pld [x3, x4]
1009
1010
st1 {v16.8b}, [x0], x2
1011
st1 {v17.8b}, [x1], x2
1012
st1 {v18.8b}, [x0], x2
1013
st1 {v19.8b}, [x1], x2
1014
b.gt 5b
1015
1016
ret
1017
endfunc
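// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): bilinear chroma MC on interleaved (NV12-style) source data,
// with mvx/mvy in 1/8-pel units.
// static void mc_chroma_c( uint8_t *dst_u, uint8_t *dst_v, intptr_t dst_stride,
//                          uint8_t *src, intptr_t src_stride,
//                          int mvx, int mvy, int width, int height )
// {
//     src += (mvy >> 3) * src_stride + (mvx >> 3) * 2;
//     int dx = mvx & 7, dy = mvy & 7;
//     int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;
//     for( int y = 0; y < height; y++, dst_u += dst_stride, dst_v += dst_stride, src += src_stride )
//         for( int x = 0; x < width; x++ )
//         {
//             const uint8_t *s = src + 2*x;
//             dst_u[x] = ( cA*s[0] + cB*s[2] + cC*s[src_stride  ] + cD*s[src_stride+2] + 32 ) >> 6;
//             dst_v[x] = ( cA*s[1] + cB*s[3] + cC*s[src_stride+1] + cD*s[src_stride+3] + 32 ) >> 6;
//         }
// }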
1018
1019
//void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
1020
// intptr_t stride, int width, int height, int16_t *buf )
1021
function x264_hpel_filter_neon, export=1
1022
ubfm x9, x3, #0, #3
1023
add w15, w5, w9
1024
sub x13, x3, x9 // align src
1025
sub x10, x0, x9
1026
sub x11, x1, x9
1027
sub x12, x2, x9
1028
movi v30.16b, #5
1029
movi v31.16b, #20
1030
1: // line start
1031
mov x3, x13
1032
mov x2, x12
1033
mov x1, x11
1034
mov x0, x10
1035
add x7, x3, #16 // src pointer next 16b for horiz filter
1036
mov x5, x15 // restore width
1037
sub x3, x3, x4, lsl #1 // src - 2*stride
1038
ld1 {v28.16b}, [x7], #16 // src[16:31]
1039
1040
add x9, x3, x5 // holds src - 2*stride + width
1041
1042
ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15]
1043
ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15]
1044
ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15]
1045
ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15]
1046
ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15]
1047
ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15]
1048
1049
ext v22.16b, v7.16b, v18.16b, #14
1050
uaddl v1.8h, v16.8b, v21.8b
1051
ext v26.16b, v18.16b, v28.16b, #3
1052
umlsl v1.8h, v17.8b, v30.8b
1053
ext v23.16b, v7.16b, v18.16b, #15
1054
umlal v1.8h, v18.8b, v31.8b
1055
ext v24.16b, v18.16b, v28.16b, #1
1056
umlal v1.8h, v19.8b, v31.8b
1057
ext v25.16b, v18.16b, v28.16b, #2
1058
umlsl v1.8h, v20.8b, v30.8b
1059
2: // next 16 pixel of line
1060
subs x5, x5, #16
1061
sub x3, x9, x5 // src - 2*stride += 16
1062
1063
uaddl v4.8h, v22.8b, v26.8b
1064
uaddl2 v5.8h, v22.16b, v26.16b
1065
sqrshrun v6.8b, v1.8h, #5
1066
umlsl v4.8h, v23.8b, v30.8b
1067
umlsl2 v5.8h, v23.16b, v30.16b
1068
umlal v4.8h, v18.8b, v31.8b
1069
umlal2 v5.8h, v18.16b, v31.16b
1070
umlal v4.8h, v24.8b, v31.8b
1071
umlal2 v5.8h, v24.16b, v31.16b
1072
umlsl v4.8h, v25.8b, v30.8b
1073
umlsl2 v5.8h, v25.16b, v30.16b
1074
1075
uaddl2 v2.8h, v16.16b, v21.16b
1076
sqrshrun v4.8b, v4.8h, #5
1077
mov v7.16b, v18.16b
1078
sqrshrun2 v4.16b, v5.8h, #5
1079
1080
umlsl2 v2.8h, v17.16b, v30.16b
1081
ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15]
1082
umlal2 v2.8h, v18.16b, v31.16b
1083
ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15]
1084
umlal2 v2.8h, v19.16b, v31.16b
1085
ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15]
1086
umlsl2 v2.8h, v20.16b, v30.16b
1087
ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15]
1088
st1 {v4.16b}, [x0], #16
1089
sqrshrun2 v6.16b, v2.8h, #5
1090
ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15]
1091
ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15]
1092
1093
ext v22.16b, v0.16b, v1.16b, #12
1094
ext v26.16b, v1.16b, v2.16b, #6
1095
ext v23.16b, v0.16b, v1.16b, #14
1096
st1 {v6.16b}, [x1], #16
1097
uaddl v3.8h, v16.8b, v21.8b
1098
ext v25.16b, v1.16b, v2.16b, #4
1099
umlsl v3.8h, v17.8b, v30.8b
1100
ext v24.16b, v1.16b, v2.16b, #2
1101
1102
umlal v3.8h, v18.8b, v31.8b
1103
add v4.8h, v22.8h, v26.8h
1104
umlal v3.8h, v19.8b, v31.8b
1105
add v5.8h, v23.8h, v25.8h
1106
umlsl v3.8h, v20.8b, v30.8b
1107
add v6.8h, v24.8h, v1.8h
1108
1109
ext v22.16b, v1.16b, v2.16b, #12
1110
ext v26.16b, v2.16b, v3.16b, #6
1111
ext v23.16b, v1.16b, v2.16b, #14
1112
ext v25.16b, v2.16b, v3.16b, #4
1113
ext v24.16b, v2.16b, v3.16b, #2
1114
1115
add v22.8h, v22.8h, v26.8h
1116
add v23.8h, v23.8h, v25.8h
1117
add v24.8h, v24.8h, v2.8h
1118
1119
sub v4.8h, v4.8h, v5.8h // a-b
1120
sub v5.8h, v5.8h, v6.8h // b-c
1121
1122
sub v22.8h, v22.8h, v23.8h // a-b
1123
sub v23.8h, v23.8h, v24.8h // b-c
1124
1125
sshr v4.8h, v4.8h, #2 // (a-b)/4
1126
sshr v22.8h, v22.8h, #2 // (a-b)/4
1127
sub v4.8h, v4.8h, v5.8h // (a-b)/4-b+c
1128
sub v22.8h, v22.8h, v23.8h // (a-b)/4-b+c
1129
sshr v4.8h, v4.8h, #2 // ((a-b)/4-b+c)/4
1130
sshr v22.8h, v22.8h, #2 // ((a-b)/4-b+c)/4
1131
add v4.8h, v4.8h, v6.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
1132
add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
1133
1134
sqrshrun v4.8b, v4.8h, #6
1135
ld1 {v28.16b}, [x7], #16 // src[16:31]
1136
mov v0.16b, v2.16b
1137
ext v23.16b, v7.16b, v18.16b, #15
1138
sqrshrun2 v4.16b, v22.8h, #6
1139
mov v1.16b, v3.16b
1140
ext v22.16b, v7.16b, v18.16b, #14
1141
ext v24.16b, v18.16b, v28.16b, #1
1142
ext v25.16b, v18.16b, v28.16b, #2
1143
ext v26.16b, v18.16b, v28.16b, #3
1144
1145
st1 {v4.16b}, [x2], #16
1146
b.gt 2b
1147
1148
subs w6, w6, #1
1149
add x10, x10, x4
1150
add x11, x11, x4
1151
add x12, x12, x4
1152
add x13, x13, x4
1153
b.gt 1b
1154
1155
ret
1156
endfunc
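// Summary of the three half-pel planes (illustrative, derived from the assembly above):
//   dsth[x] = clip( ( H(src, x) + 16 ) >> 5 )        6-tap filter along the row
//   dstv[x] = clip( ( V(src, x) + 16 ) >> 5 )        6-tap filter along the column
//   dstc[x] = clip( ( H(V(src), x) + 512 ) >> 10 )   both directions, via the 16-bit
//                                                    vertical filter results
// with the tap set {1, -5, 20, 20, -5, 1}. The ((a-b)/4 - b + c)/4 + c sequence in the
// loop evaluates (a - 5*b + 20*c)/16 without widening past 16 bits; the final
// sqrshrun #6 supplies the remaining factor of 64.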
1157
1158
// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
1159
// uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
1160
// intptr_t dst_stride, int width, int height )
1161
function x264_frame_init_lowres_core_neon, export=1
1162
ldr w8, [sp]
1163
sub x10, x6, w7, uxtw // dst_stride - width
1164
and x10, x10, #~15
1165
1166
1:
1167
mov w9, w7 // width
1168
mov x11, x0 // src0
1169
add x12, x0, x5 // src1 = src0 + src_stride
1170
add x13, x0, x5, lsl #1 // src2 = src1 + src_stride
1171
1172
ld2 {v0.16b,v1.16b}, [x11], #32
1173
ld2 {v2.16b,v3.16b}, [x12], #32
1174
ld2 {v4.16b,v5.16b}, [x13], #32
1175
1176
urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
1177
urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]
1178
2:
1179
subs w9, w9, #16
1180
urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
1181
urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
1182
1183
ld2 {v0.16b,v1.16b}, [x11], #32
1184
ld2 {v2.16b,v3.16b}, [x12], #32
1185
ld2 {v4.16b,v5.16b}, [x13], #32
1186
urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
1187
urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
1188
ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2]
1189
ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2]
1190
1191
urhadd v16.16b, v20.16b, v21.16b
1192
urhadd v18.16b, v22.16b, v23.16b
1193
urhadd v17.16b, v21.16b, v24.16b
1194
urhadd v19.16b, v23.16b, v25.16b
1195
1196
st1 {v16.16b}, [x1], #16
1197
st1 {v18.16b}, [x3], #16
1198
st1 {v17.16b}, [x2], #16
1199
st1 {v19.16b}, [x4], #16
1200
b.le 3f
1201
1202
subs w9, w9, #16
1203
urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
1204
urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
1205
1206
ld2 {v0.16b,v1.16b}, [x11], #32
1207
ld2 {v2.16b,v3.16b}, [x12], #32
1208
ld2 {v4.16b,v5.16b}, [x13], #32
1209
urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
1210
urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
1211
ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2]
1212
ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2]
1213
1214
urhadd v16.16b, v30.16b, v21.16b
1215
urhadd v18.16b, v31.16b, v23.16b
1216
urhadd v17.16b, v21.16b, v24.16b
1217
urhadd v19.16b, v23.16b, v25.16b
1218
1219
st1 {v16.16b}, [x1], #16
1220
st1 {v18.16b}, [x3], #16
1221
st1 {v17.16b}, [x2], #16
1222
st1 {v19.16b}, [x4], #16
1223
b.gt 2b
1224
3:
1225
subs w8, w8, #1
1226
add x0, x0, x5, lsl #1
1227
add x1, x1, x10
1228
add x2, x2, x10
1229
add x3, x3, x10
1230
add x4, x4, x10
1231
b.gt 1b
1232
1233
ret
1234
endfunc
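// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): each lowres plane halves the resolution with a chain of
// rounded halving adds (urhadd), offset by 0/1 pixel horizontally (h), vertically (v)
// or both (c).
// #define RHADD(a,b) ( ((a) + (b) + 1) >> 1 )
// static void frame_init_lowres_core_c( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
//                                       uint8_t *dstv, uint8_t *dstc,
//                                       intptr_t src_stride, intptr_t dst_stride,
//                                       int width, int height )
// {
//     for( int y = 0; y < height; y++, src0 += 2*src_stride,
//          dst0 += dst_stride, dsth += dst_stride, dstv += dst_stride, dstc += dst_stride )
//     {
//         uint8_t *s0 = src0, *s1 = src0 + src_stride, *s2 = s1 + src_stride;
//         for( int x = 0; x < width; x++ )
//         {
//             dst0[x] = RHADD( RHADD(s0[2*x  ], s1[2*x  ]), RHADD(s0[2*x+1], s1[2*x+1]) );
//             dsth[x] = RHADD( RHADD(s0[2*x+1], s1[2*x+1]), RHADD(s0[2*x+2], s1[2*x+2]) );
//             dstv[x] = RHADD( RHADD(s1[2*x  ], s2[2*x  ]), RHADD(s1[2*x+1], s2[2*x+1]) );
//             dstc[x] = RHADD( RHADD(s1[2*x+1], s2[2*x+1]), RHADD(s1[2*x+2], s2[2*x+2]) );
//         }
//     }
// }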
1235
1236
function x264_load_deinterleave_chroma_fenc_neon, export=1
1237
mov x4, #FENC_STRIDE/2
1238
b load_deinterleave_chroma
1239
endfunc
1240
1241
function x264_load_deinterleave_chroma_fdec_neon, export=1
1242
mov x4, #FDEC_STRIDE/2
1243
load_deinterleave_chroma:
1244
ld2 {v0.8b,v1.8b}, [x1], x2
1245
ld2 {v2.8b,v3.8b}, [x1], x2
1246
subs w3, w3, #2
1247
st1 {v0.8b}, [x0], x4
1248
st1 {v1.8b}, [x0], x4
1249
st1 {v2.8b}, [x0], x4
1250
st1 {v3.8b}, [x0], x4
1251
b.gt load_deinterleave_chroma
1252
1253
ret
1254
endfunc
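// Illustrative C sketch (not part of the build; the C form and names are ours, derived
// from the assembly above): NV12 chroma rows are split so the U and V halves of each
// row end up side by side, FENC_STRIDE/2 (resp. FDEC_STRIDE/2) bytes apart.
// static void load_deinterleave_chroma_c( uint8_t *dst, uint8_t *src, intptr_t src_stride,
//                                         intptr_t half_stride, int height )
// {
//     for( int y = 0; y < height; y++, src += src_stride, dst += 2*half_stride )
//         for( int x = 0; x < 8; x++ )
//         {
//             dst[x]               = src[2*x];      // U half of the row
//             dst[half_stride + x] = src[2*x+1];    // V half of the row
//         }
// }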
1255
1256
function x264_plane_copy_neon, export=1
1257
add x8, x4, #15
1258
and x4, x8, #~15
1259
sub x1, x1, x4
1260
sub x3, x3, x4
1261
1:
1262
mov w8, w4
1263
16:
1264
tst w8, #16
1265
b.eq 32f
1266
subs w8, w8, #16
1267
ldr q0, [x2], #16
1268
str q0, [x0], #16
1269
b.eq 0f
1270
32:
1271
subs w8, w8, #32
1272
ldp q0, q1, [x2], #32
1273
stp q0, q1, [x0], #32
1274
b.gt 32b
1275
0:
1276
subs w5, w5, #1
1277
add x2, x2, x3
1278
add x0, x0, x1
1279
b.gt 1b
1280
1281
ret
1282
endfunc
1283
1284
function x264_plane_copy_deinterleave_neon, export=1
1285
add w9, w6, #15
1286
and w9, w9, #0xfffffff0
1287
sub x1, x1, x9
1288
sub x3, x3, x9
1289
sub x5, x5, x9, lsl #1
1290
1:
1291
ld2 {v0.16b,v1.16b}, [x4], #32
1292
subs w9, w9, #16
1293
st1 {v0.16b}, [x0], #16
1294
st1 {v1.16b}, [x2], #16
1295
b.gt 1b
1296
1297
add x4, x4, x5
1298
subs w7, w7, #1
1299
add x0, x0, x1
1300
add x2, x2, x3
1301
mov w9, w6
1302
b.gt 1b
1303
1304
ret
1305
endfunc
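// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): split an interleaved UV plane into separate U and V planes.
// The NEON loop rounds the width up to a multiple of 16, so it relies on the usual
// x264 buffer padding.
// static void plane_copy_deinterleave_c( uint8_t *dstu, intptr_t i_dstu,
//                                        uint8_t *dstv, intptr_t i_dstv,
//                                        uint8_t *src, intptr_t i_src, int w, int h )
// {
//     for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
//         for( int x = 0; x < w; x++ )
//         {
//             dstu[x] = src[2*x];
//             dstv[x] = src[2*x+1];
//         }
// }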
1306
1307
.macro deinterleave_rgb
1308
subs x11, x11, #8
1309
st1 {v0.8b}, [x0], #8
1310
st1 {v1.8b}, [x2], #8
1311
st1 {v2.8b}, [x4], #8
1312
b.gt 1b
1313
1314
subs w10, w10, #1
1315
add x0, x0, x1
1316
add x2, x2, x3
1317
add x4, x4, x5
1318
add x6, x6, x7
1319
mov x11, x9
1320
b.gt 1b
1321
.endm
1322
1323
function x264_plane_copy_deinterleave_rgb_neon, export=1
1324
#if SYS_MACOSX
1325
ldr w8, [sp]
1326
ldp w9, w10, [sp, #4]
1327
#else
1328
ldr x8, [sp]
1329
ldp x9, x10, [sp, #8]
1330
#endif
1331
cmp w8, #3
1332
uxtw x9, w9
1333
add x11, x9, #7
1334
and x11, x11, #~7
1335
sub x1, x1, x11
1336
sub x3, x3, x11
1337
sub x5, x5, x11
1338
b.ne 4f
1339
sub x7, x7, x11, lsl #1
1340
sub x7, x7, x11
1341
1:
1342
ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24
1343
deinterleave_rgb
1344
1345
ret
1346
4:
1347
sub x7, x7, x11, lsl #2
1348
1:
1349
ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
1350
deinterleave_rgb
1351
1352
ret
1353
endfunc
1354
1355
function x264_plane_copy_interleave_neon, export=1
1356
add w9, w6, #15
1357
and w9, w9, #0xfffffff0
1358
sub x1, x1, x9, lsl #1
1359
sub x3, x3, x9
1360
sub x5, x5, x9
1361
1:
1362
ld1 {v0.16b}, [x2], #16
1363
ld1 {v1.16b}, [x4], #16
1364
subs w9, w9, #16
1365
st2 {v0.16b,v1.16b}, [x0], #32
1366
b.gt 1b
1367
1368
subs w7, w7, #1
1369
add x0, x0, x1
1370
add x2, x2, x3
1371
add x4, x4, x5
1372
mov w9, w6
1373
b.gt 1b
1374
1375
ret
1376
endfunc
1377
1378
function x264_store_interleave_chroma_neon, export=1
1379
mov x5, #FDEC_STRIDE
1380
1:
1381
ld1 {v0.8b}, [x2], x5
1382
ld1 {v1.8b}, [x3], x5
1383
ld1 {v2.8b}, [x2], x5
1384
ld1 {v3.8b}, [x3], x5
1385
subs w4, w4, #2
1386
zip1 v4.16b, v0.16b, v1.16b
1387
zip1 v5.16b, v2.16b, v3.16b
1388
st1 {v4.16b}, [x0], x1
1389
st1 {v5.16b}, [x0], x1
1390
b.gt 1b
1391
1392
ret
1393
endfunc
1394
1395
.macro integral4h p1, p2
1396
ext v1.8b, \p1\().8b, \p2\().8b, #1
1397
ext v2.8b, \p1\().8b, \p2\().8b, #2
1398
ext v3.8b, \p1\().8b, \p2\().8b, #3
1399
uaddl v0.8h, \p1\().8b, v1.8b
1400
uaddl v4.8h, v2.8b, v3.8b
1401
add v0.8h, v0.8h, v4.8h
1402
add v0.8h, v0.8h, v5.8h
1403
.endm
1404
1405
function integral_init4h_neon, export=1
1406
sub x3, x0, x2, lsl #1
1407
ld1 {v6.8b,v7.8b}, [x1], #16
1408
1:
1409
subs x2, x2, #16
1410
ld1 {v5.8h}, [x3], #16
1411
integral4h v6, v7
1412
ld1 {v6.8b}, [x1], #8
1413
ld1 {v5.8h}, [x3], #16
1414
st1 {v0.8h}, [x0], #16
1415
integral4h v7, v6
1416
ld1 {v7.8b}, [x1], #8
1417
st1 {v0.8h}, [x0], #16
1418
b.gt 1b
1419
ret
1420
endfunc
1421
1422
.macro integral8h p1, p2, s
1423
ext v1.8b, \p1\().8b, \p2\().8b, #1
1424
ext v2.8b, \p1\().8b, \p2\().8b, #2
1425
ext v3.8b, \p1\().8b, \p2\().8b, #3
1426
ext v4.8b, \p1\().8b, \p2\().8b, #4
1427
ext v5.8b, \p1\().8b, \p2\().8b, #5
1428
ext v6.8b, \p1\().8b, \p2\().8b, #6
1429
ext v7.8b, \p1\().8b, \p2\().8b, #7
1430
uaddl v0.8h, \p1\().8b, v1.8b
1431
uaddl v2.8h, v2.8b, v3.8b
1432
uaddl v4.8h, v4.8b, v5.8b
1433
uaddl v6.8h, v6.8b, v7.8b
1434
add v0.8h, v0.8h, v2.8h
1435
add v4.8h, v4.8h, v6.8h
1436
add v0.8h, v0.8h, v4.8h
1437
add v0.8h, v0.8h, \s\().8h
1438
.endm
1439
1440
function integral_init8h_neon, export=1
1441
sub x3, x0, x2, lsl #1
1442
ld1 {v16.8b,v17.8b}, [x1], #16
1443
1:
1444
subs x2, x2, #16
1445
ld1 {v18.8h}, [x3], #16
1446
integral8h v16, v17, v18
1447
ld1 {v16.8b}, [x1], #8
1448
ld1 {v18.8h}, [x3], #16
1449
st1 {v0.8h}, [x0], #16
1450
integral8h v17, v16, v18
1451
ld1 {v17.8b}, [x1], #8
1452
st1 {v0.8h}, [x0], #16
1453
b.gt 1b
1454
ret
1455
endfunc
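// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above): the horizontal integral passes add a 4- or 8-pixel row sum
// to the entry one row above.
// static void integral_init4h_c( uint16_t *sum, uint8_t *pix, intptr_t stride )
// {
//     for( int x = 0; x < stride - 4; x++ )
//         sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum[x - stride];
// }
// integral_init8h is the same pattern with eight taps instead of four.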
1456
1457
function integral_init4v_neon, export=1
1458
mov x3, x0
1459
add x4, x0, x2, lsl #3
1460
add x8, x0, x2, lsl #4
1461
sub x2, x2, #8
1462
ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48
1463
ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48
1464
1:
1465
subs x2, x2, #16
1466
ld1 {v24.8h,v25.8h}, [x4], #32
1467
ext v0.16b, v20.16b, v21.16b, #8
1468
ext v1.16b, v21.16b, v22.16b, #8
1469
ext v2.16b, v16.16b, v17.16b, #8
1470
ext v3.16b, v17.16b, v18.16b, #8
1471
sub v24.8h, v24.8h, v20.8h
1472
sub v25.8h, v25.8h, v21.8h
1473
add v0.8h, v0.8h, v20.8h
1474
add v1.8h, v1.8h, v21.8h
1475
add v2.8h, v2.8h, v16.8h
1476
add v3.8h, v3.8h, v17.8h
1477
st1 {v24.8h}, [x1], #16
1478
st1 {v25.8h}, [x1], #16
1479
mov v20.16b, v22.16b
1480
mov v16.16b, v18.16b
1481
sub v0.8h, v2.8h, v0.8h
1482
sub v1.8h, v3.8h, v1.8h
1483
ld1 {v21.8h,v22.8h}, [x3], #32
1484
ld1 {v17.8h,v18.8h}, [x8], #32
1485
st1 {v0.8h}, [x0], #16
1486
st1 {v1.8h}, [x0], #16
1487
b.gt 1b
1488
2:
1489
ret
1490
endfunc
1491
1492
function integral_init8v_neon, export=1
1493
add x2, x0, x1, lsl #4
1494
sub x1, x1, #8
1495
ands x3, x1, #16 - 1
1496
b.eq 1f
1497
subs x1, x1, #8
1498
ld1 {v0.8h}, [x0]
1499
ld1 {v2.8h}, [x2], #16
1500
sub v4.8h, v2.8h, v0.8h
1501
st1 {v4.8h}, [x0], #16
1502
b.le 2f
1503
1:
1504
subs x1, x1, #16
1505
ld1 {v0.8h,v1.8h}, [x0]
1506
ld1 {v2.8h,v3.8h}, [x2], #32
1507
sub v4.8h, v2.8h, v0.8h
1508
sub v5.8h, v3.8h, v1.8h
1509
st1 {v4.8h}, [x0], #16
1510
st1 {v5.8h}, [x0], #16
1511
b.gt 1b
1512
2:
1513
ret
1514
endfunc
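// Illustrative C sketch (not part of the build; the C forms and names are ours, derived
// from the assembly above): the vertical passes turn the row sums into 4x4 and 8x8
// block sums by differencing rows 4 resp. 8 lines apart.
// static void integral_init4v_c( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
// {
//     for( int x = 0; x < stride - 8; x++ )
//     {
//         sum4[x] = sum8[x + 4*stride] - sum8[x];
//         sum8[x] = sum8[x + 8*stride] + sum8[x + 8*stride + 4] - sum8[x] - sum8[x + 4];
//     }
// }
// static void integral_init8v_c( uint16_t *sum8, intptr_t stride )
// {
//     for( int x = 0; x < stride - 8; x++ )
//         sum8[x] = sum8[x + 8*stride] - sum8[x];
// }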
1515
1516
function x264_mbtree_propagate_cost_neon, export=1
1517
ld1r {v5.4s}, [x5]
1518
8:
1519
subs w6, w6, #8
1520
ld1 {v1.8h}, [x1], #16
1521
ld1 {v2.8h}, [x2], #16
1522
ld1 {v3.8h}, [x3], #16
1523
ld1 {v4.8h}, [x4], #16
1524
bic v3.8h, #0xc0, lsl #8
1525
umin v3.8h, v2.8h, v3.8h
1526
umull v20.4s, v2.4h, v4.4h // propagate_intra
1527
umull2 v21.4s, v2.8h, v4.8h // propagate_intra
1528
usubl v22.4s, v2.4h, v3.4h // propagate_num
1529
usubl2 v23.4s, v2.8h, v3.8h // propagate_num
1530
uxtl v26.4s, v2.4h // propagate_denom
1531
uxtl2 v27.4s, v2.8h // propagate_denom
1532
uxtl v24.4s, v1.4h
1533
uxtl2 v25.4s, v1.8h
1534
ucvtf v20.4s, v20.4s
1535
ucvtf v21.4s, v21.4s
1536
ucvtf v26.4s, v26.4s
1537
ucvtf v27.4s, v27.4s
1538
ucvtf v22.4s, v22.4s
1539
ucvtf v23.4s, v23.4s
1540
frecpe v28.4s, v26.4s
1541
frecpe v29.4s, v27.4s
1542
ucvtf v24.4s, v24.4s
1543
ucvtf v25.4s, v25.4s
1544
frecps v30.4s, v28.4s, v26.4s
1545
frecps v31.4s, v29.4s, v27.4s
1546
fmla v24.4s, v20.4s, v5.4s // propagate_amount
1547
fmla v25.4s, v21.4s, v5.4s // propagate_amount
1548
fmul v28.4s, v28.4s, v30.4s
1549
fmul v29.4s, v29.4s, v31.4s
1550
fmul v16.4s, v24.4s, v22.4s
1551
fmul v17.4s, v25.4s, v23.4s
1552
fmul v18.4s, v16.4s, v28.4s
1553
fmul v19.4s, v17.4s, v29.4s
1554
fcvtns v20.4s, v18.4s
1555
fcvtns v21.4s, v19.4s
1556
sqxtn v0.4h, v20.4s
1557
sqxtn2 v0.8h, v21.4s
1558
st1 {v0.8h}, [x0], #16
1559
b.gt 8b
1560
ret
1561
endfunc
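// Illustrative C sketch (not part of the build; the C form and name are ours, derived
// from the assembly above; the asm replaces the division with a frecpe/frecps
// reciprocal, rounds with fcvtns and saturates to int16 with sqxtn):
// static void mbtree_propagate_cost_c( int16_t *dst, uint16_t *propagate_in,
//                                      uint16_t *intra_costs, uint16_t *inter_costs,
//                                      uint16_t *inv_qscales, float *fps_factor, int len )
// {
//     float fps = *fps_factor;
//     for( int i = 0; i < len; i++ )
//     {
//         int intra = intra_costs[i];
//         int inter = inter_costs[i] & 0x3fff;    // strip the lists-used flags (bic #0xc0, lsl #8)
//         if( inter > intra ) inter = intra;      // umin
//         float amount = propagate_in[i] + intra * (float)inv_qscales[i] * fps;
//         dst[i] = intra ? (int16_t)( amount * (intra - inter) / intra + 0.5f ) : 0;
//     }
// }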
1562
1563
const pw_0to15, align=5
1564
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1565
endconst
1566
1567
function x264_mbtree_propagate_list_internal_neon, export=1
1568
movrel x11, pw_0to15
1569
dup v31.8h, w4 // bipred_weight
1570
movi v30.8h, #0xc0, lsl #8
1571
ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y
1572
movi v28.4s, #4
1573
movi v27.8h, #31
1574
movi v26.8h, #32
1575
dup v24.8h, w5 // mb_y
1576
zip1 v29.8h, v29.8h, v24.8h
1577
8:
1578
subs w6, w6, #8
1579
ld1 {v1.8h}, [x1], #16 // propagate_amount
1580
ld1 {v2.8h}, [x2], #16 // lowres_cost
1581
and v2.16b, v2.16b, v30.16b
1582
cmeq v25.8h, v2.8h, v30.8h
1583
umull v16.4s, v1.4h, v31.4h
1584
umull2 v17.4s, v1.8h, v31.8h
1585
rshrn v16.4h, v16.4s, #6
1586
rshrn2 v16.8h, v17.4s, #6
1587
bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
1588
// propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
1589
ld1 {v4.8h,v5.8h}, [x0], #32
1590
sshr v6.8h, v4.8h, #5
1591
sshr v7.8h, v5.8h, #5
1592
add v6.8h, v6.8h, v29.8h
1593
add v29.8h, v29.8h, v28.8h
1594
add v7.8h, v7.8h, v29.8h
1595
add v29.8h, v29.8h, v28.8h
1596
st1 {v6.8h,v7.8h}, [x3], #32
1597
and v4.16b, v4.16b, v27.16b
1598
and v5.16b, v5.16b, v27.16b
1599
uzp1 v6.8h, v4.8h, v5.8h // x & 31
1600
uzp2 v7.8h, v4.8h, v5.8h // y & 31
1601
sub v4.8h, v26.8h, v6.8h // 32 - (x & 31)
1602
sub v5.8h, v26.8h, v7.8h // 32 - (y & 31)
1603
mul v19.8h, v6.8h, v7.8h // idx3weight = y*x;
1604
mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x);
1605
mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x;
1606
mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ;
1607
umull v6.4s, v19.4h, v25.4h
1608
umull2 v7.4s, v19.8h, v25.8h
1609
umull v4.4s, v18.4h, v25.4h
1610
umull2 v5.4s, v18.8h, v25.8h
1611
umull v2.4s, v17.4h, v25.4h
1612
umull2 v3.4s, v17.8h, v25.8h
1613
umull v0.4s, v16.4h, v25.4h
1614
umull2 v1.4s, v16.8h, v25.8h
1615
rshrn v19.4h, v6.4s, #10
1616
rshrn2 v19.8h, v7.4s, #10
1617
rshrn v18.4h, v4.4s, #10
1618
rshrn2 v18.8h, v5.4s, #10
1619
rshrn v17.4h, v2.4s, #10
1620
rshrn2 v17.8h, v3.4s, #10
1621
rshrn v16.4h, v0.4s, #10
1622
rshrn2 v16.8h, v1.4s, #10
1623
zip1 v0.8h, v16.8h, v17.8h
1624
zip2 v1.8h, v16.8h, v17.8h
1625
zip1 v2.8h, v18.8h, v19.8h
1626
zip2 v3.8h, v18.8h, v19.8h
1627
st1 {v0.8h,v1.8h}, [x3], #32
1628
st1 {v2.8h,v3.8h}, [x3], #32
1629
b.ge 8b
1630
ret
1631
endfunc
1632
1633
function x264_memcpy_aligned_neon, export=1
1634
tst x2, #16
1635
b.eq 32f
1636
sub x2, x2, #16
1637
ldr q0, [x1], #16
1638
str q0, [x0], #16
1639
32:
1640
tst x2, #32
1641
b.eq 640f
1642
sub x2, x2, #32
1643
ldp q0, q1, [x1], #32
1644
stp q0, q1, [x0], #32
1645
640:
1646
cbz x2, 1f
1647
64:
1648
subs x2, x2, #64
1649
ldp q0, q1, [x1, #32]
1650
ldp q2, q3, [x1], #64
1651
stp q0, q1, [x0, #32]
1652
stp q2, q3, [x0], #64
1653
b.gt 64b
1654
1:
1655
ret
1656
endfunc
1657
1658
function x264_memzero_aligned_neon, export=1
1659
movi v0.16b, #0
1660
movi v1.16b, #0
1661
1:
1662
subs x1, x1, #128
1663
stp q0, q1, [x0, #96]
1664
stp q0, q1, [x0, #64]
1665
stp q0, q1, [x0, #32]
1666
stp q0, q1, [x0], #128
1667
b.gt 1b
1668
ret
1669
endfunc