// NOTE(review): the following header lines are web-scrape residue (CoCalc page
// chrome), not part of predict.S — remove when restoring the original file.
1
/*****************************************************************************
2
* predict.S: aarch64 intra prediction
3
*****************************************************************************
4
* Copyright (C) 2009-2016 x264 project
5
*
6
* Authors: David Conrad <[email protected]>
7
* Mans Rullgard <[email protected]>
8
* Janne Grunau <[email protected]>
9
*
10
* This program is free software; you can redistribute it and/or modify
11
* it under the terms of the GNU General Public License as published by
12
* the Free Software Foundation; either version 2 of the License, or
13
* (at your option) any later version.
14
*
15
* This program is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
* GNU General Public License for more details.
19
*
20
* You should have received a copy of the GNU General Public License
21
* along with this program; if not, write to the Free Software
22
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23
*
24
* This program is also available under a commercial proprietary license.
25
* For more information, contact us at [email protected].
26
*****************************************************************************/
27
28
#include "asm.S"
29
30
// Weight vector {1,2,3,4,1,2,3,4}, used by the 8x8c planar (p) predictor
// to compute the H/V gradient sums over 4 neighbour differences.
const p8weight, align=4
.short 1, 2, 3, 4, 1, 2, 3, 4
endconst
33
// Weight vector {1..8}, used by the planar predictors (8x16c, 16x16, 8x8c)
// for the weighted H/V gradient sums over 8 neighbour differences.
const p16weight, align=4
.short 1, 2, 3, 4, 5, 6, 7, 8
endconst
36
37
// Load a column of \n bytes into vector \vd, one byte per row:
//   \xn = address of first byte, advanced by stride \xm after each load.
// n=4, hi=0 fills lanes [0..3]; n=4, hi=1 fills lanes [4..7]; n=8 fills all 8.
// On return \xn has advanced past the loaded rows.
.macro ldcol.8 vd, xn, xm, n=8, hi=0
.if \n == 8 || \hi == 0
    ld1     {\vd\().b}[0], [\xn], \xm
    ld1     {\vd\().b}[1], [\xn], \xm
    ld1     {\vd\().b}[2], [\xn], \xm
    ld1     {\vd\().b}[3], [\xn], \xm
.endif
.if \n == 8 || \hi == 1
    ld1     {\vd\().b}[4], [\xn], \xm
    ld1     {\vd\().b}[5], [\xn], \xm
    ld1     {\vd\().b}[6], [\xn], \xm
    ld1     {\vd\().b}[7], [\xn], \xm
.endif
.endm
51
52
// Load a column of 16 bytes (one per row, stride \xm) into all 16 lanes
// of vector \vd; lanes [0..7] via ldcol.8, lanes [8..15] explicitly.
.macro ldcol.16 vd, xn, xm
    ldcol.8 \vd, \xn, \xm
    ld1     {\vd\().b}[ 8], [\xn], \xm
    ld1     {\vd\().b}[ 9], [\xn], \xm
    ld1     {\vd\().b}[10], [\xn], \xm
    ld1     {\vd\().b}[11], [\xn], \xm
    ld1     {\vd\().b}[12], [\xn], \xm
    ld1     {\vd\().b}[13], [\xn], \xm
    ld1     {\vd\().b}[14], [\xn], \xm
    ld1     {\vd\().b}[15], [\xn], \xm
.endm
63
64
65
// 4x4 horizontal prediction: each row is filled with its left neighbour.
// In: x0 = dst (FDEC buffer, stride FDEC_STRIDE from asm.S).
// Pure GPR version: replicate the left byte across a 32-bit word by
// multiplying with 0x01010101, then store one word per row.
function x264_predict_4x4_h_aarch64, export=1
    ldrb    w1, [x0, #0*FDEC_STRIDE-1]      // left neighbour of row 0
    mov     w5, #0x01010101                 // byte-splat multiplier
    ldrb    w2, [x0, #1*FDEC_STRIDE-1]
    ldrb    w3, [x0, #2*FDEC_STRIDE-1]
    mul     w1, w1, w5                      // row 0 = left byte x4
    ldrb    w4, [x0, #3*FDEC_STRIDE-1]
    mul     w2, w2, w5
    str     w1, [x0, #0*FDEC_STRIDE]
    mul     w3, w3, w5
    str     w2, [x0, #1*FDEC_STRIDE]
    mul     w4, w4, w5
    str     w3, [x0, #2*FDEC_STRIDE]
    str     w4, [x0, #3*FDEC_STRIDE]
    ret
endfunc
81
82
// 4x4 vertical prediction: copy the 4 top-neighbour bytes into all 4 rows.
// In: x0 = dst (FDEC buffer).
function x264_predict_4x4_v_aarch64, export=1
    ldr     w1, [x0, #0 - 1 * FDEC_STRIDE]  // 4 bytes above the block
    str     w1, [x0, #0 + 0 * FDEC_STRIDE]
    str     w1, [x0, #0 + 1 * FDEC_STRIDE]
    str     w1, [x0, #0 + 2 * FDEC_STRIDE]
    str     w1, [x0, #0 + 3 * FDEC_STRIDE]
    ret
endfunc
90
91
// 4x4 DC prediction: every pixel = (sum of 4 top + 4 left neighbours + 4) >> 3.
// In: x0 = dst (FDEC buffer).
function x264_predict_4x4_dc_neon, export=1
    sub     x1, x0, #FDEC_STRIDE
    ldrb    w4, [x0, #-1 + 0 * FDEC_STRIDE] // 4 left neighbours
    ldrb    w5, [x0, #-1 + 1 * FDEC_STRIDE]
    ldrb    w6, [x0, #-1 + 2 * FDEC_STRIDE]
    ldrb    w7, [x0, #-1 + 3 * FDEC_STRIDE]
    add     w4, w4, w5
    ldr     s0, [x1]                        // 4 top neighbours (rest of v0 zeroed)
    add     w6, w6, w7
    uaddlv  h0, v0.8b                       // top sum (upper 4 lanes are zero)
    add     w4, w4, w6                      // left sum
    dup     v0.4h, v0.h[0]
    dup     v1.4h, w4
    add     v0.4h, v0.4h, v1.4h             // top + left
    rshrn   v0.8b, v0.8h, #3                // (sum + 4) >> 3, splat as bytes
    str     s0, [x0]
    str     s0, [x0, #1 * FDEC_STRIDE]
    str     s0, [x0, #2 * FDEC_STRIDE]
    str     s0, [x0, #3 * FDEC_STRIDE]
    ret
endfunc
112
113
// 4x4 DC-top prediction (only top neighbours available):
// every pixel = (sum of 4 top neighbours + 2) >> 2.
// In: x0 = dst (FDEC buffer).
// Fix: removed a duplicated `ret` that followed the first one — it was
// unreachable dead code (likely a copy/paste remnant).
function x264_predict_4x4_dc_top_neon, export=1
    sub     x1, x0, #FDEC_STRIDE
    ldr     s0, [x1]                        // 4 top bytes; rest of v0 zeroed
    uaddlv  h0, v0.8b                       // sum (upper lanes contribute 0)
    dup     v0.4h, v0.h[0]
    rshrn   v0.8b, v0.8h, #2                // (sum + 2) >> 2
    str     s0, [x0]
    str     s0, [x0, #1 * FDEC_STRIDE]
    str     s0, [x0, #2 * FDEC_STRIDE]
    str     s0, [x0, #3 * FDEC_STRIDE]
    ret
endfunc
126
127
// 4x4 diagonal-down-right prediction. Gathers top-left, top and left
// neighbours into one vector, applies the (p[-1]+2*p[0]+p[1]+2)>>2 filter,
// then stores shifted 4-byte windows per row.
// In: x0 = dst (FDEC buffer).
function x264_predict_4x4_ddr_neon, export=1
    sub     x1, x0, #FDEC_STRIDE+1          // -> top-left corner
    mov     x7, #FDEC_STRIDE
    ld1     {v0.8b}, [x1], x7               // # -FDEC_STRIDE-1
    ld1r    {v1.8b}, [x1], x7               // #0*FDEC_STRIDE-1
    ld1r    {v2.8b}, [x1], x7               // #1*FDEC_STRIDE-1
    ext     v0.8b, v1.8b, v0.8b, #7         // prepend left row 0 byte
    ld1r    {v3.8b}, [x1], x7               // #2*FDEC_STRIDE-1
    ext     v0.8b, v2.8b, v0.8b, #7         // a
    ld1r    {v4.8b}, [x1], x7               // #3*FDEC_STRIDE-1
    ext     v1.8b, v3.8b, v0.8b, #7         // b  (a shifted by one)
    ext     v2.8b, v4.8b, v1.8b, #7         // c  (b shifted by one)
    uaddl   v0.8h, v0.8b, v1.8b
    uaddl   v1.8h, v1.8b, v2.8b
    add     v0.8h, v0.8h, v1.8h             // a + 2b + c
    rshrn   v0.8b, v0.8h, #2                // (a + 2b + c + 2) >> 2

    // Row r uses the filtered vector shifted right by (3 - r) bytes.
    ext     v3.8b, v0.8b, v0.8b, #3
    ext     v2.8b, v0.8b, v0.8b, #2
    ext     v1.8b, v0.8b, v0.8b, #1

    str     s3, [x0], #FDEC_STRIDE
    str     s2, [x0], #FDEC_STRIDE
    str     s1, [x0], #FDEC_STRIDE
    str     s0, [x0]
    ret
endfunc
154
155
// 4x4 diagonal-down-left prediction. Filters the 8 top/top-right
// neighbours with (p[-1]+2*p[0]+p[1]+2)>>2 (last byte replicated),
// then each row is the result shifted left by the row index.
// In: x0 = dst (FDEC buffer).
function x264_predict_4x4_ddl_neon, export=1
    sub     x0, x0, #FDEC_STRIDE
    mov     x7, #FDEC_STRIDE
    ld1     {v0.8b}, [x0], x7               // top + top-right neighbours
    dup     v3.8b, v0.b[7]                  // replicate last byte for the edge
    ext     v1.8b, v0.8b, v0.8b, #1
    ext     v2.8b, v0.8b, v3.8b, #2
    uhadd   v0.8b, v0.8b, v2.8b             // (p[-1] + p[1]) >> 1
    urhadd  v0.8b, v0.8b, v1.8b             // average with p[0], rounding
    str     s0, [x0], #FDEC_STRIDE
    ext     v1.8b, v0.8b, v0.8b, #1
    ext     v2.8b, v0.8b, v0.8b, #2
    str     s1, [x0], #FDEC_STRIDE
    ext     v3.8b, v0.8b, v0.8b, #3
    str     s2, [x0], #FDEC_STRIDE
    str     s3, [x0]
    ret
endfunc
173
174
// 8x8 DC prediction from the filtered edge buffer.
// In: x0 = dst (FDEC buffer), x1 = edge[] from predict_8x8_filter
//     (assumed layout: left pixels below index 16, top at [16..23] —
//      TODO confirm against predict_8x8_filter).
function x264_predict_8x8_dc_neon, export=1
    mov     x7, #FDEC_STRIDE
    ld1     {v0.16b}, [x1], #16
    ld1     {v1.8b}, [x1]                   // top 8 edge pixels
    ext     v0.16b, v0.16b, v0.16b, #7      // align left 8 pixels into low half
    uaddlv  h1, v1.8b                       // top sum
    uaddlv  h0, v0.8b                       // left sum
    add     v0.8h, v0.8h, v1.8h
    dup     v0.8h, v0.h[0]
    rshrn   v0.8b, v0.8h, #4                // (sum + 8) >> 4
.rept 8
    st1     {v0.8b}, [x0], x7
.endr
    ret
endfunc
189
190
// 8x8 horizontal prediction: row r is filled with the r-th left edge pixel.
// In: x0 = dst, x1 = edge[] (left pixels at bytes [7..14], top row first
//     at [14] — inferred from the descending lane indices; TODO confirm).
function x264_predict_8x8_h_neon, export=1
    mov     x7, #FDEC_STRIDE
    ld1     {v16.16b}, [x1]
    dup     v0.8b, v16.b[14]                // row 0 left pixel
    dup     v1.8b, v16.b[13]
    st1     {v0.8b}, [x0], x7
    dup     v2.8b, v16.b[12]
    st1     {v1.8b}, [x0], x7
    dup     v3.8b, v16.b[11]
    st1     {v2.8b}, [x0], x7
    dup     v4.8b, v16.b[10]
    st1     {v3.8b}, [x0], x7
    dup     v5.8b, v16.b[9]
    st1     {v4.8b}, [x0], x7
    dup     v6.8b, v16.b[8]
    st1     {v5.8b}, [x0], x7
    dup     v7.8b, v16.b[7]                 // row 7 left pixel
    st1     {v6.8b}, [x0], x7
    st1     {v7.8b}, [x0], x7
    ret
endfunc
211
212
// 8x8 vertical prediction: copy the 8 top edge pixels (edge[16..23])
// into all 8 rows.
// In: x0 = dst, x1 = edge[] from predict_8x8_filter.
function x264_predict_8x8_v_neon, export=1
    add     x1, x1, #16                     // -> top edge pixels
    mov     x7, #FDEC_STRIDE
    ld1     {v0.8b}, [x1]
.rept 8
    st1     {v0.8b}, [x0], x7
.endr
    ret
endfunc
221
222
// 8x8 diagonal-down-left prediction over the 16 top/top-right edge pixels:
// filter (p[-1]+2*p[0]+p[1]+2)>>2 with the last pixel replicated, then
// row r stores bytes [r+1 .. r+8] of the filtered vector.
// In: x0 = dst, x1 = edge[] (top pixels start at byte 16).
function x264_predict_8x8_ddl_neon, export=1
    add     x1, x1, #16
    mov     x7, #FDEC_STRIDE
    ld1     {v0.16b}, [x1]
    movi    v3.16b, #0
    dup     v2.16b, v0.b[15]                // replicate last top-right pixel
    ext     v4.16b, v3.16b, v0.16b, #15     // p[-1] (byte 0 slot is don't-care)
    ext     v2.16b, v0.16b, v2.16b, #1      // p[+1]
    uhadd   v4.16b, v4.16b, v2.16b
    urhadd  v0.16b, v0.16b, v4.16b          // filtered edge
    ext     v1.16b, v0.16b, v0.16b, #1
    ext     v2.16b, v0.16b, v0.16b, #2
    st1     {v1.8b}, [x0], x7
    ext     v3.16b, v0.16b, v0.16b, #3
    st1     {v2.8b}, [x0], x7
    ext     v4.16b, v0.16b, v0.16b, #4
    st1     {v3.8b}, [x0], x7
    ext     v5.16b, v0.16b, v0.16b, #5
    st1     {v4.8b}, [x0], x7
    ext     v6.16b, v0.16b, v0.16b, #6
    st1     {v5.8b}, [x0], x7
    ext     v7.16b, v0.16b, v0.16b, #7
    st1     {v6.8b}, [x0], x7
    ext     v0.16b, v0.16b, v0.16b, #8
    st1     {v7.8b}, [x0], x7
    st1     {v0.8b}, [x0], x7
    ret
endfunc
250
251
// 8x8 diagonal-down-right prediction. Filters the 17 left/corner/top
// edge pixels with (p[-1]+2*p[0]+p[1]+2)>>2, then writes rows bottom-up
// (negative stride), each row a one-byte-shifted window of the result.
// In: x0 = dst, x1 = edge[] from predict_8x8_filter.
function x264_predict_8x8_ddr_neon, export=1
    ld1     {v0.16b,v1.16b}, [x1]
    ext     v2.16b, v0.16b, v1.16b, #7      // p[-1]
    ext     v4.16b, v0.16b, v1.16b, #9      // p[+1]
    ext     v3.16b, v0.16b, v1.16b, #8      // p[0]

    uhadd   v2.16b, v2.16b, v4.16b
    urhadd  v7.16b, v3.16b, v2.16b          // filtered edge

    add     x0, x0, #7*FDEC_STRIDE          // start at the bottom row
    mov     x7, #-1*FDEC_STRIDE

    ext     v6.16b, v7.16b, v7.16b, #1
    st1     {v7.8b}, [x0], x7
    ext     v5.16b, v7.16b, v7.16b, #2
    st1     {v6.8b}, [x0], x7
    ext     v4.16b, v7.16b, v7.16b, #3
    st1     {v5.8b}, [x0], x7
    ext     v3.16b, v7.16b, v7.16b, #4
    st1     {v4.8b}, [x0], x7
    ext     v2.16b, v7.16b, v7.16b, #5
    st1     {v3.8b}, [x0], x7
    ext     v1.16b, v7.16b, v7.16b, #6
    st1     {v2.8b}, [x0], x7
    ext     v0.16b, v7.16b, v7.16b, #7
    st1     {v1.8b}, [x0], x7
    st1     {v0.8b}, [x0], x7
    ret
endfunc
280
281
// 8x8 vertical-left prediction. Even rows come from the 2-tap average
// (v3), odd rows from the 3-tap filter (v0), each shifted per row pair.
// In: x0 = dst, x1 = edge[] (top pixels at byte 16).
// NOTE(review): v1/v2 are read before being written here; the bytes
// shifted in are garbage but land in lanes that are never stored
// (only 8 bytes per row are written) — presumably intentional, confirm
// against upstream before "fixing".
function x264_predict_8x8_vl_neon, export=1
    add     x1, x1, #16
    mov     x7, #FDEC_STRIDE

    ld1     {v0.16b}, [x1]
    ext     v1.16b, v1.16b, v0.16b, #15     // p[-1] (lane 0 is don't-care)
    ext     v2.16b, v0.16b, v2.16b, #1      // p[+1] (lane 15 is don't-care)

    uhadd   v1.16b, v1.16b, v2.16b
    urhadd  v3.16b, v0.16b, v2.16b          // 2-tap: avg(p[0], p[1])

    urhadd  v0.16b, v0.16b, v1.16b          // 3-tap: (p[-1]+2p[0]+p[1]+2)>>2

    ext     v4.16b, v0.16b, v0.16b, #1
    st1     {v3.8b}, [x0], x7               // row 0
    ext     v5.16b, v3.16b, v3.16b, #1
    st1     {v4.8b}, [x0], x7               // row 1
    ext     v6.16b, v0.16b, v0.16b, #2
    st1     {v5.8b}, [x0], x7
    ext     v7.16b, v3.16b, v3.16b, #2
    st1     {v6.8b}, [x0], x7
    ext     v4.16b, v0.16b, v0.16b, #3
    st1     {v7.8b}, [x0], x7
    ext     v5.16b, v3.16b, v3.16b, #3
    st1     {v4.8b}, [x0], x7
    ext     v6.16b, v0.16b, v0.16b, #4
    st1     {v5.8b}, [x0], x7
    st1     {v6.8b}, [x0], x7               // row 7
    ret
endfunc
311
312
// 8x8 vertical-right prediction. Builds the 2-tap (v2) and 3-tap (v0)
// filtered edges, de-interleaves the 3-tap result into even/odd halves,
// and emits row pairs with decreasing ext offsets.
// In: x0 = dst, x1 = edge[] (x1+8 -> left/corner/top window).
function x264_predict_8x8_vr_neon, export=1
    add     x1, x1, #8
    mov     x7, #FDEC_STRIDE
    ld1     {v2.16b}, [x1]

    ext     v1.16b, v2.16b, v2.16b, #14     // p[-2] (rotated)
    ext     v0.16b, v2.16b, v2.16b, #15     // p[-1] (rotated)

    uhadd   v3.16b, v2.16b, v1.16b
    urhadd  v2.16b, v2.16b, v0.16b          // 2-tap averages
    urhadd  v0.16b, v0.16b, v3.16b          // 3-tap filter

    ext     v1.16b, v2.16b, v2.16b, #8      // top 2-tap row
    uzp1    v2.8b, v0.8b, v0.8b             // even lanes of 3-tap
    uzp2    v3.8b, v0.8b, v0.8b             // odd lanes of 3-tap
    ext     v0.16b, v0.16b, v0.16b, #8      // top 3-tap row

    st1     {v1.8b}, [x0], x7               // row 0
    st1     {v0.8b}, [x0], x7               // row 1
    ext     v4.8b, v3.8b, v1.8b, #7         // shift one left pixel in
    ext     v5.8b, v2.8b, v0.8b, #7
    st1     {v4.8b}, [x0], x7
    st1     {v5.8b}, [x0], x7
    ext     v6.8b, v3.8b, v1.8b, #6
    ext     v7.8b, v2.8b, v0.8b, #6
    st1     {v6.8b}, [x0], x7
    st1     {v7.8b}, [x0], x7
    ext     v1.8b, v3.8b, v1.8b, #5
    ext     v0.8b, v2.8b, v0.8b, #5
    st1     {v1.8b}, [x0], x7               // row 6
    st1     {v0.8b}, [x0], x7               // row 7
    ret
endfunc
345
346
// 8x8 horizontal-down prediction. Interleaves the 2-tap averages (v4)
// with the 3-tap filtered values (v0) into v16/v17, then stores sliding
// 8-byte windows, two extra interleaved pixels per row going down.
// In: x0 = dst, x1 = edge[] (x1+7 -> left/corner window).
function x264_predict_8x8_hd_neon, export=1
    add     x1, x1, #7
    mov     x7, #FDEC_STRIDE

    ld1     {v1.16b}, [x1]
    ext     v3.16b, v1.16b, v1.16b, #1      // p[0]
    ext     v2.16b, v1.16b, v1.16b, #2      // p[+1]

    urhadd  v4.16b, v1.16b, v3.16b          // 2-tap averages

    uhadd   v1.16b, v1.16b, v2.16b
    urhadd  v0.16b, v1.16b, v3.16b          // 3-tap filter

    zip1    v16.8b, v4.8b, v0.8b            // interleave avg/filter pairs
    zip2    v17.8b, v4.8b, v0.8b
    ext     v7.16b, v0.16b, v0.16b, #8      // high half of 3-tap result

    ext     v0.8b, v17.8b, v7.8b, #6        // row 0 window
    ext     v1.8b, v17.8b, v7.8b, #4
    st1     {v0.8b}, [x0], x7
    ext     v2.8b, v17.8b, v7.8b, #2
    st1     {v1.8b}, [x0], x7
    st1     {v2.8b}, [x0], x7
    ext     v3.8b, v16.8b, v17.8b, #6
    st1     {v17.8b}, [x0], x7
    ext     v4.8b, v16.8b, v17.8b, #4
    st1     {v3.8b}, [x0], x7
    ext     v5.8b, v16.8b, v17.8b, #2
    st1     {v4.8b}, [x0], x7
    st1     {v5.8b}, [x0], x7
    st1     {v16.8b}, [x0], x7              // row 7

    ret
endfunc
380
381
// 8x8 horizontal-up prediction. Works on the reversed left-edge pixels,
// interleaves 2-tap averages with 3-tap filtered values, and pads the
// tail with the last value (v18) for the bottom rows.
// In: x0 = dst, x1 = edge[] (x1+7 -> left pixels).
function x264_predict_8x8_hu_neon, export=1
    add     x1, x1, #7
    mov     x7, #FDEC_STRIDE
    ld1     {v7.8b}, [x1]
    dup     v6.8b, v7.b[0]                  // replicate bottom-most pixel
    rev64   v7.8b, v7.8b                    // left edge, top-to-bottom order

    ext     v4.8b, v7.8b, v6.8b, #2         // p[+2] (padded with last pixel)
    ext     v2.8b, v7.8b, v6.8b, #1         // p[+1]

    uhadd   v5.8b, v7.8b, v4.8b
    urhadd  v0.8b, v2.8b, v7.8b             // 2-tap averages
    urhadd  v1.8b, v5.8b, v2.8b             // 3-tap filter

    zip1    v16.8b, v0.8b, v1.8b            // interleaved avg/filter pairs
    zip2    v17.8b, v0.8b, v1.8b

    dup     v18.4h, v17.h[3]                // replicate last pair for padding

    ext     v0.8b, v16.8b, v17.8b, #2
    ext     v1.8b, v16.8b, v17.8b, #4
    ext     v2.8b, v16.8b, v17.8b, #6
    st1     {v16.8b}, [x0], x7              // row 0
    st1     {v0.8b}, [x0], x7
    st1     {v1.8b}, [x0], x7
    st1     {v2.8b}, [x0], x7

    ext     v4.8b, v17.8b, v18.8b, #2
    ext     v5.8b, v17.8b, v18.8b, #4
    ext     v6.8b, v17.8b, v18.8b, #6
    st1     {v17.8b}, [x0], x7
    st1     {v4.8b}, [x0], x7
    st1     {v5.8b}, [x0], x7
    st1     {v6.8b}, [x0]                   // row 7
    ret
endfunc
417
418
419
// 8x8 chroma DC-top prediction: separate DC values for the left and
// right 4x4 halves from their 4 top neighbours each; jumps to the shared
// store tail in predict_8x8c_dc.
// In: x0 = dst. Sets x1 = FDEC_STRIDE, v0/v1 = top/bottom row patterns
// expected by pred8x8c_dc_end.
function x264_predict_8x8c_dc_top_neon, export=1
    sub     x2, x0, #FDEC_STRIDE
    mov     x1, #FDEC_STRIDE
    ld1     {v0.8b}, [x2]                   // 8 top neighbours
    uaddlp  v0.4h, v0.8b
    addp    v0.4h, v0.4h, v0.4h             // sum(left 4), sum(right 4)
    rshrn   v0.8b, v0.8h, #2                // (sum + 2) >> 2
    dup     v3.8b, v0.b[1]                  // right-half DC
    dup     v2.8b, v0.b[0]                  // left-half DC
    transpose v0.2s, v1.2s, v2.2s, v3.2s    // macro from asm.S — TODO confirm lane order
    b       pred8x8c_dc_end
endfunc
431
432
// 8x8 chroma DC-left prediction: DC from left neighbours only, separate
// values for the top 4 rows (v0) and bottom 4 rows (v1); jumps to the
// shared store tail in predict_8x8c_dc.
// In: x0 = dst. Sets x1 = FDEC_STRIDE for pred8x8c_dc_end.
function x264_predict_8x8c_dc_left_neon, export=1
    ldrb    w2, [x0, #0 * FDEC_STRIDE - 1]
    ldrb    w3, [x0, #1 * FDEC_STRIDE - 1]
    ldrb    w4, [x0, #2 * FDEC_STRIDE - 1]
    ldrb    w5, [x0, #3 * FDEC_STRIDE - 1]
    mov     x1, #FDEC_STRIDE
    add     w2, w2, w3
    add     w3, w4, w5
    ldrb    w6, [x0, #4 * FDEC_STRIDE - 1]
    ldrb    w7, [x0, #5 * FDEC_STRIDE - 1]
    ldrb    w8, [x0, #6 * FDEC_STRIDE - 1]
    ldrb    w9, [x0, #7 * FDEC_STRIDE - 1]
    add     w6, w6, w7
    add     w7, w8, w9
    add     w2, w2, w3                      // sum of rows 0-3
    add     w6, w6, w7                      // sum of rows 4-7
    dup     v0.8h, w2
    dup     v1.8h, w6
    rshrn   v0.8b, v0.8h, #2                // (sum + 2) >> 2
    rshrn   v1.8b, v1.8h, #2
    b       pred8x8c_dc_end
endfunc
454
455
// 8x8 chroma DC prediction: four DC values, one per 4x4 quadrant, from
// top sums (s0,s1) and left sums (s2,s3). Corner quadrants use
// (top+left+4)>>3; edge quadrants use (single sum + 2)>>2.
// In: x0 = dst. The store tail pred8x8c_dc_end is shared with the
// dc_top/dc_left variants (expects x1 = FDEC_STRIDE, v0 = rows 0-3
// pattern, v1 = rows 4-7 pattern).
function x264_predict_8x8c_dc_neon, export=1
    mov     x1, #FDEC_STRIDE
    sub     x2, x0, #FDEC_STRIDE
    ldrb    w10, [x0, #0 * FDEC_STRIDE - 1]
    ldrb    w11, [x0, #1 * FDEC_STRIDE - 1]
    ldrb    w12, [x0, #2 * FDEC_STRIDE - 1]
    ldrb    w13, [x0, #3 * FDEC_STRIDE - 1]
    add     w10, w10, w11
    ldrb    w4, [x0, #4 * FDEC_STRIDE - 1]
    ldrb    w5, [x0, #5 * FDEC_STRIDE - 1]
    add     w12, w12, w13
    ldrb    w6, [x0, #6 * FDEC_STRIDE - 1]
    ldrb    w7, [x0, #7 * FDEC_STRIDE - 1]
    add     w4, w4, w5
    add     w6, w6, w7
    add     w10, w10, w12, lsl #16          // pack s2 halves into one word
    add     w4, w4, w6, lsl #16             // pack s3 halves
    ld1     {v0.8b}, [x2]                   // 8 top neighbours
    add     x10, x10, x4, lsl #32           // s2 | s3 packed as 4 u16
    uaddlp  v0.4h, v0.8b                    // s0, s1
    mov     v1.d[0], x10                    // s2, s3
    add     v3.4h, v0.4h, v1.4h
    addp    v0.4h, v0.4h, v1.4h             // s0, s1, s2, s3
    addp    v1.4h, v3.4h, v3.4h             // s0+s2, s1+s3, s0+s2, s1+s3
    uzp2    v0.4h, v0.4h, v0.4h             // s1, s3, s1, s3
    uzp1    v1.2d, v1.2d, v1.2d
    uzp1    v0.2d, v0.2d, v0.2d
    rshrn   v3.8b, v1.8h, #3                // corner DCs: (s+4)>>3
    rshrn   v2.8b, v0.8h, #2                // edge DCs:   (s+2)>>2
    uzp1    v0.8b, v3.8b, v2.8b             // rows 0-3: dc0 | dc1
    uzp2    v1.8b, v2.8b, v3.8b             // rows 4-7: dc2 | dc3
pred8x8c_dc_end:
    add     x2, x0, #2 * FDEC_STRIDE
    add     x4, x0, #4 * FDEC_STRIDE
    add     x5, x0, #6 * FDEC_STRIDE
    st1     {v0.8b}, [x0], x1
    st1     {v0.8b}, [x2], x1
    st1     {v0.8b}, [x0]
    st1     {v0.8b}, [x2]
    st1     {v1.8b}, [x4], x1
    st1     {v1.8b}, [x5], x1
    st1     {v1.8b}, [x4]
    st1     {v1.8b}, [x5]
    ret
endfunc
500
501
// 8x8 chroma horizontal prediction: each row filled with its left
// neighbour, two rows per iteration.
// In: x0 = dst.
function x264_predict_8x8c_h_neon, export=1
    sub     x1, x0, #1                      // -> left neighbour column
    mov     x7, #FDEC_STRIDE
.rept 4
    ld1r    {v0.8b}, [x1], x7               // splat left pixel of row
    ld1r    {v1.8b}, [x1], x7
    st1     {v0.8b}, [x0], x7
    st1     {v1.8b}, [x0], x7
.endr
    ret
endfunc
512
513
// 8x8 chroma vertical prediction: copy the 8 top-neighbour bytes
// (one 64-bit GPR load) into all 8 rows.
// In: x0 = dst.
function x264_predict_8x8c_v_aarch64, export=1
    ldr     x1, [x0, #-FDEC_STRIDE]
.irp c, 0,1,2,3,4,5,6,7
    str     x1, [x0, #\c * FDEC_STRIDE]
.endr
    ret
endfunc
520
521
// 8x8 chroma planar prediction. Computes the gradient parameters
// b (horizontal) and c (vertical) from weighted neighbour differences
// (p8weight), the base value a from the corner neighbours, then emits
// one row per loop iteration, adding c per row.
// In: x0 = dst.
function x264_predict_8x8c_p_neon, export=1
    sub     x3, x0, #FDEC_STRIDE
    mov     x1, #FDEC_STRIDE
    add     x2, x3, #4                      // -> top-right 4 neighbours
    sub     x3, x3, #1                      // -> top-left corner
    ld1     {v0.s}[0], [x3]                 // top-left 4 neighbours
    ld1     {v2.s}[0], [x2], x1
    ldcol.8 v0, x3, x1, 4, hi=1             // left rows 0-3 into high lanes
    add     x3, x3, x1
    ldcol.8 v3, x3, x1, 4                   // left rows 4-7
    movrel  x4, p8weight
    movrel  x5, p16weight
    uaddl   v4.8h, v2.8b, v3.8b             // corner pixels for a
    rev32   v0.8b, v0.8b                    // mirror the "before" neighbours
    trn1    v2.2s, v2.2s, v3.2s             // pack H and V "after" neighbours
    ld1     {v7.8h}, [x4]
    usubl   v2.8h, v2.8b, v0.8b             // after - before differences
    mul     v2.8h, v2.8h, v7.8h             // weighted by 1..4
    ld1     {v0.8h}, [x5]
    saddlp  v2.4s, v2.8h
    addp    v2.4s, v2.4s, v2.4s             // H sum, V sum
    shl     v3.2s, v2.2s, #4
    add     v2.2s, v2.2s, v3.2s             // 17 * sum
    rshrn   v5.4h, v2.4s, #5                // b, c, x, x
    addp    v2.4h, v5.4h, v5.4h             // b + c
    shl     v3.4h, v2.4h, #2
    sub     v3.4h, v3.4h, v2.4h             // 3 * (b + c)
    rev64   v4.4h, v4.4h
    add     v4.4h, v4.4h, v0.4h
    shl     v2.4h, v4.4h, #4                // a
    sub     v2.4h, v2.4h, v3.4h             // a - 3 * (b + c) + 16
    ext     v0.16b, v0.16b, v0.16b, #14     // rotate weights to 8,1..7
    sub     v6.4h, v5.4h, v3.4h
    mov     v0.h[0], wzr                    // -> weights 0,1,...,7
    mul     v0.8h, v0.8h, v5.h[0]           // 0,1,2,3,4,5,6,7 * b
    dup     v1.8h, v2.h[0]                  // pix
    dup     v2.8h, v5.h[1]                  // c
    add     v1.8h, v1.8h, v0.8h             // pix + x*b
    mov     x3, #8
1:                                          // per-row loop: clip and store
    subs    x3, x3, #1
    sqshrun v0.8b, v1.8h, #5                // clamp to [0,255], >>5
    add     v1.8h, v1.8h, v2.8h             // next row: += c
    st1     {v0.8b}, [x0], x1
    b.ne    1b
    ret
endfunc
568
569
570
// Sum 4 consecutive left-neighbour bytes (rows \idx .. \idx+3 of \x)
// into \wd, using \t1-\t3 as scratch registers.
.macro loadsum4 wd, t1, t2, t3, x, idx
    ldrb    \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
    ldrb    \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
    ldrb    \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
    ldrb    \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
    add     \wd, \wd, \t1
    add     \t1, \t2, \t3
    add     \wd, \wd, \t1
.endm
579
580
// 8x16 chroma horizontal prediction: each of the 16 rows filled with its
// left neighbour; two interleaved row pointers (x0/x1 with 2*stride)
// process 4 rows per iteration.
// In: x0 = dst.
function x264_predict_8x16c_h_neon, export=1
    sub     x2, x0, #1                      // left column, even rows
    add     x3, x0, #FDEC_STRIDE - 1        // left column, odd rows
    mov     x7, #2 * FDEC_STRIDE
    add     x1, x0, #FDEC_STRIDE            // dst pointer for odd rows
.rept 4
    ld1r    {v0.8b}, [x2], x7
    ld1r    {v1.8b}, [x3], x7
    ld1r    {v2.8b}, [x2], x7
    ld1r    {v3.8b}, [x3], x7
    st1     {v0.8b}, [x0], x7
    st1     {v1.8b}, [x1], x7
    st1     {v2.8b}, [x0], x7
    st1     {v3.8b}, [x1], x7
.endr
    ret
endfunc
597
598
// 8x16 chroma vertical prediction: copy the 8 top-neighbour bytes into
// all 16 rows, two rows per iteration via interleaved pointers.
// In: x0 = dst.
function x264_predict_8x16c_v_neon, export=1
    sub     x1, x0, #FDEC_STRIDE            // -> top row; then odd-row dst
    mov     x2, #2 * FDEC_STRIDE
    ld1     {v0.8b}, [x1], x2               // top neighbours; x1 -> row 1
.rept 8
    st1     {v0.8b}, [x0], x2               // even rows
    st1     {v0.8b}, [x1], x2               // odd rows
.endr
    ret
endfunc
608
609
// 8x16 chroma planar prediction. H gradient from 8 top neighbours,
// V gradient from 16 left neighbours (both weighted with p16weight);
// b = (17*H + 16) >> 5, c = (5*V + 32) >> 6, a from corner pixels.
// Emits 16 rows, two per loop iteration.
// In: x0 = dst.
function x264_predict_8x16c_p_neon, export=1
    movrel  x4, p16weight
    ld1     {v17.8h}, [x4]
    sub     x3, x0, #FDEC_STRIDE
    mov     x1, #FDEC_STRIDE
    add     x2, x3, #4
    sub     x3, x3, #1

    ld1     {v0.8b}, [x3]                   // top-left neighbours
    ld1     {v2.8b}, [x2], x1               // top-right neighbours
    ldcol.8 v1, x3, x1                      // left rows 0-7
    add     x3, x3, x1
    ldcol.8 v3, x3, x1                      // left rows 8-15
    ext     v4.8b, v2.8b, v2.8b, #3
    ext     v5.8b, v3.8b, v3.8b, #7
    rev32   v0.8b, v0.8b
    rev64   v1.8b, v1.8b

    uaddl   v4.8h, v5.8b, v4.8b             // a * 1/16

    usubl   v2.8h, v2.8b, v0.8b             // horizontal differences
    mul     v2.8h, v2.8h, v17.8h
    saddlp  v2.4s, v2.8h
    addp    v2.4s, v2.4s, v2.4s             // H

    usubl   v3.8h, v3.8b, v1.8b             // vertical differences
    mul     v3.8h, v3.8h, v17.8h
    saddlp  v3.4s, v3.8h
    addp    v3.4s, v3.4s, v3.4s
    addp    v3.4s, v3.4s, v3.4s             // V

    ext     v17.16b, v17.16b, v17.16b, #14  // rotate weights to 8,1..7

    shl     v4.4h, v4.4h, #4                // a
    shl     v6.2s, v2.2s, #4                // 16 * H
    shl     v7.2s, v3.2s, #2                // 4 * V
    add     v2.2s, v2.2s, v6.2s             // 17 * H
    add     v3.2s, v3.2s, v7.2s             // 5 * V
    rshrn   v2.4h, v2.4s, #5                // b
    rshrn   v3.4h, v3.4s, #6                // c

    mov     v17.h[0], wzr                   // -> weights 0,1,...,7

    sub     v4.4h, v4.4h, v2.4h             // a - b
    shl     v6.4h, v2.4h, #1                // 2 * b
    add     v4.4h, v4.4h, v3.4h             // a - b + c
    shl     v7.4h, v3.4h, #3                // 8 * c
    sub     v4.4h, v4.4h, v6.4h             // a - 3b + c
    sub     v4.4h, v4.4h, v7.4h             // a - 3b - 7c

    mul     v0.8h, v17.8h, v2.h[0]          // 0,1,2,3,4,5,6,7 * b
    dup     v1.8h, v4.h[0]                  // i00
    dup     v2.8h, v3.h[0]                  // c
    add     v1.8h, v1.8h, v0.8h             // pix + {0..7}*b
    mov     x3, #16
1:                                          // two rows per iteration
    subs    x3, x3, #2
    sqrshrun v4.8b, v1.8h, #5               // clamp, round, >>5
    add     v1.8h, v1.8h, v2.8h             // += c
    sqrshrun v5.8b, v1.8h, #5
    st1     {v4.8b}, [x0], x1
    add     v1.8h, v1.8h, v2.8h
    st1     {v5.8b}, [x0], x1
    b.ne    1b
    ret
endfunc
675
676
// 8x16 chroma DC prediction: eight DC values over 4x4 quadrants.
// s0/s1 = top-half sums of the top row; s2..s5 = left sums per 4-row
// group (loadsum4). Each quadrant DC combines the adjacent sums with
// rounding >>3. Four row-group pointers store 4 rows each.
// In: x0 = dst.
function x264_predict_8x16c_dc_neon, export=1
    mov     x1, #FDEC_STRIDE
    sub     x10, x0, #FDEC_STRIDE
    loadsum4 w2, w3, w4, w5, x0, 0          // s2: left rows 0-3
    ld1     {v6.8b}, [x10]                  // 8 top neighbours
    loadsum4 w6, w7, w8, w9, x0, 4          // s3: left rows 4-7
    uaddlp  v6.4h, v6.8b
    dup     v22.8h, w2                      // s2
    dup     v23.8h, w6                      // s3
    loadsum4 w2, w3, w4, w5, x0, 8          // s4: left rows 8-11
    addp    v6.4h, v6.4h, v6.4h             // s0, s1
    loadsum4 w6, w7, w8, w9, x0, 12         // s5: left rows 12-15
    dup     v20.8h, v6.h[0]                 // s0
    dup     v21.8h, v6.h[1]                 // s1
    dup     v24.8h, w2                      // s4
    dup     v25.8h, w6                      // s5

    // Pair each left sum with the matching top sum per quadrant.
    ext     v16.16b, v20.16b, v21.16b, #8
    ext     v17.16b, v22.16b, v21.16b, #8
    ext     v1.16b, v23.16b, v21.16b, #8
    ext     v2.16b, v24.16b, v21.16b, #8
    ext     v3.16b, v25.16b, v21.16b, #8

    add     v0.8h, v16.8h, v17.8h
    add     v1.8h, v1.8h, v23.8h
    add     v2.8h, v2.8h, v24.8h
    add     v3.8h, v3.8h, v25.8h

    rshrn   v0.8b, v0.8h, #3                // DC rows 0-3
    rshrn   v1.8b, v1.8h, #3                // DC rows 4-7
    rshrn   v2.8b, v2.8h, #3                // DC rows 8-11
    rshrn   v3.8b, v3.8h, #3                // DC rows 12-15

    add     x11, x0, #4 * FDEC_STRIDE
    add     x12, x0, #8 * FDEC_STRIDE
    add     x13, x0, #12 * FDEC_STRIDE
.rept 4
    st1     {v0.8b}, [x0], x1
    st1     {v1.8b}, [x11], x1
    st1     {v2.8b}, [x12], x1
    st1     {v3.8b}, [x13], x1
.endr
    ret
endfunc
720
721
// 8x16 chroma DC-left prediction: one DC per 4-row group, each
// (sum of 4 left neighbours + 2) >> 2. Rows 0-3 are stored inline while
// the remaining sums are still being computed; groups 1-3 via .irp.
// In: x0 = dst.
function x264_predict_8x16c_dc_left_neon, export=1
    mov     x1, #FDEC_STRIDE
    ldrb    w2, [x0, # 0 * FDEC_STRIDE - 1]
    ldrb    w3, [x0, # 1 * FDEC_STRIDE - 1]
    ldrb    w4, [x0, # 2 * FDEC_STRIDE - 1]
    ldrb    w5, [x0, # 3 * FDEC_STRIDE - 1]
    add     w2, w2, w3

    ldrb    w6, [x0, # 4 * FDEC_STRIDE - 1]
    add     w4, w4, w5
    ldrb    w7, [x0, # 5 * FDEC_STRIDE - 1]
    add     w2, w2, w4                      // sum rows 0-3
    ldrb    w8, [x0, # 6 * FDEC_STRIDE - 1]
    ldrb    w9, [x0, # 7 * FDEC_STRIDE - 1]
    dup     v0.8h, w2
    add     w6, w6, w7
    rshrn   v0.8b, v0.8h, #2                // DC rows 0-3
    add     w8, w8, w9

    ldrb    w10, [x0, # 8 * FDEC_STRIDE - 1]
    ldrb    w11, [x0, # 9 * FDEC_STRIDE - 1]
    add     w6, w6, w8                      // sum rows 4-7
    ldrb    w12, [x0, #10 * FDEC_STRIDE - 1]
    ldrb    w13, [x0, #11 * FDEC_STRIDE - 1]
    dup     v1.8h, w6
    add     w10, w10, w11
    rshrn   v1.8b, v1.8h, #2                // DC rows 4-7
    add     w12, w12, w13

    ldrb    w2, [x0, #12 * FDEC_STRIDE - 1]
    ldrb    w3, [x0, #13 * FDEC_STRIDE - 1]
    add     w10, w10, w12                   // sum rows 8-11
    ldrb    w4, [x0, #14 * FDEC_STRIDE - 1]
    ldrb    w5, [x0, #15 * FDEC_STRIDE - 1]
    dup     v2.8h, w10
    add     w2, w2, w3
    rshrn   v2.8b, v2.8h, #2                // DC rows 8-11
    add     w4, w4, w5
    st1     {v0.8b}, [x0], x1               // store rows 0-3 early
    st1     {v0.8b}, [x0], x1
    add     w2, w2, w4                      // sum rows 12-15
    st1     {v0.8b}, [x0], x1
    dup     v3.8h, w2
    st1     {v0.8b}, [x0], x1
    rshrn   v3.8b, v3.8h, #2                // DC rows 12-15

.irp idx, 1, 2, 3
.rept 4
    st1     {v\idx\().8b}, [x0], x1
.endr
.endr
    ret
endfunc
774
775
// 8x16 chroma DC-top prediction: separate DC for the left/right 4-pixel
// halves of the top row, applied to all 16 rows.
// In: x0 = dst.
function x264_predict_8x16c_dc_top_neon, export=1
    sub     x2, x0, #FDEC_STRIDE
    mov     x1, #FDEC_STRIDE
    ld1     {v0.8b}, [x2]                   // 8 top neighbours
    uaddlp  v0.4h, v0.8b
    addp    v0.4h, v0.4h, v0.4h             // sum(left 4), sum(right 4)
    rshrn   v4.8b, v0.8h, #2                // (sum + 2) >> 2
    dup     v0.8b, v4.b[0]                  // left-half DC
    dup     v1.8b, v4.b[1]                  // right-half DC
    ext     v0.8b, v0.8b, v1.8b, #4         // dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
.rept 16
    st1     {v0.8b}, [x0], x1
.endr
    ret
endfunc
790
791
792
// 16x16 DC-top prediction: DC = (sum of 16 top neighbours + 8) >> 4;
// jumps to the shared store tail in predict_16x16_dc.
// In: x0 = dst. Sets x1 = FDEC_STRIDE, v0 = splatted DC.
function x264_predict_16x16_dc_top_neon, export=1
    sub     x2, x0, #FDEC_STRIDE
    mov     x1, #FDEC_STRIDE
    ld1     {v0.16b}, [x2]
    uaddlv  h0, v0.16b
    rshrn   v0.8b, v0.8h, #4
    dup     v0.16b, v0.b[0]
    b       pred16x16_dc_end
endfunc
801
802
// 16x16 DC-left prediction: DC = (sum of 16 left neighbours + 8) >> 4;
// jumps to the shared store tail in predict_16x16_dc.
// In: x0 = dst. Sets x1 = FDEC_STRIDE, v0 = splatted DC.
function x264_predict_16x16_dc_left_neon, export=1
    sub     x2, x0, #1
    mov     x1, #FDEC_STRIDE
    ldcol.16 v0, x2, x1                     // gather the left column
    uaddlv  h0, v0.16b
    rshrn   v0.8b, v0.8h, #4
    dup     v0.16b, v0.b[0]
    b       pred16x16_dc_end
endfunc
811
812
// 16x16 DC prediction: DC = (sum of 16 top + 16 left neighbours + 16) >> 5.
// The store tail pred16x16_dc_end is shared with the dc_top/dc_left
// variants (expects x1 = FDEC_STRIDE, v0 = splatted DC).
// In: x0 = dst.
function x264_predict_16x16_dc_neon, export=1
    sub     x3, x0, #FDEC_STRIDE
    sub     x2, x0, #1
    mov     x1, #FDEC_STRIDE
    ld1     {v0.16b}, [x3]                  // 16 top neighbours
    ldcol.16 v1, x2, x1                     // 16 left neighbours
    uaddlv  h0, v0.16b
    uaddlv  h1, v1.16b
    add     v0.4h, v0.4h, v1.4h
    rshrn   v0.8b, v0.8h, #5
    dup     v0.16b, v0.b[0]
pred16x16_dc_end:
.rept 16
    st1     {v0.16b}, [x0], x1
.endr
    ret
endfunc
829
830
// 16x16 horizontal prediction: each row filled with its left neighbour,
// two rows per iteration.
// In: x0 = dst.
function x264_predict_16x16_h_neon, export=1
    sub     x1, x0, #1
    mov     x7, #FDEC_STRIDE
.rept 8
    ld1r    {v0.16b}, [x1], x7              // splat left pixel
    ld1r    {v1.16b}, [x1], x7
    st1     {v0.16b}, [x0], x7
    st1     {v1.16b}, [x0], x7
.endr
    ret
endfunc
841
842
// 16x16 vertical prediction: copy the 16 top-neighbour bytes into all
// 16 rows (first ld1 also advances x0 back to row 0).
// In: x0 = dst.
function x264_predict_16x16_v_neon, export=1
    sub     x0, x0, #FDEC_STRIDE
    mov     x7, #FDEC_STRIDE
    ld1     {v0.16b}, [x0], x7              // top row; x0 -> row 0
.rept 16
    st1     {v0.16b}, [x0], x7
.endr
    ret
endfunc
851
852
// 16x16 planar prediction. H/V gradients from 8 weighted neighbour
// differences each (p16weight); b = (5*H + 32) >> 6, c = (5*V + 32) >> 6,
// a from the corner neighbours. Each row is computed as two 8-lane
// halves (low: x*b for x=0..7, high: +8*b) and clipped per row.
// In: x0 = dst.
function x264_predict_16x16_p_neon, export=1
    sub     x3, x0, #FDEC_STRIDE
    mov     x1, #FDEC_STRIDE
    add     x2, x3, #8
    sub     x3, x3, #1
    ld1     {v0.8b}, [x3]                   // top-left neighbours
    ld1     {v2.8b}, [x2], x1               // top-right neighbours
    ldcol.8 v1, x3, x1                      // left rows 0-7
    add     x3, x3, x1
    ldcol.8 v3, x3, x1                      // left rows 8-15
    rev64   v0.8b, v0.8b                    // mirror "before" neighbours
    rev64   v1.8b, v1.8b
    movrel  x4, p16weight
    uaddl   v4.8h, v2.8b, v3.8b             // corner pixels for a
    ld1     {v7.8h}, [x4]
    usubl   v2.8h, v2.8b, v0.8b             // horizontal differences
    usubl   v3.8h, v3.8b, v1.8b             // vertical differences
    mul     v2.8h, v2.8h, v7.8h             // weighted by 1..8
    mul     v3.8h, v3.8h, v7.8h
    saddlp  v2.4s, v2.8h
    saddlp  v3.4s, v3.8h
    addp    v2.4s, v2.4s, v3.4s
    addp    v2.4s, v2.4s, v2.4s             // H, V
    shl     v3.2s, v2.2s, #2
    add     v2.2s, v2.2s, v3.2s             // 5 * sum
    rshrn   v5.4h, v2.4s, #6                // b, c, x, x
    addp    v2.4h, v5.4h, v5.4h             // b + c
    shl     v3.4h, v2.4h, #3
    sub     v3.4h, v3.4h, v2.4h             // 7 * (b + c)
    ext     v4.16b, v4.16b, v4.16b, #14
    add     v4.4h, v4.4h, v7.4h
    shl     v2.4h, v4.4h, #4                // a
    sub     v2.4h, v2.4h, v3.4h             // a - 7 * (b + c) + 16
    ext     v7.16b, v7.16b, v7.16b, #14     // rotate weights to 8,1..7
    mov     v7.h[0], wzr                    // -> weights 0,1,...,7
    dup     v3.8h, v5.h[0]
    mul     v0.8h, v7.8h, v5.h[0]           // 0,1,2,3,4,5,6,7 * b
    dup     v1.8h, v2.h[0]                  // pix
    dup     v2.8h, v5.h[1]                  // c
    shl     v3.8h, v3.8h, #3                // 8 * b
    add     v1.8h, v1.8h, v0.8h             // pix + x*b
    add     v3.8h, v3.8h, v1.8h             // pix + x{8-15}*b
    mov     x3, #16
1:                                          // per-row loop: clip both halves
    subs    x3, x3, #1
    sqshrun v0.8b, v1.8h, #5                // low 8 pixels
    add     v1.8h, v1.8h, v2.8h             // += c
    sqshrun2 v0.16b, v3.8h, #5              // high 8 pixels
    add     v3.8h, v3.8h, v2.8h
    st1     {v0.16b}, [x0], x1
    b.ne    1b
    ret
endfunc
905
906