/*****************************************************************************
 * predict.S: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2016 x264 project
 *
 * Authors: David Conrad <[email protected]>
 *          Mans Rullgard <[email protected]>
 *          Martin Storsjo <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "asm.S"

.section .rodata
.align 4

p16weight: .short 1,2,3,4,5,6,7,8

.text

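// ldcol.8: load \n bytes of a column (stride \rt) starting at [\rs] into
// successive lanes of \rd; with \n == 4, \hi selects the low or high half.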
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
    vld1.8      {\rd[0]}, [\rs], \rt
    vld1.8      {\rd[1]}, [\rs], \rt
    vld1.8      {\rd[2]}, [\rs], \rt
    vld1.8      {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
    vld1.8      {\rd[4]}, [\rs], \rt
    vld1.8      {\rd[5]}, [\rs], \rt
    vld1.8      {\rd[6]}, [\rs], \rt
    vld1.8      {\rd[7]}, [\rs], \rt
.endif
.endm

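// ldcol.16: load a 16-pixel column into \rd1/\rd2, walking two pointers in
// parallel (\ru starts 8 rows below \rs).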
.macro ldcol.16 rd1, rd2, rs, rt, ru
    add         \ru, \rs, \rt, lsl #3
    vld1.8      {\rd1[0]}, [\rs], \rt
    vld1.8      {\rd2[0]}, [\ru], \rt
    vld1.8      {\rd1[1]}, [\rs], \rt
    vld1.8      {\rd2[1]}, [\ru], \rt
    vld1.8      {\rd1[2]}, [\rs], \rt
    vld1.8      {\rd2[2]}, [\ru], \rt
    vld1.8      {\rd1[3]}, [\rs], \rt
    vld1.8      {\rd2[3]}, [\ru], \rt
    vld1.8      {\rd1[4]}, [\rs], \rt
    vld1.8      {\rd2[4]}, [\ru], \rt
    vld1.8      {\rd1[5]}, [\rs], \rt
    vld1.8      {\rd2[5]}, [\ru], \rt
    vld1.8      {\rd1[6]}, [\rs], \rt
    vld1.8      {\rd2[6]}, [\ru], \rt
    vld1.8      {\rd1[7]}, [\rs], \rt
    vld1.8      {\rd2[7]}, [\ru], \rt
.endm

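// add16x8: widening-add the bytes of \rl/\rh into \dq, then reduce so every
// u16 lane of \dl holds the total of all 16 input bytes.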
.macro add16x8 dq, dl, dh, rl, rh
    vaddl.u8    \dq, \rl, \rh
    vadd.u16    \dl, \dl, \dh
    vpadd.u16   \dl, \dl, \dl
    vpadd.u16   \dl, \dl, \dl
.endm


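// predict_4x4_h: each row of the 4x4 block is its left neighbour replicated
// into all four bytes of one 32-bit store.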
// because gcc doesn't believe in using the free shift in add
function x264_predict_4x4_h_armv6
    ldrb        r1, [r0, #0*FDEC_STRIDE-1]
    ldrb        r2, [r0, #1*FDEC_STRIDE-1]
    ldrb        r3, [r0, #2*FDEC_STRIDE-1]
    ldrb        ip, [r0, #3*FDEC_STRIDE-1]
    add         r1, r1, r1, lsl #8
    add         r2, r2, r2, lsl #8
    add         r3, r3, r3, lsl #8
    add         ip, ip, ip, lsl #8
    add         r1, r1, r1, lsl #16
    str         r1, [r0, #0*FDEC_STRIDE]
    add         r2, r2, r2, lsl #16
    str         r2, [r0, #1*FDEC_STRIDE]
    add         r3, r3, r3, lsl #16
    str         r3, [r0, #2*FDEC_STRIDE]
    add         ip, ip, ip, lsl #16
    str         ip, [r0, #3*FDEC_STRIDE]
    bx          lr
endfunc

function x264_predict_4x4_v_armv6
    ldr         r1, [r0, #0 - 1 * FDEC_STRIDE]
    str         r1, [r0, #0 + 0 * FDEC_STRIDE]
    str         r1, [r0, #0 + 1 * FDEC_STRIDE]
    str         r1, [r0, #0 + 2 * FDEC_STRIDE]
    str         r1, [r0, #0 + 3 * FDEC_STRIDE]
    bx          lr
endfunc

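// predict_4x4_dc: dc = (sum of 4 top + 4 left neighbours + 4) >> 3; usad8
// against zero sums the four top bytes in a single instruction.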
function x264_predict_4x4_dc_armv6
    mov         ip, #0
    ldr         r1, [r0, #-FDEC_STRIDE]
    ldrb        r2, [r0, #0*FDEC_STRIDE-1]
    ldrb        r3, [r0, #1*FDEC_STRIDE-1]
    usad8       r1, r1, ip
    add         r2, r2, #4
    ldrb        ip, [r0, #2*FDEC_STRIDE-1]
    add         r2, r2, r3
    ldrb        r3, [r0, #3*FDEC_STRIDE-1]
    add         r2, r2, ip
    add         r2, r2, r3
    add         r1, r1, r2
    lsr         r1, r1, #3
    add         r1, r1, r1, lsl #8
    add         r1, r1, r1, lsl #16
    str         r1, [r0, #0*FDEC_STRIDE]
    str         r1, [r0, #1*FDEC_STRIDE]
    str         r1, [r0, #2*FDEC_STRIDE]
    str         r1, [r0, #3*FDEC_STRIDE]
    bx          lr
endfunc

function x264_predict_4x4_dc_top_neon
    mov         r12, #FDEC_STRIDE
    sub         r1, r0, #FDEC_STRIDE
    vld1.32     d1[], [r1,:32]
    vpaddl.u8   d1, d1
    vpadd.u16   d1, d1, d1
    vrshr.u16   d1, d1, #2
    vdup.8      d1, d1[0]
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    bx          lr
endfunc

// return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
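// Implementation note: with x = (a+c)>>1 from the first uhadd8, the eor/and
// pair recovers the carry bit dropped by the second halving add, so the
// result is (x + b + 1) >> 1 computed entirely with packed-byte operations.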
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8      \a1, \a1, \c1
    uhadd8      \a2, \a2, \c2
    uhadd8      \c1, \a1, \b1
    uhadd8      \c2, \a2, \b2
    eor         \a1, \a1, \b1
    eor         \a2, \a2, \b2
    and         \a1, \a1, \pb_1
    and         \a2, \a2, \pb_1
    uadd8       \a1, \a1, \c1
    uadd8       \a2, \a2, \c2
.endm

function x264_predict_4x4_ddr_armv6
    ldr         r1, [r0, # -FDEC_STRIDE]
    ldrb        r2, [r0, # -FDEC_STRIDE-1]
    ldrb        r3, [r0, #0*FDEC_STRIDE-1]
    push        {r4-r6,lr}
    add         r2, r2, r1, lsl #8
    ldrb        r4, [r0, #1*FDEC_STRIDE-1]
    add         r3, r3, r2, lsl #8
    ldrb        r5, [r0, #2*FDEC_STRIDE-1]
    ldrb        r6, [r0, #3*FDEC_STRIDE-1]
    add         r4, r4, r3, lsl #8
    add         r5, r5, r4, lsl #8
    add         r6, r6, r5, lsl #8
    ldr         ip, =0x01010101
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
    str         r1, [r0, #0*FDEC_STRIDE]
    lsl         r2, r1, #8
    lsl         r3, r1, #16
    lsl         r4, r4, #8
    lsl         r5, r1, #24
    add         r2, r2, r4, lsr #24
    str         r2, [r0, #1*FDEC_STRIDE]
    add         r3, r3, r4, lsr #16
    str         r3, [r0, #2*FDEC_STRIDE]
    add         r5, r5, r4, lsr #8
    str         r5, [r0, #3*FDEC_STRIDE]
    pop         {r4-r6,pc}
endfunc

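// predict_4x4_ddl: diagonal-down-left; the lowpassed top row is stored with
// a one-pixel left shift per output row.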
function x264_predict_4x4_ddl_neon
    sub         r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0], ip
    vdup.8      d3, d0[7]
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d3, #2
    vhadd.u8    d0, d0, d2
    vrhadd.u8   d0, d0, d1
    vst1.32     {d0[0]}, [r0,:32], ip
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d0, #2
    vst1.32     {d1[0]}, [r0,:32], ip
    vext.8      d3, d0, d0, #3
    vst1.32     {d2[0]}, [r0,:32], ip
    vst1.32     {d3[0]}, [r0,:32], ip
    bx          lr
endfunc

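// The 8x8 luma functions take r1 pointing at the edge[] neighbour buffer
// prepared by x264_predict_8x8_filter; judging from the offsets used below,
// the left column sits at edge[7..14] (bottom-up) and the top row starts at
// edge[16].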
function x264_predict_8x8_dc_neon
    mov         ip, #0
    ldrd        r2, r3, [r1, #8]
    push        {r4-r5,lr}
    ldrd        r4, r5, [r1, #16]
    lsl         r3, r3, #8
    ldrb        lr, [r1, #7]
    usad8       r2, r2, ip
    usad8       r3, r3, ip
    usada8      r2, r4, ip, r2
    add         lr, lr, #8
    usada8      r3, r5, ip, r3
    add         r2, r2, lr
    mov         ip, #FDEC_STRIDE
    add         r2, r2, r3
    lsr         r2, r2, #4

    vdup.8      d0, r2
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    pop         {r4-r5,pc}
endfunc

function x264_predict_8x8_h_neon
    add         r1, r1, #7
    mov         ip, #FDEC_STRIDE
    vld1.64     {d16}, [r1]
    vdup.8      d0, d16[7]
    vdup.8      d1, d16[6]
    vst1.64     {d0}, [r0,:64], ip
    vdup.8      d2, d16[5]
    vst1.64     {d1}, [r0,:64], ip
    vdup.8      d3, d16[4]
    vst1.64     {d2}, [r0,:64], ip
    vdup.8      d4, d16[3]
    vst1.64     {d3}, [r0,:64], ip
    vdup.8      d5, d16[2]
    vst1.64     {d4}, [r0,:64], ip
    vdup.8      d6, d16[1]
    vst1.64     {d5}, [r0,:64], ip
    vdup.8      d7, d16[0]
    vst1.64     {d6}, [r0,:64], ip
    vst1.64     {d7}, [r0,:64], ip
    bx          lr
endfunc

function x264_predict_8x8_v_neon
    add         r1, r1, #16
    mov         r12, #FDEC_STRIDE
    vld1.8      {d0}, [r1,:64]
.rept 8
    vst1.8      {d0}, [r0,:64], r12
.endr
    bx          lr
endfunc

function x264_predict_8x8_ddl_neon
    add         r1, #16
    vld1.8      {d0, d1}, [r1,:128]
    vmov.i8     q3, #0
    vrev64.8    d2, d1
    vext.8      q8, q3, q0, #15
    vext.8      q2, q0, q1, #1
    vhadd.u8    q8, q2
    mov         r12, #FDEC_STRIDE
    vrhadd.u8   q0, q8
    vext.8      d2, d0, d1, #1
    vext.8      d3, d0, d1, #2
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      d3, [r0,:64], r12
    vext.8      d3, d0, d1, #4
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #5
    vst1.8      d3, [r0,:64], r12
    vext.8      d3, d0, d1, #6
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #7
    vst1.8      d3, [r0,:64], r12
    vst1.8      d2, [r0,:64], r12
    vst1.8      d1, [r0,:64], r12
    bx          lr
endfunc

function x264_predict_8x8_ddr_neon
    vld1.8      {d0-d3}, [r1,:128]
    vext.8      q2, q0, q1, #7
    vext.8      q3, q0, q1, #9

    vhadd.u8    q2, q2, q3
    vrhadd.u8   d0, d1, d4
    vrhadd.u8   d1, d2, d5

    add         r0, #7*FDEC_STRIDE
    mov         r12, #-1*FDEC_STRIDE

    vext.8      d2, d0, d1, #1
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d4, d0, d1, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d5, d0, d1, #3
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #4
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #5
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #6
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #7
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    bx          lr
endfunc

function x264_predict_8x8_vl_neon
    add         r1, #16
    mov         r12, #FDEC_STRIDE

    vld1.8      {d0, d1}, [r1,:128]
    vext.8      q1, q1, q0, #15
    vext.8      q2, q0, q2, #1

    vrhadd.u8   q3, q0, q2

    vhadd.u8    q1, q1, q2
    vrhadd.u8   q0, q0, q1

    vext.8      d2, d0, d1, #1
    vst1.8      {d6}, [r0,:64], r12
    vext.8      d3, d6, d7, #1
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #3
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #4
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    bx          lr
endfunc

function x264_predict_8x8_vr_neon
    add         r1, #8
    mov         r12, #FDEC_STRIDE
    vld1.8      {d4,d5}, [r1,:64]

    vext.8      q1, q2, q2, #14
    vext.8      q0, q2, q2, #15

    vhadd.u8    q3, q2, q1
    vrhadd.u8   q2, q2, q0
    vrhadd.u8   q0, q0, q3

    vmov        d2, d0

    vst1.8      {d5}, [r0,:64], r12
    vuzp.8      d2, d0
    vst1.8      {d1}, [r0,:64], r12
    vext.8      d6, d0, d5, #7
    vext.8      d3, d2, d1, #7
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #6
    vext.8      d3, d2, d1, #6
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #5
    vext.8      d3, d2, d1, #5
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    bx          lr
endfunc

function x264_predict_8x8_hd_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7

    vld1.8      {d2,d3}, [r1]
    vext.8      q3, q1, q1, #1
    vext.8      q2, q1, q1, #2

    vrhadd.u8   q8, q1, q3

    vhadd.u8    q1, q2
    vrhadd.u8   q0, q1, q3

    vzip.8      d16, d0

    vext.8      d2, d0, d1, #6
    vext.8      d3, d0, d1, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #6
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d3, d16, d0, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12

    bx          lr
endfunc

function x264_predict_8x8_hu_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7
    vld1.8      {d7}, [r1]
    vdup.8      d6, d7[0]
    vrev64.8    d7, d7

    vext.8      d4, d7, d6, #2
    vext.8      d2, d7, d6, #1

    vhadd.u8    d16, d7, d4
    vrhadd.u8   d0, d2, d7
    vrhadd.u8   d1, d16, d2

    vzip.8      d0, d1

    vdup.16     q1, d1[3]

    vext.8      q2, q0, q1, #2
    vext.8      q3, q0, q1, #4
    vext.8      q8, q0, q1, #6
    vst1.8      {d0}, [r0,:64], r12
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12

    vst1.8      {d1}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    vst1.8      {d7}, [r0,:64], r12
    vst1.8      {d17}, [r0,:64]
    bx          lr
endfunc

function x264_predict_8x8c_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    vtrn.32     d0, d1
    b           pred8x8_dc_end
endfunc

function x264_predict_8x8c_dc_left_neon
    mov         r1, #FDEC_STRIDE
    sub         r2, r0, #1
    ldcol.8     d0, r2, r1
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    b           pred8x8_dc_end
endfunc

function x264_predict_8x8c_dc_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    sub         r2, r0, #1
    ldcol.8     d1, r2, r1
    vtrn.32     d0, d1
    vpaddl.u8   q0, q0
    vpadd.u16   d0, d0, d1
    vpadd.u16   d1, d0, d0
    vrshrn.u16  d2, q0, #3
    vrshrn.u16  d3, q0, #2
    vdup.8      d0, d2[4]
    vdup.8      d1, d3[3]
    vdup.8      d4, d3[2]
    vdup.8      d5, d2[5]
    vtrn.32     q0, q2
pred8x8_dc_end:
    add         r2, r0, r1, lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    bx          lr
endfunc

function x264_predict_8x8c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 4
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
endfunc

function x264_predict_8x8c_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0,:64], ip
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    bx          lr
endfunc

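// predict_8x8c_p: chroma plane prediction. The H/V gradients are weighted
// with p16weight, then both slopes are formed as (17*X + 16) >> 5 and the
// per-row linear ramp is emitted eight pixels at a time.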
function x264_predict_8x8c_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #4
    sub         r3, r3, #1
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    ldcol.8     d0, r3, r1, 4, hi=1
    add         r3, r3, r1
    ldcol.8     d3, r3, r1, 4
    vaddl.u8    q8, d2, d3
    vrev32.8    d0, d0
    vtrn.32     d2, d3
    vsubl.u8    q2, d2, d0
    movrel      r3, p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4, d4, d0
    vmul.s16    d5, d5, d0
    vpadd.i16   d4, d4, d5
    vpaddl.s16  d4, d4
    vshl.i32    d5, d4, #4
    vadd.s32    d4, d4, d5
    vrshrn.s32  d4, q2, #5
    mov         r3, #0
    vtrn.16     d4, d5
    vadd.i16    d2, d4, d5
    vshl.i16    d3, d2, #2
    vrev64.16   d16, d16
    vsub.i16    d3, d3, d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2, d16, #4
    vsub.i16    d2, d2, d3
    vext.16     q0, q0, q0, #7
    vmov.16     d0[0], r3
    vmul.i16    q0, q0, d4[0]
    vdup.16     q1, d2[0]
    vdup.16     q3, d5[0]
    vadd.i16    q1, q1, q0
    mov         r3, #8
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16    q1, q1, q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc


function x264_predict_8x16c_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    vtrn.32     d0, d1

    add         r2, r0, r1, lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    add         r2, r2, r1, lsl #2
    add         r0, r0, r1, lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    bx          lr
endfunc

function x264_predict_8x16c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
endfunc

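// predict_8x16c_p: plane prediction for 8x16 chroma; the slopes differ per
// axis: b = (17*H + 16) >> 5 across 8 columns, c = (5*V + 32) >> 6 down
// 16 rows.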
function x264_predict_8x16c_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #4
    sub         r3, r3, #1
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    ldcol.8     d1, r3, r1
    add         r3, r3, r1
    ldcol.8     d3, r3, r1
    vrev64.32   d16, d3
    vaddl.u8    q8, d2, d16
    vrev32.8    d0, d0
    vsubl.u8    q2, d2, d0
    vrev64.8    d1, d1
    vsubl.u8    q3, d3, d1
    movrel      r3, p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4, d4, d0
    vmul.s16    q3, q3, q0
    vpadd.i16   d4, d4, d5
    vpadd.i16   d6, d6, d7
    vpaddl.s16  d4, d4        @ d4[0] = H
    vpaddl.s16  d6, d6
    vpadd.s32   d6, d6        @ d6[0] = V
    vshl.i32    d5, d4, #4
    vadd.s32    d4, d4, d5    @ d4[0] = 17*H
    vshl.i32    d7, d6, #2
    vrshrn.s32  d4, q2, #5    @ d4[0] = b
    vadd.s32    d6, d6, d7    @ d6[0] = 5*V
    vrshrn.s32  d6, q3, #6    @ d6[0] = c
    mov         r3, #0
    vshl.i16    d3, d4, #2
    vsub.i16    d3, d3, d4    @ d3[0] = 3 * b
    vshl.i16    d2, d6, #3
    vadd.i16    d3, d3, d2    @ d3[0] = 3 * b + 8 * c
    vsub.i16    d3, d3, d6    @ d3[0] = 3 * b + 7 * c
    vrev64.16   d16, d16
    vadd.i16    d16, d16, d0  @ d16[0] = src[]+src[] + 1
    vshl.i16    d2, d16, #4   @ d2[0] = a + 16
    vsub.i16    d2, d2, d3    @ d2[0] = i00
    vext.16     q0, q0, q0, #7
    vmov.16     d0[0], r3
    vmul.i16    q0, q0, d4[0]
    vdup.16     q1, d2[0]
    vdup.16     q3, d6[0]
    vadd.i16    q1, q1, q0
    mov         r3, #16
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16    q1, q1, q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc


function x264_predict_16x16_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {q0}, [r2,:128]
    add16x8     q0, d0, d1, d0, d1
    vrshrn.u16  d0, q0, #4
    vdup.8      q0, d0[0]
    b           pred16x16_dc_end
endfunc

function x264_predict_16x16_dc_left_neon
    mov         r1, #FDEC_STRIDE
    sub         r2, r0, #1
    ldcol.8     d0, r2, r1
    ldcol.8     d1, r2, r1
    add16x8     q0, d0, d1, d0, d1
    vrshrn.u16  d0, q0, #4
    vdup.8      q0, d0[0]
    b           pred16x16_dc_end
endfunc

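// predict_16x16_dc: the top row is summed with NEON, the left column with
// scalar ldrb loads; dc = (sum + 16) >> 5, broadcast over the whole block.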
function x264_predict_16x16_dc_neon
    sub         r3, r0, #FDEC_STRIDE
    sub         r0, r0, #1
    vld1.64     {d0-d1}, [r3,:128]
    ldrb        ip, [r0], #FDEC_STRIDE
    vaddl.u8    q0, d0, d1
    ldrb        r1, [r0], #FDEC_STRIDE
    vadd.u16    d0, d0, d1
    vpadd.u16   d0, d0, d0
    vpadd.u16   d0, d0, d0
.rept 4
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    ldrb        r1, [r0], #FDEC_STRIDE
    add         ip, ip, r3
.endr
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2

    sub         r0, r0, #FDEC_STRIDE*16
    add         ip, ip, r3
    vdup.16     d1, ip
    vadd.u16    d0, d0, d1
    mov         r1, #FDEC_STRIDE
    add         r0, r0, #1
    vrshr.u16   d0, d0, #5
    vdup.8      q0, d0[0]
pred16x16_dc_end:
.rept 16
    vst1.64     {d0-d1}, [r0,:128], r1
.endr
    bx          lr
endfunc

function x264_predict_16x16_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip
    vmov        d1, d0
    vld1.8      {d2[]}, [r1], ip
    vmov        d3, d2
    vst1.64     {d0-d1}, [r0,:128], ip
    vst1.64     {d2-d3}, [r0,:128], ip
.endr
    bx          lr
endfunc

function x264_predict_16x16_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0-d1}, [r0,:128], ip
.rept 16
    vst1.64     {d0-d1}, [r0,:128], ip
.endr
    bx          lr
endfunc

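// predict_16x16_p: luma plane prediction with b = (5*H + 32) >> 6 and
// c = (5*V + 32) >> 6; each loop iteration writes one 16-pixel row.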
function x264_predict_16x16_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #8
    sub         r3, r3, #1
    vld1.8      {d0}, [r3]
    vld1.8      {d2}, [r2,:64], r1
    ldcol.8     d1, r3, r1
    add         r3, r3, r1
    ldcol.8     d3, r3, r1
    vrev64.8    q0, q0
    vaddl.u8    q8, d2, d3
    vsubl.u8    q2, d2, d0
    vsubl.u8    q3, d3, d1
    movrel      r3, p16weight
    vld1.8      {q0}, [r3,:128]
    vmul.s16    q2, q2, q0
    vmul.s16    q3, q3, q0
    vadd.i16    d4, d4, d5
    vadd.i16    d5, d6, d7
    vpadd.i16   d4, d4, d5
    vpadd.i16   d4, d4, d4
    vshll.s16   q3, d4, #2
    vaddw.s16   q2, q3, d4
    vrshrn.s32  d4, q2, #6
    mov         r3, #0
    vtrn.16     d4, d5
    vadd.i16    d2, d4, d5
    vshl.i16    d3, d2, #3
    vrev64.16   d16, d17
    vsub.i16    d3, d3, d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2, d16, #4
    vsub.i16    d2, d2, d3
    vshl.i16    d3, d4, #4
    vext.16     q0, q0, q0, #7
    vsub.i16    d6, d5, d3
    vmov.16     d0[0], r3
    vmul.i16    q0, q0, d4[0]
    vdup.16     q1, d2[0]
    vdup.16     q2, d4[0]
    vdup.16     q3, d6[0]
    vshl.i16    q2, q2, #3
    vadd.i16    q1, q1, q0
    vadd.i16    q3, q3, q2
    mov         r3, #16
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16    q1, q1, q2
    vqshrun.s16 d1, q1, #5
    vadd.i16    q1, q1, q3
    vst1.8      {q0}, [r0,:128], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc