// p8weight: eight 16-bit weights, the sequence 1..4 stored twice.
// x264_predict_8x8c_p_neon loads the table as one .8h vector and multiplies
// a vector of edge differences by it.
// NOTE(review): the meaning of align=4 depends on the const macro, which is
// defined outside this view.
const p8weight, align=4
    .short 1, 2, 3, 4
    .short 1, 2, 3, 4
endconst
// p16weight: eight 16-bit weights 1..8.
// The plane predictors in this file (8x8c, 8x16c, 16x16) load the table as
// one .8h vector to weight edge differences, and later rotate it to build
// per-column multipliers.
// NOTE(review): the meaning of align=4 depends on the const macro, which is
// defined outside this view.
const p16weight, align=4
    .short 1, 2, 3, 4
    .short 5, 6, 7, 8
endconst
// ldcol.8 vd, xn, xm, n=8, hi=0
// Gathers single bytes from [xn] into consecutive byte lanes of vd. After
// every load xn is post-incremented by xm, so the macro leaves xn advanced
// by (number of loads) * xm.
//   n == 8          : lanes 0-7 are filled (8 loads)
//   n != 8, hi == 0 : only lanes 0-3 are filled (4 loads)
//   n != 8, hi == 1 : only lanes 4-7 are filled (4 loads)
// Lanes that are not loaded keep their previous contents.
.macro ldcol.8 vd, xn, xm, n=8, hi=0
.if \n == 8 || \hi == 0
.irp lane, 0, 1, 2, 3
    ld1 {\vd\().b}[\lane], [\xn], \xm
.endr
.endif
.if \n == 8 || \hi == 1
.irp lane, 4, 5, 6, 7
    ld1 {\vd\().b}[\lane], [\xn], \xm
.endr
.endif
.endm
// ldcol.16 vd, xn, xm
// Gathers sixteen single bytes from [xn] into byte lanes 0-15 of vd, stepping
// xn by xm after each load. Lanes 0-7 come from ldcol.8 with its default
// arguments; lanes 8-15 are loaded here.
.macro ldcol.16 vd, xn, xm
    ldcol.8 \vd, \xn, \xm
.irp lane, 8, 9, 10, 11, 12, 13, 14, 15
    ld1 {\vd\().b}[\lane], [\xn], \xm
.endr
.endm
// x264_predict_4x4_h_aarch64 -- scalar (general-register) 4x4 predictor.
// In:  x0 = base pointer; every load and store is addressed relative to it.
// Four bytes are loaded (w1..w4), each is multiplied by the constant held in
// w5, and the four 32-bit products are stored back through x0. Loads,
// multiplies and stores are interleaved.
// NOTE(review): the load/store offsets and the constant moved into w5 are
// missing in this view. Presumably w5 is a byte-replicating multiplier so
// that each stored word is one source byte repeated four times -- confirm.
function x264_predict_4x4_h_aarch64, export=1
ldrb w1, [x0,
mov w5,
ldrb w2, [x0,
ldrb w3, [x0,
mul w1, w1, w5
ldrb w4, [x0,
mul w2, w2, w5
str w1, [x0,
mul w3, w3, w5
str w2, [x0,
mul w4, w4, w5
str w3, [x0,
str w4, [x0,
ret
endfunc
// x264_predict_4x4_v_aarch64 -- scalar 4x4 predictor.
// In:  x0 = base pointer.
// Loads one 32-bit word through x0 and stores that same word four times
// through x0.
// NOTE(review): the offsets are missing in this view; presumably the word is
// the four pixels above the block and the stores fill its four rows.
function x264_predict_4x4_v_aarch64, export=1
ldr w1, [x0,
str w1, [x0,
str w1, [x0,
str w1, [x0,
str w1, [x0,
ret
endfunc
// x264_predict_4x4_dc_neon -- 4x4 DC predictor from two groups of four bytes.
// In:  x0 = base pointer.
// Group 1: four bytes loaded individually through x0 and summed in w4.
// Group 2: four contiguous bytes loaded from x1 (= x0 minus an offset) and
//          summed with uaddlv.
// Both sums are added, narrowed with a rounding right shift, and the
// resulting 4-byte pattern in s0 is stored to four locations through x0.
// NOTE(review): offsets and the shift count are missing in this view.
function x264_predict_4x4_dc_neon, export=1
sub x1, x0,
ldrb w4, [x0,
ldrb w5, [x0,
ldrb w6, [x0,
ldrb w7, [x0,
add w4, w4, w5
// ldr to an S register zeroes the rest of v0, so the 8-byte uaddlv below
// only accumulates the four bytes loaded here.
ldr s0, [x1]
add w6, w6, w7
uaddlv h0, v0.8b
add w4, w4, w6
// Broadcast both partial sums to four halfword lanes and combine them.
dup v0.4h, v0.h[0]
dup v1.4h, w4
add v0.4h, v0.4h, v1.4h
rshrn v0.8b, v0.8h,
str s0, [x0]
str s0, [x0,
str s0, [x0,
str s0, [x0,
ret
endfunc
// x264_predict_4x4_dc_top_neon -- 4x4 DC predictor from one 4-byte load.
// In:  x0 = base pointer.
// Loads four contiguous bytes from x1 (= x0 minus an offset), sums them,
// narrows the sum with a rounding right shift and stores the resulting
// 4-byte pattern in s0 to four locations through x0.
// NOTE(review): the offsets and the shift count are missing in this view.
function x264_predict_4x4_dc_top_neon, export=1
sub x1, x0,
// ldr to an S register zeroes the rest of v0, so the 8-byte uaddlv below
// only accumulates the four bytes loaded here.
ldr s0, [x1]
uaddlv h0, v0.8b
dup v0.4h, v0.h[0]
rshrn v0.8b, v0.8h,
str s0, [x0]
str s0, [x0,
str s0, [x0,
str s0, [x0,
ret
endfunc
// x264_predict_4x4_ddr_neon -- 4x4 predictor using a 3-tap smoothing filter.
// In:  x0 = base pointer.
// Builds one edge vector from an 8-byte load plus four single bytes fetched
// at successive x7 steps, filters it as (v0 + 2*v1 + v2) with a rounding
// narrowing shift, then stores four differently shifted 4-byte windows of
// the result through x0 with post-increment.
// NOTE(review): the start offset, the x7 step, the ext offsets, the shift
// count and the store increments are missing in this view.
function x264_predict_4x4_ddr_neon, export=1
sub x1, x0,
mov x7,
ld1 {v0.8b}, [x1], x7 // 8 contiguous bytes, then x1 += x7
ld1r {v1.8b}, [x1], x7 // one byte replicated to all lanes, x1 += x7
ld1r {v2.8b}, [x1], x7 // one byte replicated to all lanes, x1 += x7
// Each ext merges one replicated byte vector with the running edge vector.
ext v0.8b, v1.8b, v0.8b,
ld1r {v3.8b}, [x1], x7 // one byte replicated to all lanes, x1 += x7
ext v0.8b, v2.8b, v0.8b,
ld1r {v4.8b}, [x1], x7 // one byte replicated to all lanes, x1 += x7
ext v1.8b, v3.8b, v0.8b,
ext v2.8b, v4.8b, v1.8b,
// Widening sums: v0.8h = v0 + v1, v1.8h = v1 + v2, total = v0 + 2*v1 + v2.
uaddl v0.8h, v0.8b, v1.8b
uaddl v1.8h, v1.8b, v2.8b
add v0.8h, v0.8h, v1.8h
rshrn v0.8b, v0.8h,
// Three extra windows of the filtered vector; stored as s3, s2, s1, s0.
ext v3.8b, v0.8b, v0.8b,
ext v2.8b, v0.8b, v0.8b,
ext v1.8b, v0.8b, v0.8b,
str s3, [x0],
str s2, [x0],
str s1, [x0],
str s0, [x0]
ret
endfunc
// x264_predict_4x4_ddl_neon -- 4x4 predictor from a smoothed 8-byte edge.
// In:  x0 = base pointer (modified: moved back, then advanced by the load
//      and by the post-incrementing stores).
// Loads 8 bytes, forms two shifted copies (the second padded with the last
// byte, b[7]), combines them with uhadd followed by urhadd, and stores four
// 4-byte windows of the result.
// NOTE(review): the initial offset, the x7 step, the ext offsets and the
// store increments are missing in this view. Presumably the x7
// post-increment on the load returns x0 to the block start -- confirm.
function x264_predict_4x4_ddl_neon, export=1
sub x0, x0,
mov x7,
ld1 {v0.8b}, [x0], x7
dup v3.8b, v0.b[7]
ext v1.8b, v0.8b, v0.8b,
ext v2.8b, v0.8b, v3.8b,
// Truncating average of v0 and v2, then rounding average with v1.
uhadd v0.8b, v0.8b, v2.8b
urhadd v0.8b, v0.8b, v1.8b
str s0, [x0],
ext v1.8b, v0.8b, v0.8b,
ext v2.8b, v0.8b, v0.8b,
str s1, [x0],
ext v3.8b, v0.8b, v0.8b,
str s2, [x0],
str s3, [x0]
ret
endfunc
// x264_predict_8x8_dc_neon -- 8x8 DC predictor.
// In:  x0 = destination pointer, x1 = pointer to the source edge bytes
//      (both are used without being set here, so they are arguments).
// Loads 16 bytes and then 8 more from x1, sums two 8-byte groups, narrows
// the total with a rounding right shift and writes the resulting 8-byte
// vector to eight locations, stepping x0 by x7 after each store.
// NOTE(review): the x7 step, the load post-increment, the ext offset and
// the shift count are missing in this view.
function x264_predict_8x8_dc_neon, export=1
mov x7,
ld1 {v0.16b}, [x1],
ld1 {v1.8b}, [x1]
// Rotate v0 so the wanted 8 bytes land in its low half for uaddlv.
ext v0.16b, v0.16b, v0.16b,
uaddlv h1, v1.8b
uaddlv h0, v0.8b
add v0.8h, v0.8h, v1.8h
dup v0.8h, v0.h[0]
rshrn v0.8b, v0.8h,
.rept 8
st1 {v0.8b}, [x0], x7
.endr
ret
endfunc
// x264_predict_8x8_h_neon -- 8x8 predictor filling each row with one byte.
// In:  x0 = destination pointer, x1 = pointer to 16 source edge bytes.
// Edge bytes 14 down to 7 are each broadcast across 8 lanes and stored in
// that order, stepping x0 by x7 after every store. The dup for the next
// row is interleaved with the store of the previous one.
// NOTE(review): the value of x7 is missing in this view.
function x264_predict_8x8_h_neon, export=1
mov x7,
ld1 {v16.16b}, [x1]
dup v0.8b, v16.b[14]
dup v1.8b, v16.b[13]
st1 {v0.8b}, [x0], x7
dup v2.8b, v16.b[12]
st1 {v1.8b}, [x0], x7
dup v3.8b, v16.b[11]
st1 {v2.8b}, [x0], x7
dup v4.8b, v16.b[10]
st1 {v3.8b}, [x0], x7
dup v5.8b, v16.b[9]
st1 {v4.8b}, [x0], x7
dup v6.8b, v16.b[8]
st1 {v5.8b}, [x0], x7
dup v7.8b, v16.b[7]
st1 {v6.8b}, [x0], x7
st1 {v7.8b}, [x0], x7
ret
endfunc
// x264_predict_8x8_v_neon -- 8x8 predictor copying one 8-byte vector.
// In:  x0 = destination pointer, x1 = pointer to the source edge bytes.
// Advances x1 by an offset, loads 8 bytes and stores them eight times,
// stepping x0 by x7 after each store.
// NOTE(review): the x1 offset and the x7 step are missing in this view.
function x264_predict_8x8_v_neon, export=1
add x1, x1,
mov x7,
ld1 {v0.8b}, [x1]
.rept 8
st1 {v0.8b}, [x0], x7
.endr
ret
endfunc
// x264_predict_8x8_ddl_neon -- 8x8 predictor from a smoothed 16-byte edge.
// In:  x0 = destination pointer, x1 = pointer to the source edge bytes.
// Loads 16 bytes, builds two shifted neighbours (one padded from v3, one
// padded with the last byte b[15]), combines them with uhadd followed by
// urhadd, then stores eight 8-byte windows of the filtered vector, stepping
// x0 by x7 each time.
// NOTE(review): the x1 offset, x7, the movi value and every ext offset are
// missing in this view.
function x264_predict_8x8_ddl_neon, export=1
add x1, x1,
mov x7,
ld1 {v0.16b}, [x1]
movi v3.16b,
dup v2.16b, v0.b[15]
ext v4.16b, v3.16b, v0.16b,
ext v2.16b, v0.16b, v2.16b,
// Truncating average of the two neighbours, then rounding average with
// the centre vector.
uhadd v4.16b, v4.16b, v2.16b
urhadd v0.16b, v0.16b, v4.16b
// One ext per output row; each store is interleaved with the next ext.
ext v1.16b, v0.16b, v0.16b,
ext v2.16b, v0.16b, v0.16b,
st1 {v1.8b}, [x0], x7
ext v3.16b, v0.16b, v0.16b,
st1 {v2.8b}, [x0], x7
ext v4.16b, v0.16b, v0.16b,
st1 {v3.8b}, [x0], x7
ext v5.16b, v0.16b, v0.16b,
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b,
st1 {v5.8b}, [x0], x7
ext v7.16b, v0.16b, v0.16b,
st1 {v6.8b}, [x0], x7
ext v0.16b, v0.16b, v0.16b,
st1 {v7.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// x264_predict_8x8_ddr_neon -- 8x8 predictor from a smoothed 32-byte edge.
// In:  x0 = destination pointer, x1 = pointer to 32 source edge bytes.
// Loads 32 bytes into v0:v1, takes three 16-byte windows of that pair,
// combines them with uhadd followed by urhadd into v7, then stores v7 and
// seven further windows of it, stepping x0 by x7 each time.
// NOTE(review): the ext offsets, the x0 adjustment and x7 are missing in
// this view. Presumably x0 is moved to the last row and x7 is a negative
// step so rows are written bottom-up -- confirm.
function x264_predict_8x8_ddr_neon, export=1
ld1 {v0.16b,v1.16b}, [x1]
ext v2.16b, v0.16b, v1.16b,
ext v4.16b, v0.16b, v1.16b,
ext v3.16b, v0.16b, v1.16b,
uhadd v2.16b, v2.16b, v4.16b
urhadd v7.16b, v3.16b, v2.16b
add x0, x0,
mov x7,
// One ext per output row; each store is interleaved with the next ext.
ext v6.16b, v7.16b, v7.16b,
st1 {v7.8b}, [x0], x7
ext v5.16b, v7.16b, v7.16b,
st1 {v6.8b}, [x0], x7
ext v4.16b, v7.16b, v7.16b,
st1 {v5.8b}, [x0], x7
ext v3.16b, v7.16b, v7.16b,
st1 {v4.8b}, [x0], x7
ext v2.16b, v7.16b, v7.16b,
st1 {v3.8b}, [x0], x7
ext v1.16b, v7.16b, v7.16b,
st1 {v2.8b}, [x0], x7
ext v0.16b, v7.16b, v7.16b,
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// x264_predict_8x8_vl_neon -- 8x8 predictor alternating two filtered vectors.
// In:  x0 = destination pointer, x1 = pointer to the source edge bytes.
// From one 16-byte load it derives
//   v3 = rounding average of the edge and a shifted copy of it, and
//   v0 = rounding average of the edge with the truncating average of its
//        two shifted neighbours,
// then stores eight 8-byte rows that alternate between windows of v3 and
// windows of v0, stepping x0 by x7 each time.
// NOTE(review): the x1 offset, x7 and every ext offset are missing in this
// view.
function x264_predict_8x8_vl_neon, export=1
add x1, x1,
mov x7,
ld1 {v0.16b}, [x1]
// NOTE(review): v1 and v2 are read here before this function writes them,
// so the bytes they contribute are undefined. Presumably those bytes only
// reach lanes that are never stored -- confirm against the ext offsets.
ext v1.16b, v1.16b, v0.16b,
ext v2.16b, v0.16b, v2.16b,
uhadd v1.16b, v1.16b, v2.16b
urhadd v3.16b, v0.16b, v2.16b
urhadd v0.16b, v0.16b, v1.16b
ext v4.16b, v0.16b, v0.16b,
st1 {v3.8b}, [x0], x7
ext v5.16b, v3.16b, v3.16b,
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b,
st1 {v5.8b}, [x0], x7
ext v7.16b, v3.16b, v3.16b,
st1 {v6.8b}, [x0], x7
ext v4.16b, v0.16b, v0.16b,
st1 {v7.8b}, [x0], x7
ext v5.16b, v3.16b, v3.16b,
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b,
st1 {v5.8b}, [x0], x7
st1 {v6.8b}, [x0], x7
ret
endfunc
// x264_predict_8x8_vr_neon -- 8x8 predictor built from two filtered vectors.
// In:  x0 = destination pointer, x1 = pointer to the source edge bytes.
// From one 16-byte load (v2) and two shifted copies (v1, v0) it computes a
// 2-tap rounding average (kept in v2) and a 3-tap result (uhadd then
// urhadd, kept in v0). The first two rows come straight from windows of
// those vectors. The remaining six rows are formed by ext, which prepends
// bytes from the de-interleaved even/odd lanes of the 3-tap result (v2/v3
// after uzp1/uzp2) to the first two rows.
// NOTE(review): the x1 offset, x7 and every ext offset are missing in this
// view.
function x264_predict_8x8_vr_neon, export=1
add x1, x1,
mov x7,
ld1 {v2.16b}, [x1]
ext v1.16b, v2.16b, v2.16b,
ext v0.16b, v2.16b, v2.16b,
uhadd v3.16b, v2.16b, v1.16b
urhadd v2.16b, v2.16b, v0.16b
urhadd v0.16b, v0.16b, v3.16b
ext v1.16b, v2.16b, v2.16b,
// Split the low 8 bytes of the 3-tap result into even (v2) and odd (v3)
// lanes before v0 itself is rotated.
uzp1 v2.8b, v0.8b, v0.8b
uzp2 v3.8b, v0.8b, v0.8b
ext v0.16b, v0.16b, v0.16b,
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ext v4.8b, v3.8b, v1.8b,
ext v5.8b, v2.8b, v0.8b,
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
ext v6.8b, v3.8b, v1.8b,
ext v7.8b, v2.8b, v0.8b,
st1 {v6.8b}, [x0], x7
st1 {v7.8b}, [x0], x7
ext v1.8b, v3.8b, v1.8b,
ext v0.8b, v2.8b, v0.8b,
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// x264_predict_8x8_hd_neon -- 8x8 predictor from interleaved filter outputs.
// In:  x0 = destination pointer, x1 = pointer to the source edge bytes.
// From one 16-byte load and two shifted copies it computes a 2-tap rounding
// average (v4) and a 3-tap result (uhadd then urhadd, v0). The low halves
// of both are interleaved byte-wise into v16 (zip1) and v17 (zip2). Rows
// are 8-byte windows taken across v17:v7 and v16:v17, plus v17 and v16
// themselves, stored in the order shown with x0 stepping by x7.
// NOTE(review): the x1 offset, x7 and every ext offset are missing in this
// view.
function x264_predict_8x8_hd_neon, export=1
add x1, x1,
mov x7,
ld1 {v1.16b}, [x1]
ext v3.16b, v1.16b, v1.16b,
ext v2.16b, v1.16b, v1.16b,
urhadd v4.16b, v1.16b, v3.16b
uhadd v1.16b, v1.16b, v2.16b
urhadd v0.16b, v1.16b, v3.16b
zip1 v16.8b, v4.8b, v0.8b
zip2 v17.8b, v4.8b, v0.8b
// v7 is a rotated copy of the 3-tap result, used as the upper source of
// the windows for the first three rows.
ext v7.16b, v0.16b, v0.16b,
ext v0.8b, v17.8b, v7.8b,
ext v1.8b, v17.8b, v7.8b,
st1 {v0.8b}, [x0], x7
ext v2.8b, v17.8b, v7.8b,
st1 {v1.8b}, [x0], x7
st1 {v2.8b}, [x0], x7
ext v3.8b, v16.8b, v17.8b,
st1 {v17.8b}, [x0], x7
ext v4.8b, v16.8b, v17.8b,
st1 {v3.8b}, [x0], x7
ext v5.8b, v16.8b, v17.8b,
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
st1 {v16.8b}, [x0], x7
ret
endfunc
// x264_predict_8x8_hu_neon -- 8x8 predictor from a reversed 8-byte edge.
// In:  x0 = destination pointer, x1 = pointer to the source edge bytes.
// Loads 8 bytes, remembers byte 0 broadcast in v6, reverses the byte order,
// and forms shifted copies padded from v6. A 2-tap rounding average (v0)
// and a 3-tap result (uhadd then urhadd, v1) are interleaved into v16/v17.
// Rows are v16, three windows across v16:v17, v17, and three windows across
// v17:v18, where v18 repeats the last halfword of v17.
// NOTE(review): the x1 offset, x7 and every ext offset are missing in this
// view.
function x264_predict_8x8_hu_neon, export=1
add x1, x1,
mov x7,
ld1 {v7.8b}, [x1]
dup v6.8b, v7.b[0]
rev64 v7.8b, v7.8b
ext v4.8b, v7.8b, v6.8b,
ext v2.8b, v7.8b, v6.8b,
uhadd v5.8b, v7.8b, v4.8b
urhadd v0.8b, v2.8b, v7.8b
urhadd v1.8b, v5.8b, v2.8b
zip1 v16.8b, v0.8b, v1.8b
zip2 v17.8b, v0.8b, v1.8b
// Padding source for the windows of the last three rows.
dup v18.4h, v17.h[3]
ext v0.8b, v16.8b, v17.8b,
ext v1.8b, v16.8b, v17.8b,
ext v2.8b, v16.8b, v17.8b,
st1 {v16.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x0], x7
st1 {v2.8b}, [x0], x7
ext v4.8b, v17.8b, v18.8b,
ext v5.8b, v17.8b, v18.8b,
ext v6.8b, v17.8b, v18.8b,
st1 {v17.8b}, [x0], x7
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
// Final row: no post-increment.
st1 {v6.8b}, [x0]
ret
endfunc
// x264_predict_8x8c_dc_top_neon -- 8x8 chroma DC predictor from one 8-byte
// load.
// In:  x0 = destination pointer.
// Loads 8 bytes from x2 (= x0 minus an offset), reduces them to one sum per
// 4-byte half (uaddlp then addp), narrows with a rounding shift, broadcasts
// the two results into v2/v3 and combines them into v0/v1 with the
// transpose macro. It then branches to pred8x8c_dc_end, a label inside
// x264_predict_8x8c_dc_neon, which stores v0 to four rows and v1 to four
// rows using x1 as the step.
// NOTE(review): the offset, the value of x1 and the shift count are missing
// in this view; the transpose macro is defined outside it.
function x264_predict_8x8c_dc_top_neon, export=1
sub x2, x0,
mov x1,
ld1 {v0.8b}, [x2]
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
rshrn v0.8b, v0.8h,
dup v3.8b, v0.b[1]
dup v2.8b, v0.b[0]
transpose v0.2s, v1.2s, v2.2s, v3.2s
b pred8x8c_dc_end
endfunc
// x264_predict_8x8c_dc_left_neon -- 8x8 chroma DC predictor from eight
// individually loaded bytes.
// In:  x0 = destination pointer.
// The first four bytes are summed into w2 and the next four into w6. Each
// sum is broadcast and narrowed with a rounding shift, giving v0 and v1.
// Control then branches to pred8x8c_dc_end, a label inside
// x264_predict_8x8c_dc_neon, which stores v0 to four rows and v1 to four
// rows using x1 as the step.
// NOTE(review): the load offsets, the value of x1 and the shift counts are
// missing in this view.
function x264_predict_8x8c_dc_left_neon, export=1
ldrb w2, [x0,
ldrb w3, [x0,
ldrb w4, [x0,
ldrb w5, [x0,
mov x1,
add w2, w2, w3
add w3, w4, w5
ldrb w6, [x0,
ldrb w7, [x0,
ldrb w8, [x0,
ldrb w9, [x0,
add w6, w6, w7
add w7, w8, w9
// w2 = sum of the first four bytes, w6 = sum of the last four bytes.
add w2, w2, w3
add w6, w6, w7
dup v0.8h, w2
dup v1.8h, w6
rshrn v0.8b, v0.8h,
rshrn v1.8b, v1.8h,
b pred8x8c_dc_end
endfunc
// x264_predict_8x8c_dc_neon -- 8x8 chroma DC predictor using both an 8-byte
// vector load and eight individually loaded bytes.
// In:  x0 = destination pointer.
// The eight scalar bytes are reduced to pairwise sums and packed into x10
// with shifted adds, then moved into v1 (s2, s3 per the comments below).
// The vector load is reduced pairwise (s0, s1). Sums are combined, narrowed
// with a rounding shift and arranged into v0 and v1.
// The tail at pred8x8c_dc_end is shared: the dc_top and dc_left variants
// branch to it with v0, v1 and x1 already set. It stores v0 to four rows
// and v1 to four rows.
// NOTE(review): the offsets, the value of x1, the lsl amounts and the shift
// counts are missing in this view.
function x264_predict_8x8c_dc_neon, export=1
mov x1,
sub x2, x0,
ldrb w10, [x0,
ldrb w11, [x0,
ldrb w12, [x0,
ldrb w13, [x0,
add w10, w10, w11
ldrb w4, [x0,
ldrb w5, [x0,
add w12, w12, w13
ldrb w6, [x0,
ldrb w7, [x0,
add w4, w4, w5
add w6, w6, w7
// Pack the pairwise sums into fields of one register.
add w10, w10, w12, lsl
add w4, w4, w6, lsl
ld1 {v0.8b}, [x2]
add x10, x10, x4, lsl
uaddlp v0.4h, v0.8b // s0, s1
mov v1.d[0], x10 // s2, s3
add v3.4h, v0.4h, v1.4h
addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3
addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3
// Duplicate the low 64 bits into both halves so the 8h narrowing below
// sees valid data in every lane.
uzp1 v1.2d, v1.2d, v1.2d
uzp1 v0.2d, v0.2d, v0.2d
rshrn v3.8b, v1.8h,
rshrn v2.8b, v0.8h,
uzp1 v0.8b, v3.8b, v2.8b
uzp2 v1.8b, v2.8b, v3.8b
// Shared store tail. Expects: x0 = destination, x1 = step, v0 and v1 = the
// two 8-byte row patterns. Four write pointers (x0, x2, x4, x5) each take
// two stores.
pred8x8c_dc_end:
add x2, x0,
add x4, x0,
add x5, x0,
st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x2], x1
st1 {v0.8b}, [x0]
st1 {v0.8b}, [x2]
st1 {v1.8b}, [x4], x1
st1 {v1.8b}, [x5], x1
st1 {v1.8b}, [x4]
st1 {v1.8b}, [x5]
ret
endfunc
// x264_predict_8x8c_h_neon -- 8x8 chroma predictor filling each row with
// one byte.
// In:  x0 = destination pointer.
// x1 (= x0 minus an offset) walks the source bytes and x0 walks the output;
// both step by x7. Each of the four iterations loads two bytes, replicates
// each across 8 lanes and stores two rows, for eight rows in total.
// NOTE(review): the offset and the x7 step are missing in this view.
function x264_predict_8x8c_h_neon, export=1
sub x1, x0,
mov x7,
.rept 4
ld1r {v0.8b}, [x1], x7
ld1r {v1.8b}, [x1], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x0], x7
.endr
ret
endfunc
// x264_predict_8x8c_v_aarch64 -- scalar 8x8 chroma predictor.
// In:  x0 = base pointer.
// Loads one 64-bit value through x0 and emits eight 64-bit stores of it
// through x0, one per .irp iteration (c = 0..7).
// NOTE(review): the load and store offsets are missing in this view;
// presumably each store offset is derived from the .irp variable c.
function x264_predict_8x8c_v_aarch64, export=1
ldr x1, [x0,
.irp c, 0,1,2,3,4,5,6,7
str x1, [x0,
.endr
ret
endfunc
// x264_predict_8x8c_p_neon -- 8x8 chroma plane (gradient) predictor.
// In:  x0 = destination pointer.
// Edge samples are gathered in two 4-sample groups each, both from
// contiguous loads (x3, x2) and from byte-wise gathers with step x1
// (ldcol.8). Differences between the mirrored groups are weighted by
// p8weight and reduced to two gradient terms (v5.h[0] = b, v5.h[1] = c per
// the comments below). A start value is derived from the sum of two edge
// samples. The loop then emits one 8-byte row per iteration, adding c to
// every column accumulator after each row. Outputs are narrowed with
// unsigned saturation.
// Clobbers: x1-x5, v0-v5, v7, flags.
// NOTE(review): the pointer offsets, the shift counts, the ext offset and
// the loop count are missing in this view.
function x264_predict_8x8c_p_neon, export=1
sub x3, x0,
mov x1,
add x2, x3,
sub x3, x3,
ld1 {v0.s}[0], [x3]
ld1 {v2.s}[0], [x2], x1
// Fill v0 lanes 4-7 and v3 lanes 0-3 with gathered bytes.
ldcol.8 v0, x3, x1, 4, hi=1
add x3, x3, x1
ldcol.8 v3, x3, x1, 4
movrel x4, p8weight
movrel x5, p16weight
uaddl v4.8h, v2.8b, v3.8b
// Mirror each 4-byte group of v0 so it pairs with the opposite samples.
rev32 v0.8b, v0.8b
trn1 v2.2s, v2.2s, v3.2s
ld1 {v7.8h}, [x4]
usubl v2.8h, v2.8b, v0.8b
mul v2.8h, v2.8h, v7.8h
ld1 {v0.8h}, [x5]
// Reduce the weighted differences to one sum per group.
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s
shl v3.2s, v2.2s,
add v2.2s, v2.2s, v3.2s
rshrn v5.4h, v2.4s,
addp v2.4h, v5.4h, v5.4h
shl v3.4h, v2.4h,
sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
rev64 v4.4h, v4.4h
add v4.4h, v4.4h, v0.4h
shl v2.4h, v4.4h,
sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
// Rotate the p16weight vector and clear lane 0 to obtain the column
// multipliers noted on the mul below.
ext v0.16b, v0.16b, v0.16b,
mov v0.h[0], wzr
mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v2.h[0] // pix
dup v2.8h, v5.h[1] // c
add v1.8h, v1.8h, v0.8h // pix + x*b
mov x3,
// Row loop: x3 counts rows down, v1 holds the current row accumulators.
1:
subs x3, x3,
sqshrun v0.8b, v1.8h,
add v1.8h, v1.8h, v2.8h
st1 {v0.8b}, [x0], x1
b.ne 1b
ret
endfunc
// loadsum4 wd, t1, t2, t3, x, idx
// Loads four bytes addressed relative to register x into wd, t1, t2 and t3,
// and leaves their sum in wd. t1 is overwritten with t2 + t3 along the way.
// t2 and t3 keep the loaded bytes.
// NOTE(review): the address offsets are missing in this view; presumably
// they are derived from idx, which is not referenced in the visible text.
.macro loadsum4 wd, t1, t2, t3, x, idx
ldrb \wd, [\x,
ldrb \t1, [\x,
ldrb \t2, [\x,
ldrb \t3, [\x,
add \wd, \wd, \t1
add \t1, \t2, \t3
add \wd, \wd, \t1
.endm
// x264_predict_8x16c_h_neon -- 8x16 chroma predictor filling each row with
// one byte.
// In:  x0 = destination pointer.
// Uses two read pointers (x2, x3) and two write pointers (x0, x1), all
// stepping by x7. Each of the four iterations loads four bytes, replicates
// each across 8 lanes and stores four rows, for sixteen rows in total.
// NOTE(review): the pointer offsets and the x7 step are missing in this
// view.
function x264_predict_8x16c_h_neon, export=1
sub x2, x0,
add x3, x0,
mov x7,
add x1, x0,
.rept 4
ld1r {v0.8b}, [x2], x7
ld1r {v1.8b}, [x3], x7
ld1r {v2.8b}, [x2], x7
ld1r {v3.8b}, [x3], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x1], x7
st1 {v2.8b}, [x0], x7
st1 {v3.8b}, [x1], x7
.endr
ret
endfunc
// x264_predict_8x16c_v_neon -- 8x16 chroma predictor copying one 8-byte
// vector.
// In:  x0 = destination pointer.
// Loads 8 bytes from x1 (= x0 minus an offset). The load's post-increment
// by x2 turns x1 into a second write pointer. The same vector is then
// stored sixteen times, alternating between x0 and x1, both stepping by x2.
// NOTE(review): the offset and the x2 step are missing in this view.
function x264_predict_8x16c_v_neon, export=1
sub x1, x0,
mov x2,
ld1 {v0.8b}, [x1], x2
.rept 8
st1 {v0.8b}, [x0], x2
st1 {v0.8b}, [x1], x2
.endr
ret
endfunc
// x264_predict_8x16c_p_neon -- 8x16 chroma plane (gradient) predictor.
// In:  x0 = destination pointer.
// Edge samples come from two contiguous 8-byte loads (x3, x2) and two
// byte-wise gathers with step x1 (ldcol.8). Mirrored differences are
// weighted by p16weight and reduced to H (v2) and V (v3). These are scaled
// into the b and c terms, and a start value i00 is derived from the sum of
// two edge samples. The loop writes two 8-byte rows per iteration, adding c
// to the row accumulators after each row. Outputs are narrowed with
// rounding and unsigned saturation.
// NOTE(review): the pointer offsets, the ext offsets, the shift counts and
// the loop count are missing in this view.
function x264_predict_8x16c_p_neon, export=1
movrel x4, p16weight
ld1 {v17.8h}, [x4]
sub x3, x0,
mov x1,
add x2, x3,
sub x3, x3,
ld1 {v0.8b}, [x3]
ld1 {v2.8b}, [x2], x1
ldcol.8 v1, x3, x1
add x3, x3, x1
ldcol.8 v3, x3, x1
ext v4.8b, v2.8b, v2.8b,
ext v5.8b, v3.8b, v3.8b,
// Mirror the sample groups so they pair with the opposite side.
rev32 v0.8b, v0.8b
rev64 v1.8b, v1.8b
uaddl v4.8h, v5.8b, v4.8b // a * 1/16
usubl v2.8h, v2.8b, v0.8b
mul v2.8h, v2.8h, v17.8h
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s // H
usubl v3.8h, v3.8b, v1.8b
mul v3.8h, v3.8h, v17.8h
saddlp v3.4s, v3.8h
addp v3.4s, v3.4s, v3.4s
addp v3.4s, v3.4s, v3.4s // V
// Rotate the weight vector. Lane 0 is cleared further down to obtain the
// column multipliers noted on the mul.
ext v17.16b, v17.16b, v17.16b,
shl v4.4h, v4.4h,
shl v6.2s, v2.2s,
shl v7.2s, v3.2s,
add v2.2s, v2.2s, v6.2s // 17 * H
add v3.2s, v3.2s, v7.2s // 5 * V
rshrn v2.4h, v2.4s,
rshrn v3.4h, v3.4s,
mov v17.h[0], wzr
sub v4.4h, v4.4h, v2.4h // a - b
shl v6.4h, v2.4h,
add v4.4h, v4.4h, v3.4h // a - b + c
shl v7.4h, v3.4h,
sub v4.4h, v4.4h, v6.4h // a - 3b + c
sub v4.4h, v4.4h, v7.4h // a - 3b - 7c
mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v4.h[0] // i00
dup v2.8h, v3.h[0] // c
add v1.8h, v1.8h, v0.8h // pix + {0..7}*b
mov x3,
// Row loop: two rows per iteration, x3 counts iterations down.
1:
subs x3, x3,
sqrshrun v4.8b, v1.8h,
add v1.8h, v1.8h, v2.8h
sqrshrun v5.8b, v1.8h,
st1 {v4.8b}, [x0], x1
add v1.8h, v1.8h, v2.8h
st1 {v5.8b}, [x0], x1
b.ne 1b
ret
endfunc
// x264_predict_8x16c_dc_neon -- 8x16 chroma DC predictor.
// In:  x0 = destination pointer.
// Sums: s0 and s1 come from an 8-byte vector load at x10 (= x0 minus an
// offset). s2..s5 each come from four bytes summed by loadsum4 at indices
// 0, 4, 8 and 12. The sums are broadcast, paired with ext and add, and
// narrowed with a rounding shift into four row patterns v0..v3. Each
// pattern is written to four rows through its own pointer (x0, x11, x12,
// x13), all stepping by x1.
// NOTE(review): the offsets, the value of x1, the ext offsets and the shift
// counts are missing in this view.
function x264_predict_8x16c_dc_neon, export=1
mov x1,
sub x10, x0,
loadsum4 w2, w3, w4, w5, x0, 0
ld1 {v6.8b}, [x10]
loadsum4 w6, w7, w8, w9, x0, 4
uaddlp v6.4h, v6.8b
dup v22.8h, w2 // s2
dup v23.8h, w6 // s3
loadsum4 w2, w3, w4, w5, x0, 8
addp v6.4h, v6.4h, v6.4h // s0, s1
loadsum4 w6, w7, w8, w9, x0, 12
dup v20.8h, v6.h[0] // s0
dup v21.8h, v6.h[1] // s1
dup v24.8h, w2 // s4
dup v25.8h, w6 // s5
// Each ext pairs one broadcast sum with s1 (v21) in the other lanes.
ext v16.16b, v20.16b, v21.16b,
ext v17.16b, v22.16b, v21.16b,
ext v1.16b, v23.16b, v21.16b,
ext v2.16b, v24.16b, v21.16b,
ext v3.16b, v25.16b, v21.16b,
add v0.8h, v16.8h, v17.8h
add v1.8h, v1.8h, v23.8h
add v2.8h, v2.8h, v24.8h
add v3.8h, v3.8h, v25.8h
rshrn v0.8b, v0.8h,
rshrn v1.8b, v1.8h,
rshrn v2.8b, v2.8h,
rshrn v3.8b, v3.8h,
add x11, x0,
add x12, x0,
add x13, x0,
.rept 4
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x11], x1
st1 {v2.8b}, [x12], x1
st1 {v3.8b}, [x13], x1
.endr
ret
endfunc
// x264_predict_8x16c_dc_left_neon -- 8x16 chroma DC predictor from sixteen
// individually loaded bytes.
// In:  x0 = destination pointer.
// The bytes are summed in four groups of four (w2, w6, w10, then w2 again).
// Each sum is broadcast and narrowed with a rounding shift into v0..v3, and
// every pattern is stored to four consecutive rows, stepping x0 by x1. The
// stores of v0 are interleaved with the arithmetic for the fourth group.
// NOTE(review): the load offsets, the value of x1 and the shift counts are
// missing in this view.
function x264_predict_8x16c_dc_left_neon, export=1
mov x1,
ldrb w2, [x0,
ldrb w3, [x0,
ldrb w4, [x0,
ldrb w5, [x0,
add w2, w2, w3
ldrb w6, [x0,
add w4, w4, w5
ldrb w7, [x0,
add w2, w2, w4
ldrb w8, [x0,
ldrb w9, [x0,
dup v0.8h, w2
add w6, w6, w7
rshrn v0.8b, v0.8h,
add w8, w8, w9
ldrb w10, [x0,
ldrb w11, [x0,
add w6, w6, w8
ldrb w12, [x0,
ldrb w13, [x0,
dup v1.8h, w6
add w10, w10, w11
rshrn v1.8b, v1.8h,
add w12, w12, w13
// w2..w5 are reused for the fourth group. All loads through x0 are issued
// before the first post-incrementing store below modifies x0.
ldrb w2, [x0,
ldrb w3, [x0,
add w10, w10, w12
ldrb w4, [x0,
ldrb w5, [x0,
dup v2.8h, w10
add w2, w2, w3
rshrn v2.8b, v2.8h,
add w4, w4, w5
st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x0], x1
add w2, w2, w4
st1 {v0.8b}, [x0], x1
dup v3.8h, w2
st1 {v0.8b}, [x0], x1
rshrn v3.8b, v3.8h,
// Remaining twelve rows: v1, v2 and v3, four stores each.
.irp idx, 1, 2, 3
.rept 4
st1 {v\idx\().8b}, [x0], x1
.endr
.endr
ret
endfunc
// x264_predict_8x16c_dc_top_neon -- 8x16 chroma DC predictor from one
// 8-byte load.
// In:  x0 = destination pointer.
// Loads 8 bytes from x2 (= x0 minus an offset), reduces them to one sum per
// 4-byte half (uaddlp then addp), narrows with a rounding shift, broadcasts
// the two results and merges them into one 8-byte pattern with ext. That
// pattern is stored sixteen times, stepping x0 by x1.
// NOTE(review): the offset, the value of x1, the shift count and the ext
// offset are missing in this view.
function x264_predict_8x16c_dc_top_neon, export=1
sub x2, x0,
mov x1,
ld1 {v0.8b}, [x2]
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
rshrn v4.8b, v0.8h,
dup v0.8b, v4.b[0]
dup v1.8b, v4.b[1]
ext v0.8b, v0.8b, v1.8b,
.rept 16
st1 {v0.8b}, [x0], x1
.endr
ret
endfunc
// x264_predict_16x16_dc_top_neon -- 16x16 DC predictor from one 16-byte
// load.
// In:  x0 = destination pointer.
// Loads 16 bytes from x2 (= x0 minus an offset), sums them, narrows the sum
// with a rounding shift and broadcasts the resulting byte to all 16 lanes
// of v0. It then branches to pred16x16_dc_end, a label inside
// x264_predict_16x16_dc_neon, which stores v0 to sixteen rows using x1 as
// the step.
// NOTE(review): the offset, the value of x1 and the shift count are missing
// in this view.
function x264_predict_16x16_dc_top_neon, export=1
sub x2, x0,
mov x1,
ld1 {v0.16b}, [x2]
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h,
dup v0.16b, v0.b[0]
b pred16x16_dc_end
endfunc
// x264_predict_16x16_dc_left_neon -- 16x16 DC predictor from sixteen
// gathered bytes.
// In:  x0 = destination pointer.
// Gathers 16 bytes with ldcol.16 starting at x2 (= x0 minus an offset) with
// step x1, sums them, narrows the sum with a rounding shift and broadcasts
// the resulting byte to all 16 lanes of v0. It then branches to
// pred16x16_dc_end, a label inside x264_predict_16x16_dc_neon, which stores
// v0 to sixteen rows using x1 as the step.
// NOTE(review): the offset, the value of x1 and the shift count are missing
// in this view.
function x264_predict_16x16_dc_left_neon, export=1
sub x2, x0,
mov x1,
ldcol.16 v0, x2, x1
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h,
dup v0.16b, v0.b[0]
b pred16x16_dc_end
endfunc
// x264_predict_16x16_dc_neon -- 16x16 DC predictor from a 16-byte vector
// load plus sixteen gathered bytes.
// In:  x0 = destination pointer.
// Sums the 16 contiguous bytes at x3 and the 16 bytes gathered from x2 with
// step x1 (ldcol.16), adds both sums, narrows with a rounding shift and
// broadcasts the resulting byte to all 16 lanes of v0.
// The tail at pred16x16_dc_end is shared: the dc_top and dc_left variants
// branch to it with v0 and x1 already set. It stores v0 to sixteen rows.
// NOTE(review): the offsets, the value of x1 and the shift count are
// missing in this view.
function x264_predict_16x16_dc_neon, export=1
sub x3, x0,
sub x2, x0,
mov x1,
ld1 {v0.16b}, [x3]
ldcol.16 v1, x2, x1
uaddlv h0, v0.16b
uaddlv h1, v1.16b
add v0.4h, v0.4h, v1.4h
rshrn v0.8b, v0.8h,
dup v0.16b, v0.b[0]
// Shared store tail. Expects: x0 = destination, x1 = step, v0 = row
// pattern.
pred16x16_dc_end:
.rept 16
st1 {v0.16b}, [x0], x1
.endr
ret
endfunc
// x264_predict_16x16_h_neon -- 16x16 predictor filling each row with one
// byte.
// In:  x0 = destination pointer.
// x1 (= x0 minus an offset) walks the source bytes and x0 walks the output;
// both step by x7. Each of the eight iterations loads two bytes, replicates
// each across 16 lanes and stores two rows, for sixteen rows in total.
// NOTE(review): the offset and the x7 step are missing in this view.
function x264_predict_16x16_h_neon, export=1
sub x1, x0,
mov x7,
.rept 8
ld1r {v0.16b}, [x1], x7
ld1r {v1.16b}, [x1], x7
st1 {v0.16b}, [x0], x7
st1 {v1.16b}, [x0], x7
.endr
ret
endfunc
// x264_predict_16x16_v_neon -- 16x16 predictor copying one 16-byte vector.
// In:  x0 = destination pointer (moved back by an offset before the load).
// Loads 16 bytes from the adjusted x0, with a post-increment by x7, then
// stores that vector sixteen times, stepping x0 by x7.
// NOTE(review): the offset and the x7 step are missing in this view.
// Presumably they are equal, so the load's post-increment returns x0 to the
// first output row -- confirm.
function x264_predict_16x16_v_neon, export=1
sub x0, x0,
mov x7,
ld1 {v0.16b}, [x0], x7
.rept 16
st1 {v0.16b}, [x0], x7
.endr
ret
endfunc
// x264_predict_16x16_p_neon -- 16x16 plane (gradient) predictor.
// In:  x0 = destination pointer.
// Edge samples come from two contiguous 8-byte loads (x3, x2) and two
// byte-wise gathers with step x1 (ldcol.8). Mirrored differences are
// weighted by p16weight and reduced to the gradient terms (v5.h[0] = b,
// v5.h[1] = c per the comments below). A start value is derived from the
// sum of two edge samples. Two accumulators hold columns 0-7 (v1) and 8-15
// (v3). The loop emits one 16-byte row per iteration and adds c to both
// accumulators. Outputs are narrowed with unsigned saturation.
// NOTE(review): the pointer offsets, the ext offsets, the shift counts and
// the loop count are missing in this view.
function x264_predict_16x16_p_neon, export=1
sub x3, x0,
mov x1,
add x2, x3,
sub x3, x3,
ld1 {v0.8b}, [x3]
ld1 {v2.8b}, [x2], x1
ldcol.8 v1, x3, x1
add x3, x3, x1
ldcol.8 v3, x3, x1
// Mirror the sample groups so they pair with the opposite side.
rev64 v0.8b, v0.8b
rev64 v1.8b, v1.8b
movrel x4, p16weight
uaddl v4.8h, v2.8b, v3.8b
ld1 {v7.8h}, [x4]
usubl v2.8h, v2.8b, v0.8b
usubl v3.8h, v3.8b, v1.8b
mul v2.8h, v2.8h, v7.8h
mul v3.8h, v3.8h, v7.8h
// Reduce both weighted difference vectors to one sum each.
saddlp v2.4s, v2.8h
saddlp v3.4s, v3.8h
addp v2.4s, v2.4s, v3.4s
addp v2.4s, v2.4s, v2.4s
shl v3.2s, v2.2s,
add v2.2s, v2.2s, v3.2s
rshrn v5.4h, v2.4s,
addp v2.4h, v5.4h, v5.4h
shl v3.4h, v2.4h,
sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
ext v4.16b, v4.16b, v4.16b,
add v4.4h, v4.4h, v7.4h
shl v2.4h, v4.4h,
sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
// Rotate the weight vector and clear lane 0 to obtain the column
// multipliers noted on the mul below.
ext v7.16b, v7.16b, v7.16b,
mov v7.h[0], wzr
dup v3.8h, v5.h[0]
mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v2.h[0] // pix
dup v2.8h, v5.h[1] // c
// v3 = b shifted left: the extra offset for the upper eight columns.
shl v3.8h, v3.8h,
add v1.8h, v1.8h, v0.8h // pix + x*b
add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
mov x3,
// Row loop: x3 counts rows down. v1 and v3 hold the two halves of the row.
1:
subs x3, x3,
sqshrun v0.8b, v1.8h,
add v1.8h, v1.8h, v2.8h
sqshrun2 v0.16b, v3.8h,
add v3.8h, v3.8h, v2.8h
st1 {v0.16b}, [x0], x1
b.ne 1b
ret
endfunc