/*****************************************************************************
 * deblock.S: aarch64 deblocking
 *****************************************************************************
 * Copyright (C) 2009-2016 x264 project
 *
 * Authors: Mans Rullgard <[email protected]>
 *          Janne Grunau <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "asm.S"

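// Register/argument convention for the non-intra entry points below, as
// inferred from the code and from x264's C prototypes:
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta,
//   x4 = pointer to four int8_t tc0 values (one per 4-pixel segment).
// h264_loop_filter_start loads the four tc0 bytes into v24.s[0] and makes
// the caller return early when alpha == 0, beta == 0, or all four tc0
// values are negative (nothing to filter).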
.macro h264_loop_filter_start
    cmp         w2, #0
    ldr         w6, [x4]
    ccmp        w3, #0, #0, ne
    mov         v24.s[0], w6
    and         w8, w6, w6, lsl #16
    b.eq        1f
    ands        w8, w8, w8, lsl #8
    b.ge        2f
1:
    ret
2:
.endm

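// h264_loop_filter_luma: normal (bS < 4) filter on a 16-pixel luma edge.
// Register layout assumed by the callers below:
//   p2 = v20, p1 = v18, p0 = v16, q0 = v0, q1 = v2, q2 = v4, tc0 in v24.
// Results: p1' in v17, p0' in v16, q0' in v0, q1' in v19.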
.macro h264_loop_filter_luma
    dup         v22.16b, w2                // alpha
    uxtl        v24.8h, v24.8b
    uabd        v21.16b, v16.16b, v0.16b   // abs(p0 - q0)
    uxtl        v24.4s, v24.4h
    uabd        v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
    sli         v24.8h, v24.8h, #8
    uabd        v30.16b, v2.16b, v0.16b    // abs(q1 - q0)
    sli         v24.4s, v24.4s, #16
    cmhi        v21.16b, v22.16b, v21.16b  // < alpha
    dup         v22.16b, w3                // beta
    cmlt        v23.16b, v24.16b, #0
    cmhi        v28.16b, v22.16b, v28.16b  // < beta
    cmhi        v30.16b, v22.16b, v30.16b  // < beta
    bic         v21.16b, v21.16b, v23.16b
    uabd        v17.16b, v20.16b, v16.16b  // abs(p2 - p0)
    and         v21.16b, v21.16b, v28.16b
    uabd        v19.16b, v4.16b, v0.16b    // abs(q2 - q0)
    cmhi        v17.16b, v22.16b, v17.16b  // < beta
    and         v21.16b, v21.16b, v30.16b
    cmhi        v19.16b, v22.16b, v19.16b  // < beta
    and         v17.16b, v17.16b, v21.16b
    and         v19.16b, v19.16b, v21.16b
    and         v24.16b, v24.16b, v21.16b
    urhadd      v28.16b, v16.16b, v0.16b
    sub         v21.16b, v24.16b, v17.16b
    uqadd       v23.16b, v18.16b, v24.16b
    uhadd       v20.16b, v20.16b, v28.16b
    sub         v21.16b, v21.16b, v19.16b
    uhadd       v28.16b, v4.16b, v28.16b
    umin        v23.16b, v23.16b, v20.16b
    uqsub       v22.16b, v18.16b, v24.16b
    uqadd       v4.16b, v2.16b, v24.16b
    umax        v23.16b, v23.16b, v22.16b
    uqsub       v22.16b, v2.16b, v24.16b
    umin        v28.16b, v4.16b, v28.16b
    uxtl        v4.8h, v0.8b
    umax        v28.16b, v28.16b, v22.16b
    uxtl2       v20.8h, v0.16b
    usubw       v4.8h, v4.8h, v16.8b
    usubw2      v20.8h, v20.8h, v16.16b
    shl         v4.8h, v4.8h, #2
    shl         v20.8h, v20.8h, #2
    uaddw       v4.8h, v4.8h, v18.8b
    uaddw2      v20.8h, v20.8h, v18.16b
    usubw       v4.8h, v4.8h, v2.8b
    usubw2      v20.8h, v20.8h, v2.16b
    rshrn       v4.8b, v4.8h, #3
    rshrn2      v4.16b, v20.8h, #3
    bsl         v17.16b, v23.16b, v18.16b
    bsl         v19.16b, v28.16b, v2.16b
    neg         v23.16b, v21.16b
    uxtl        v28.8h, v16.8b
    smin        v4.16b, v4.16b, v21.16b
    uxtl2       v21.8h, v16.16b
    smax        v4.16b, v4.16b, v23.16b
    uxtl        v22.8h, v0.8b
    uxtl2       v24.8h, v0.16b
    saddw       v28.8h, v28.8h, v4.8b
    saddw2      v21.8h, v21.8h, v4.16b
    ssubw       v22.8h, v22.8h, v4.8b
    ssubw2      v24.8h, v24.8h, v4.16b
    sqxtun      v16.8b, v28.8h
    sqxtun2     v16.16b, v21.8h
    sqxtun      v0.8b, v22.8h
    sqxtun2     v0.16b, v24.8h
.endm

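// void x264_deblock_v_luma_neon( uint8_t *pix, intptr_t stride,
//                                int alpha, int beta, int8_t *tc0 )
// (assumed prototype, mirroring x264's C declarations for 8-bit builds;
//  the horizontal variant below takes the same arguments)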
function x264_deblock_v_luma_neon, export=1
    h264_loop_filter_start

    ld1         {v0.16b}, [x0], x1         // q0
    ld1         {v2.16b}, [x0], x1         // q1
    ld1         {v4.16b}, [x0], x1         // q2
    sub         x0, x0, x1, lsl #2
    sub         x0, x0, x1, lsl #1         // x0 = pix - 3*stride (p2 row)
    ld1         {v20.16b}, [x0], x1        // p2
    ld1         {v18.16b}, [x0], x1        // p1
    ld1         {v16.16b}, [x0], x1        // p0

    h264_loop_filter_luma

    sub         x0, x0, x1, lsl #1
    st1         {v17.16b}, [x0], x1        // p1'
    st1         {v16.16b}, [x0], x1        // p0'
    st1         {v0.16b},  [x0], x1        // q0'
    st1         {v19.16b}, [x0]            // q1'

    ret
endfunc

function x264_deblock_h_luma_neon, export=1
    h264_loop_filter_start

    sub         x0, x0, #4
    ld1         {v6.8b},  [x0], x1
    ld1         {v20.8b}, [x0], x1
    ld1         {v18.8b}, [x0], x1
    ld1         {v16.8b}, [x0], x1
    ld1         {v0.8b},  [x0], x1
    ld1         {v2.8b},  [x0], x1
    ld1         {v4.8b},  [x0], x1
    ld1         {v26.8b}, [x0], x1
    ld1         {v6.d}[1],  [x0], x1
    ld1         {v20.d}[1], [x0], x1
    ld1         {v18.d}[1], [x0], x1
    ld1         {v16.d}[1], [x0], x1
    ld1         {v0.d}[1],  [x0], x1
    ld1         {v2.d}[1],  [x0], x1
    ld1         {v4.d}[1],  [x0], x1
    ld1         {v26.d}[1], [x0], x1

    // rows -> columns: v6,v20,v18,v16,v0,v2,v4,v26 = p3,p2,p1,p0,q0,q1,q2,q3
    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

    h264_loop_filter_luma

    // p1',p0',q0',q1' columns back to 4-byte rows
    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27

    sub         x0, x0, x1, lsl #4
    add         x0, x0, #2
    st1         {v17.s}[0], [x0], x1
    st1         {v16.s}[0], [x0], x1
    st1         {v0.s}[0],  [x0], x1
    st1         {v19.s}[0], [x0], x1
    st1         {v17.s}[1], [x0], x1
    st1         {v16.s}[1], [x0], x1
    st1         {v0.s}[1],  [x0], x1
    st1         {v19.s}[1], [x0], x1
    st1         {v17.s}[2], [x0], x1
    st1         {v16.s}[2], [x0], x1
    st1         {v0.s}[2],  [x0], x1
    st1         {v19.s}[2], [x0], x1
    st1         {v17.s}[3], [x0], x1
    st1         {v16.s}[3], [x0], x1
    st1         {v0.s}[3],  [x0], x1
    st1         {v19.s}[3], [x0], x1

    ret
endfunc

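// Intra (bS == 4) entry points take no tc0:
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta  (assumed, as above).
// h264_loop_filter_start_intra makes the caller return early when alpha
// and beta are both zero, otherwise it broadcasts alpha/beta to v30/v31.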
.macro h264_loop_filter_start_intra
    orr         w4, w2, w3
    cmp         w4, #0
    b.ne        1f
    ret
1:
    dup         v30.16b, w2                // alpha
    dup         v31.16b, w3                // beta
.endm

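// h264_loop_filter_luma_intra: strong (bS == 4) luma filter.
// Register layout assumed by the callers below:
//   p3..p0 in v4..v7, q0..q3 in v0..v3; results are written back in place.
// If no pixel of the edge passes the alpha/beta tests, the macro branches
// to label 9: in the calling function and the edge is left untouched.
// Per the spec, the 4/5-tap filters (the *_2 values) are used where
// |p0-q0| < (alpha>>2)+2 and |p2-p0| (resp. |q2-q0|) < beta; otherwise
// p0/q0 fall back to p0'_1 = (2*p1 + p0 + q1 + 2) >> 2 (and likewise q0'_1).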
.macro h264_loop_filter_luma_intra
    uabd        v16.16b, v7.16b, v0.16b    // abs(p0 - q0)
    uabd        v17.16b, v6.16b, v7.16b    // abs(p1 - p0)
    uabd        v18.16b, v1.16b, v0.16b    // abs(q1 - q0)
    cmhi        v19.16b, v30.16b, v16.16b  // < alpha
    cmhi        v17.16b, v31.16b, v17.16b  // < beta
    cmhi        v18.16b, v31.16b, v18.16b  // < beta

    movi        v29.16b, #2
    ushr        v30.16b, v30.16b, #2       // alpha >> 2
    add         v30.16b, v30.16b, v29.16b  // (alpha >> 2) + 2
    cmhi        v16.16b, v30.16b, v16.16b  // < (alpha >> 2) + 2

    and         v19.16b, v19.16b, v17.16b
    and         v19.16b, v19.16b, v18.16b
    shrn        v20.8b, v19.8h, #4
    mov         x4, v20.d[0]
    cbz         x4, 9f

    ushll       v20.8h, v6.8b, #1
    ushll       v22.8h, v1.8b, #1
    ushll2      v21.8h, v6.16b, #1
    ushll2      v23.8h, v1.16b, #1
    uaddw       v20.8h, v20.8h, v7.8b
    uaddw       v22.8h, v22.8h, v0.8b
    uaddw2      v21.8h, v21.8h, v7.16b
    uaddw2      v23.8h, v23.8h, v0.16b
    uaddw       v20.8h, v20.8h, v1.8b
    uaddw       v22.8h, v22.8h, v6.8b
    uaddw2      v21.8h, v21.8h, v1.16b
    uaddw2      v23.8h, v23.8h, v6.16b

    rshrn       v24.8b, v20.8h, #2         // p0'_1
    rshrn       v25.8b, v22.8h, #2         // q0'_1
    rshrn2      v24.16b, v21.8h, #2        // p0'_1
    rshrn2      v25.16b, v23.8h, #2        // q0'_1

    uabd        v17.16b, v5.16b, v7.16b    // abs(p2 - p0)
    uabd        v18.16b, v2.16b, v0.16b    // abs(q2 - q0)
    cmhi        v17.16b, v31.16b, v17.16b  // < beta
    cmhi        v18.16b, v31.16b, v18.16b  // < beta

    and         v17.16b, v16.16b, v17.16b  // if_2 && if_3
    and         v18.16b, v16.16b, v18.16b  // if_2 && if_4

    not         v30.16b, v17.16b
    not         v31.16b, v18.16b

    and         v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
    and         v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

    and         v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
    and         v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

    //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
    uaddl       v26.8h, v5.8b, v7.8b
    uaddl2      v27.8h, v5.16b, v7.16b
    uaddw       v26.8h, v26.8h, v0.8b
    uaddw2      v27.8h, v27.8h, v0.16b
    add         v20.8h, v20.8h, v26.8h
    add         v21.8h, v21.8h, v27.8h
    uaddw       v20.8h, v20.8h, v0.8b
    uaddw2      v21.8h, v21.8h, v0.16b
    rshrn       v20.8b, v20.8h, #3         // p0'_2
    rshrn2      v20.16b, v21.8h, #3        // p0'_2
    uaddw       v26.8h, v26.8h, v6.8b
    uaddw2      v27.8h, v27.8h, v6.16b
    rshrn       v21.8b, v26.8h, #2         // p1'_2
    rshrn2      v21.16b, v27.8h, #2        // p1'_2
    uaddl       v28.8h, v4.8b, v5.8b
    uaddl2      v29.8h, v4.16b, v5.16b
    shl         v28.8h, v28.8h, #1
    shl         v29.8h, v29.8h, #1
    add         v28.8h, v28.8h, v26.8h
    add         v29.8h, v29.8h, v27.8h
    rshrn       v19.8b, v28.8h, #3         // p2'_2
    rshrn2      v19.16b, v29.8h, #3        // p2'_2

    //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
    uaddl       v26.8h, v2.8b, v0.8b
    uaddl2      v27.8h, v2.16b, v0.16b
    uaddw       v26.8h, v26.8h, v7.8b
    uaddw2      v27.8h, v27.8h, v7.16b
    add         v22.8h, v22.8h, v26.8h
    add         v23.8h, v23.8h, v27.8h
    uaddw       v22.8h, v22.8h, v7.8b
    uaddw2      v23.8h, v23.8h, v7.16b
    rshrn       v22.8b, v22.8h, #3         // q0'_2
    rshrn2      v22.16b, v23.8h, #3        // q0'_2
    uaddw       v26.8h, v26.8h, v1.8b
    uaddw2      v27.8h, v27.8h, v1.16b
    rshrn       v23.8b, v26.8h, #2         // q1'_2
    rshrn2      v23.16b, v27.8h, #2        // q1'_2
    uaddl       v28.8h, v2.8b, v3.8b
    uaddl2      v29.8h, v2.16b, v3.16b
    shl         v28.8h, v28.8h, #1
    shl         v29.8h, v29.8h, #1
    add         v28.8h, v28.8h, v26.8h
    add         v29.8h, v29.8h, v27.8h
    rshrn       v26.8b, v28.8h, #3         // q2'_2
    rshrn2      v26.16b, v29.8h, #3        // q2'_2

    bit         v7.16b, v24.16b, v30.16b   // p0'_1
    bit         v0.16b, v25.16b, v31.16b   // q0'_1
    bit         v7.16b, v20.16b, v17.16b   // p0'_2
    bit         v6.16b, v21.16b, v17.16b   // p1'_2
    bit         v5.16b, v19.16b, v17.16b   // p2'_2
    bit         v0.16b, v22.16b, v18.16b   // q0'_2
    bit         v1.16b, v23.16b, v18.16b   // q1'_2
    bit         v2.16b, v26.16b, v18.16b   // q2'_2
.endm

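// void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride,
//                                      int alpha, int beta )
// (assumed prototype, mirroring x264's C declarations; the horizontal
//  variant below takes the same arguments)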
function x264_deblock_v_luma_intra_neon, export=1
    h264_loop_filter_start_intra

    ld1         {v0.16b}, [x0], x1         // q0
    ld1         {v1.16b}, [x0], x1         // q1
    ld1         {v2.16b}, [x0], x1         // q2
    ld1         {v3.16b}, [x0], x1         // q3
    sub         x0, x0, x1, lsl #3
    ld1         {v4.16b}, [x0], x1         // p3
    ld1         {v5.16b}, [x0], x1         // p2
    ld1         {v6.16b}, [x0], x1         // p1
    ld1         {v7.16b}, [x0]             // p0

    h264_loop_filter_luma_intra

    sub         x0, x0, x1, lsl #1
    st1         {v5.16b}, [x0], x1         // p2
    st1         {v6.16b}, [x0], x1         // p1
    st1         {v7.16b}, [x0], x1         // p0
    st1         {v0.16b}, [x0], x1         // q0
    st1         {v1.16b}, [x0], x1         // q1
    st1         {v2.16b}, [x0]             // q2
9:
    ret
endfunc

function x264_deblock_h_luma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub         x0, x0, #4
    ld1         {v4.8b}, [x0], x1
    ld1         {v5.8b}, [x0], x1
    ld1         {v6.8b}, [x0], x1
    ld1         {v7.8b}, [x0], x1
    ld1         {v0.8b}, [x0], x1
    ld1         {v1.8b}, [x0], x1
    ld1         {v2.8b}, [x0], x1
    ld1         {v3.8b}, [x0], x1
    ld1         {v4.d}[1], [x0], x1
    ld1         {v5.d}[1], [x0], x1
    ld1         {v6.d}[1], [x0], x1
    ld1         {v7.d}[1], [x0], x1
    ld1         {v0.d}[1], [x0], x1
    ld1         {v1.d}[1], [x0], x1
    ld1         {v2.d}[1], [x0], x1
    ld1         {v3.d}[1], [x0], x1

    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    h264_loop_filter_luma_intra

    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    sub         x0, x0, x1, lsl #4
    st1         {v4.8b}, [x0], x1
    st1         {v5.8b}, [x0], x1
    st1         {v6.8b}, [x0], x1
    st1         {v7.8b}, [x0], x1
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1
    st1         {v4.d}[1], [x0], x1
    st1         {v5.d}[1], [x0], x1
    st1         {v6.d}[1], [x0], x1
    st1         {v7.d}[1], [x0], x1
    st1         {v0.d}[1], [x0], x1
    st1         {v1.d}[1], [x0], x1
    st1         {v2.d}[1], [x0], x1
    st1         {v3.d}[1], [x0], x1
9:
    ret
endfunc

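// h264_loop_filter_chroma: normal (bS < 4) chroma filter on full q registers
// (16 bytes per row of interleaved chroma).  Register layout assumed by the
// callers below:
//   p1 = v18, p0 = v16, q0 = v0, q1 = v2, tc0 in v24;
//   results p0' in v16 and q0' in v0.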
.macro h264_loop_filter_chroma
    dup         v22.16b, w2                // alpha
    uxtl        v24.8h, v24.8b
    uabd        v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
    uxtl        v4.8h, v0.8b
    uxtl2       v5.8h, v0.16b
    uabd        v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
    usubw       v4.8h, v4.8h, v16.8b
    usubw2      v5.8h, v5.8h, v16.16b
    sli         v24.8h, v24.8h, #8
    shl         v4.8h, v4.8h, #2
    shl         v5.8h, v5.8h, #2
    uabd        v30.16b, v2.16b, v0.16b    // abs(q1 - q0)
    uxtl        v24.4s, v24.4h
    uaddw       v4.8h, v4.8h, v18.8b
    uaddw2      v5.8h, v5.8h, v18.16b
    cmhi        v26.16b, v22.16b, v26.16b  // < alpha
    usubw       v4.8h, v4.8h, v2.8b
    usubw2      v5.8h, v5.8h, v2.16b
    sli         v24.4s, v24.4s, #16
    dup         v22.16b, w3                // beta
    rshrn       v4.8b, v4.8h, #3
    rshrn2      v4.16b, v5.8h, #3
    cmhi        v28.16b, v22.16b, v28.16b  // < beta
    cmhi        v30.16b, v22.16b, v30.16b  // < beta
    smin        v4.16b, v4.16b, v24.16b
    neg         v25.16b, v24.16b
    and         v26.16b, v26.16b, v28.16b
    smax        v4.16b, v4.16b, v25.16b
    and         v26.16b, v26.16b, v30.16b
    uxtl        v22.8h, v0.8b
    uxtl2       v23.8h, v0.16b
    and         v4.16b, v4.16b, v26.16b
    uxtl        v28.8h, v16.8b
    uxtl2       v29.8h, v16.16b
    saddw       v28.8h, v28.8h, v4.8b
    saddw2      v29.8h, v29.8h, v4.16b
    ssubw       v22.8h, v22.8h, v4.8b
    ssubw2      v23.8h, v23.8h, v4.16b
    sqxtun      v16.8b, v28.8h
    sqxtun      v0.8b, v22.8h
    sqxtun2     v16.16b, v29.8h
    sqxtun2     v0.16b, v23.8h
.endm

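// void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride,
//                                  int alpha, int beta, int8_t *tc0 )
// (assumed prototype, mirroring x264's C declarations)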
function x264_deblock_v_chroma_neon, export=1
    h264_loop_filter_start

    sub         x0, x0, x1, lsl #1
    ld1         {v18.16b}, [x0], x1        // p1
    ld1         {v16.16b}, [x0], x1        // p0
    ld1         {v0.16b},  [x0], x1        // q0
    ld1         {v2.16b},  [x0]            // q1

    h264_loop_filter_chroma

    sub         x0, x0, x1, lsl #1
    st1         {v16.16b}, [x0], x1        // p0'
    st1         {v0.16b},  [x0], x1        // q0'

    ret
endfunc

function x264_deblock_h_chroma_neon, export=1
    h264_loop_filter_start

    sub         x0, x0, #4
deblock_h_chroma:
    ld1         {v18.d}[0], [x0], x1
    ld1         {v16.d}[0], [x0], x1
    ld1         {v0.d}[0],  [x0], x1
    ld1         {v2.d}[0],  [x0], x1
    ld1         {v18.d}[1], [x0], x1
    ld1         {v16.d}[1], [x0], x1
    ld1         {v0.d}[1],  [x0], x1
    ld1         {v2.d}[1],  [x0], x1

    // after the transpose: v18,v16,v0,v2 = p1,p0,q0,q1 (16-bit chroma pairs)
    transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31

    h264_loop_filter_chroma

    transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31

    sub         x0, x0, x1, lsl #3
    st1         {v18.d}[0], [x0], x1
    st1         {v16.d}[0], [x0], x1
    st1         {v0.d}[0],  [x0], x1
    st1         {v2.d}[0],  [x0], x1
    st1         {v18.d}[1], [x0], x1
    st1         {v16.d}[1], [x0], x1
    st1         {v0.d}[1],  [x0], x1
    st1         {v2.d}[1],  [x0], x1

    ret
endfunc

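// A horizontal 4:2:2 chroma edge is 16 rows tall; it is filtered as two
// interleaved passes over alternate rows: the stride is doubled, the first
// pass starts at pix and the second at pix + stride.  x7 preserves the
// return address across the first call and v24 is reloaded with tc0 (still
// held in w6 by h264_loop_filter_start) before the second pass.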
function x264_deblock_h_chroma_422_neon, export=1
    add         x5, x0, x1
    sub         x0, x0, #4
    add         x1, x1, x1
    h264_loop_filter_start
    mov         x7, x30
    bl          deblock_h_chroma
    mov         x30, x7
    sub         x0, x5, #4
    mov         v24.s[0], w6
    b           deblock_h_chroma
endfunc

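// h264_loop_filter_chroma8: same filter as h264_loop_filter_chroma but on
// 8-byte d registers only, for the mbaff edge below which covers just four
// rows.  Layout: p1 = v18, p0 = v16, q0 = v17, q1 = v19; results p0' in v16
// and q0' in v17.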
.macro h264_loop_filter_chroma8
    dup         v22.8b, w2                 // alpha
    uxtl        v24.8h, v24.8b
    uabd        v26.8b, v16.8b, v17.8b     // abs(p0 - q0)
    uxtl        v4.8h, v17.8b
    uabd        v28.8b, v18.8b, v16.8b     // abs(p1 - p0)
    usubw       v4.8h, v4.8h, v16.8b
    sli         v24.8h, v24.8h, #8
    shl         v4.8h, v4.8h, #2
    uabd        v30.8b, v19.8b, v17.8b     // abs(q1 - q0)
    uaddw       v4.8h, v4.8h, v18.8b
    cmhi        v26.8b, v22.8b, v26.8b     // < alpha
    usubw       v4.8h, v4.8h, v19.8b
    dup         v22.8b, w3                 // beta
    rshrn       v4.8b, v4.8h, #3
    cmhi        v28.8b, v22.8b, v28.8b     // < beta
    cmhi        v30.8b, v22.8b, v30.8b     // < beta
    smin        v4.8b, v4.8b, v24.8b
    neg         v25.8b, v24.8b
    and         v26.8b, v26.8b, v28.8b
    smax        v4.8b, v4.8b, v25.8b
    and         v26.8b, v26.8b, v30.8b
    uxtl        v22.8h, v17.8b
    and         v4.8b, v4.8b, v26.8b
    uxtl        v28.8h, v16.8b
    saddw       v28.8h, v28.8h, v4.8b
    ssubw       v22.8h, v22.8h, v4.8b
    sqxtun      v16.8b, v28.8h
    sqxtun      v17.8b, v22.8h
.endm

function x264_deblock_h_chroma_mbaff_neon, export=1
    h264_loop_filter_start

    sub         x4, x0, #4
    sub         x0, x0, #2

    ld1         {v18.8b}, [x4], x1
    ld1         {v16.8b}, [x4], x1
    ld1         {v17.8b}, [x4], x1
    ld1         {v19.8b}, [x4]

    transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31

    h264_loop_filter_chroma8

    st2         {v16.h,v17.h}[0], [x0], x1
    st2         {v16.h,v17.h}[1], [x0], x1
    st2         {v16.h,v17.h}[2], [x0], x1
    st2         {v16.h,v17.h}[3], [x0]

    ret
endfunc

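// h264_loop_filter_chroma_intra: intra (bS == 4) chroma filter,
//   p0' = (2*p1 + p0 + q1 + 2) >> 2,  q0' = (2*q1 + q0 + p1 + 2) >> 2,
// applied where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
// Layout: p1 = v18, p0 = v16, q0 = v17, q1 = v19; width=8 restricts the
// widening arithmetic to the low 8 bytes for the mbaff path.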
.macro h264_loop_filter_chroma_intra width=16
    uabd        v26.16b, v16.16b, v17.16b  // abs(p0 - q0)
    uabd        v27.16b, v18.16b, v16.16b  // abs(p1 - p0)
    uabd        v28.16b, v19.16b, v17.16b  // abs(q1 - q0)
    cmhi        v26.16b, v30.16b, v26.16b  // < alpha
    cmhi        v27.16b, v31.16b, v27.16b  // < beta
    cmhi        v28.16b, v31.16b, v28.16b  // < beta
    and         v26.16b, v26.16b, v27.16b
    and         v26.16b, v26.16b, v28.16b

    ushll       v4.8h, v18.8b, #1
    ushll       v6.8h, v19.8b, #1
.ifc \width, 16
    ushll2      v5.8h, v18.16b, #1
    ushll2      v7.8h, v19.16b, #1
    uaddl2      v21.8h, v16.16b, v19.16b
    uaddl2      v23.8h, v17.16b, v18.16b
.endif
    uaddl       v20.8h, v16.8b, v19.8b
    uaddl       v22.8h, v17.8b, v18.8b
    add         v20.8h, v20.8h, v4.8h      // mlal?
    add         v22.8h, v22.8h, v6.8h
.ifc \width, 16
    add         v21.8h, v21.8h, v5.8h
    add         v23.8h, v23.8h, v7.8h
.endif
    uqrshrn     v24.8b, v20.8h, #2
    uqrshrn     v25.8b, v22.8h, #2
.ifc \width, 16
    uqrshrn2    v24.16b, v21.8h, #2
    uqrshrn2    v25.16b, v23.8h, #2
.endif
    bit         v16.16b, v24.16b, v26.16b
    bit         v17.16b, v25.16b, v26.16b
.endm

function x264_deblock_v_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub         x0, x0, x1, lsl #1
    ld1         {v18.16b}, [x0], x1
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x0], x1
    ld1         {v19.16b}, [x0]

    h264_loop_filter_chroma_intra

    sub         x0, x0, x1, lsl #1
    st1         {v16.16b}, [x0], x1
    st1         {v17.16b}, [x0], x1

    ret
endfunc

function x264_deblock_h_chroma_intra_mbaff_neon, export=1
    h264_loop_filter_start_intra

    sub         x4, x0, #4
    sub         x0, x0, #2
    ld1         {v18.8b}, [x4], x1
    ld1         {v16.8b}, [x4], x1
    ld1         {v17.8b}, [x4], x1
    ld1         {v19.8b}, [x4], x1

    transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra width=8

    st2         {v16.h,v17.h}[0], [x0], x1
    st2         {v16.h,v17.h}[1], [x0], x1
    st2         {v16.h,v17.h}[2], [x0], x1
    st2         {v16.h,v17.h}[3], [x0], x1

    ret
endfunc

function x264_deblock_h_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub         x4, x0, #4
    sub         x0, x0, #2
    ld1         {v18.d}[0], [x4], x1
    ld1         {v16.d}[0], [x4], x1
    ld1         {v17.d}[0], [x4], x1
    ld1         {v19.d}[0], [x4], x1
    ld1         {v18.d}[1], [x4], x1
    ld1         {v16.d}[1], [x4], x1
    ld1         {v17.d}[1], [x4], x1
    ld1         {v19.d}[1], [x4], x1

    transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2         {v16.h,v17.h}[0], [x0], x1
    st2         {v16.h,v17.h}[1], [x0], x1
    st2         {v16.h,v17.h}[2], [x0], x1
    st2         {v16.h,v17.h}[3], [x0], x1
    st2         {v16.h,v17.h}[4], [x0], x1
    st2         {v16.h,v17.h}[5], [x0], x1
    st2         {v16.h,v17.h}[6], [x0], x1
    st2         {v16.h,v17.h}[7], [x0], x1

    ret
endfunc

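// As in the non-intra 4:2:2 case, a horizontal 4:2:2 intra chroma edge is
// 16 rows tall; here the 8-row body is simply run twice over consecutive
// halves (x4 keeps advancing between the two batches of loads).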
function x264_deblock_h_chroma_422_intra_neon, export=1
    h264_loop_filter_start_intra

    sub         x4, x0, #4
    sub         x0, x0, #2
    ld1         {v18.d}[0], [x4], x1
    ld1         {v16.d}[0], [x4], x1
    ld1         {v17.d}[0], [x4], x1
    ld1         {v19.d}[0], [x4], x1
    ld1         {v18.d}[1], [x4], x1
    ld1         {v16.d}[1], [x4], x1
    ld1         {v17.d}[1], [x4], x1
    ld1         {v19.d}[1], [x4], x1

    transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2         {v16.h,v17.h}[0], [x0], x1
    st2         {v16.h,v17.h}[1], [x0], x1
    st2         {v16.h,v17.h}[2], [x0], x1
    st2         {v16.h,v17.h}[3], [x0], x1
    st2         {v16.h,v17.h}[4], [x0], x1
    st2         {v16.h,v17.h}[5], [x0], x1
    st2         {v16.h,v17.h}[6], [x0], x1
    st2         {v16.h,v17.h}[7], [x0], x1

    ld1         {v18.d}[0], [x4], x1
    ld1         {v16.d}[0], [x4], x1
    ld1         {v17.d}[0], [x4], x1
    ld1         {v19.d}[0], [x4], x1
    ld1         {v18.d}[1], [x4], x1
    ld1         {v16.d}[1], [x4], x1
    ld1         {v17.d}[1], [x4], x1
    ld1         {v19.d}[1], [x4], x1

    transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2         {v16.h,v17.h}[0], [x0], x1
    st2         {v16.h,v17.h}[1], [x0], x1
    st2         {v16.h,v17.h}[2], [x0], x1
    st2         {v16.h,v17.h}[3], [x0], x1
    st2         {v16.h,v17.h}[4], [x0], x1
    st2         {v16.h,v17.h}[5], [x0], x1
    st2         {v16.h,v17.h}[6], [x0], x1
    st2         {v16.h,v17.h}[7], [x0], x1

    ret
endfunc

//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
//                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
//                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
//                                uint8_t bs[2][8][4], int mvy_limit,
//                                int bframe )
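// Rough sketch of the per-edge rule being vectorized here (hedged summary;
// see the H.264 spec and x264's C fallback for the authoritative version):
//   bs = 2  if either neighbouring 4x4 block has non-zero coefficients (nnz)
//   bs = 1  if the blocks use different reference frames, or any mv
//           component differs by >= 4 quarter-pels horizontally or
//           >= mvy_limit vertically; when bframe is set, list 1 refs/mvs
//           are checked the same way
//   bs = 0  otherwise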
function x264_deblock_strength_neon, export=1
    movi        v4.16b, #0
    lsl         w4, w4, #8
    add         x3, x3, #32
    sub         w4, w4, #(1<<8)-3
    movi        v5.16b, #0
    dup         v6.8h, w4
    mov         x6, #-32

bframe:
    // load bytes ref
    add         x2, x2, #16
    ld1         {v31.d}[1], [x1], #8
    ld1         {v1.16b}, [x1], #16
    movi        v0.16b, #0
    ld1         {v2.16b}, [x1], #16
    ext         v3.16b, v0.16b, v1.16b, #15
    ext         v0.16b, v0.16b, v2.16b, #15
    unzip       v21.4s, v22.4s, v1.4s, v2.4s
    unzip       v23.4s, v20.4s, v3.4s, v0.4s
    ext         v21.16b, v31.16b, v22.16b, #12

    eor         v0.16b, v20.16b, v22.16b
    eor         v1.16b, v21.16b, v22.16b
    orr         v4.16b, v4.16b, v0.16b
    orr         v5.16b, v5.16b, v1.16b

    ld1         {v21.8h}, [x2], #16        // mv + 0x10
    ld1         {v19.8h}, [x2], #16        // mv + 0x20
    ld1         {v22.8h}, [x2], #16        // mv + 0x30
    ld1         {v18.8h}, [x2], #16        // mv + 0x40
    ld1         {v23.8h}, [x2], #16        // mv + 0x50
    ext         v19.16b, v19.16b, v22.16b, #12
    ext         v18.16b, v18.16b, v23.16b, #12
    sabd        v0.8h, v22.8h, v19.8h
    ld1         {v19.8h}, [x2], #16        // mv + 0x60
    sabd        v1.8h, v23.8h, v18.8h
    ld1         {v24.8h}, [x2], #16        // mv + 0x70
    uqxtn       v0.8b, v0.8h
    ld1         {v18.8h}, [x2], #16        // mv + 0x80
    ld1         {v25.8h}, [x2], #16        // mv + 0x90
    uqxtn2      v0.16b, v1.8h
    ext         v19.16b, v19.16b, v24.16b, #12
    ext         v18.16b, v18.16b, v25.16b, #12
    sabd        v1.8h, v24.8h, v19.8h
    sabd        v2.8h, v25.8h, v18.8h
    uqxtn       v1.8b, v1.8h
    uqxtn2      v1.16b, v2.8h

    uqsub       v0.16b, v0.16b, v6.16b
    uqsub       v1.16b, v1.16b, v6.16b
    uqxtn       v0.8b, v0.8h
    uqxtn2      v0.16b, v1.8h

    sabd        v1.8h, v22.8h, v23.8h
    orr         v4.16b, v4.16b, v0.16b

    sabd        v0.8h, v21.8h, v22.8h
    sabd        v2.8h, v23.8h, v24.8h
    sabd        v3.8h, v24.8h, v25.8h
    uqxtn       v0.8b, v0.8h
    uqxtn2      v0.16b, v1.8h
    uqxtn       v1.8b, v2.8h
    uqxtn2      v1.16b, v3.8h

    uqsub       v0.16b, v0.16b, v6.16b
    uqsub       v1.16b, v1.16b, v6.16b
    uqxtn       v0.8b, v0.8h
    uqxtn2      v0.16b, v1.8h
    subs        w5, w5, #1
    orr         v5.16b, v5.16b, v0.16b
    b.eq        bframe

    movi        v6.16b, #1
    // load bytes nnz
    ld1         {v31.d}[1], [x0], #8
    ld1         {v1.16b}, [x0], #16
    movi        v0.16b, #0
    ld1         {v2.16b}, [x0], #16
    ext         v3.16b, v0.16b, v1.16b, #15
    ext         v0.16b, v0.16b, v2.16b, #15
    unzip       v21.4s, v22.4s, v1.4s, v2.4s
    unzip       v23.4s, v20.4s, v3.4s, v0.4s
    ext         v21.16b, v31.16b, v22.16b, #12

    movrel      x7, transpose_table
    ld1         {v7.16b}, [x7]
    orr         v0.16b, v20.16b, v22.16b
    orr         v1.16b, v21.16b, v22.16b
    umin        v0.16b, v0.16b, v6.16b
    umin        v1.16b, v1.16b, v6.16b
    umin        v4.16b, v4.16b, v6.16b     // mv ? 1 : 0
    umin        v5.16b, v5.16b, v6.16b
    add         v0.16b, v0.16b, v0.16b     // nnz ? 2 : 0
    add         v1.16b, v1.16b, v1.16b
    umax        v4.16b, v4.16b, v0.16b
    umax        v5.16b, v5.16b, v1.16b
    tbl         v6.16b, {v4.16b}, v7.16b
    st1         {v5.16b}, [x3], x6         // bs[1]
    st1         {v6.16b}, [x3]             // bs[0]
    ret
endfunc

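// tbl pattern used above: a 4x4 byte transpose (every fourth byte), applied
// to v4 so that bs[0] is stored in the element order expected by the
// bs[2][8][4] array.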
const transpose_table
    .byte 0, 4, 8, 12
    .byte 1, 5, 9, 13
    .byte 2, 6, 10, 14
    .byte 3, 7, 11, 15
endconst