/*****************************************************************************
 * pixel.S: aarch64 pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2016 x264 project
 *
 * Authors: David Conrad <[email protected]>
 *          Janne Grunau <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/
#include "asm.S"

// 16 bytes of all-ones followed by 16 bytes of zero; indexed at a byte
// offset to build a lane mask for a partial-width result (see ssim_end4).
const mask
.rept 16
    .byte 0xff
.endr
.rept 16
    .byte 0x00
.endr
endconst

// Per-lane masks that zero out the DC coefficient(s) of a hadamard
// transform: first row masks lane 0 of each 4-element group (ac4),
// second row masks only lane 0 (ac8).
const mask_ac_4_8
    .short 0, -1, -1, -1,  0, -1, -1, -1
    .short 0, -1, -1, -1, -1, -1, -1, -1
endconst
// SAD helpers: x0/x1 = first pixel pointer/stride, x2/x3 = second
// pixel pointer/stride.  SAD_START_* initializes the 16-bit
// accumulator(s) with uabdl; SAD_* keeps accumulating with uabal.

// Two 4-pixel rows packed into one 8-byte vector each.
.macro SAD_START_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
.endm

.macro SAD_4
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
.endm

// Two 8-pixel rows; independent accumulators v16/v17 to shorten the
// dependency chain.
.macro SAD_START_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabdl       v16.8h, v0.8b, v1.8b
    uabdl       v17.8h, v2.8b, v3.8b
.endm

.macro SAD_8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    uabal       v16.8h, v0.8b, v1.8b
    uabal       v17.8h, v2.8b, v3.8b
.endm

// Two 16-pixel rows; low byte halves accumulate into v16, high halves
// into v17.
.macro SAD_START_16
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    uabdl       v16.8h, v0.8b,  v1.8b
    uabdl2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b,  v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.endm

.macro SAD_16
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    uabal       v16.8h, v0.8b,  v1.8b
    uabal2      v17.8h, v0.16b, v1.16b
    uabal       v16.8h, v2.8b,  v3.8b
    uabal2      v17.8h, v2.16b, v3.16b
.endm
// Emit x264_pixel_sad[\name]_\w x \h _neon: sum of absolute differences
// over a \w x \h block, returned in w0.
.macro SAD_FUNC w, h, name
function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w

.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 4
    add         v16.8h, v16.8h, v17.8h  // merge the two accumulators
.endif
    uaddlv      s0, v16.8h              // horizontal sum of all lanes
    fmov        w0, s0
    ret
endfunc
.endm

SAD_FUNC  4,  4
SAD_FUNC  4,  8
SAD_FUNC  4,  16
SAD_FUNC  8,  4
SAD_FUNC  8,  8
SAD_FUNC  8,  16
SAD_FUNC  16, 8
SAD_FUNC  16, 16
// SAD_X helpers: compare one source block (x0, stride x7) against 3 or
// 4 candidate blocks (x1..x4, common stride x5) simultaneously.
// \first is uabdl on the first call (initializes the accumulators) and
// defaults to uabal afterwards.

.macro SAD_X_4 x, first=uabal
    ld1         {v0.s}[0], [x0], x7
    ld1         {v1.s}[0], [x1], x5
    ld1         {v0.s}[1], [x0], x7
    ld1         {v1.s}[1], [x1], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v2.s}[0], [x2], x5
    ld1         {v2.s}[1], [x2], x5
    \first      v17.8h, v2.8b, v0.8b
    ld1         {v3.s}[0], [x3], x5
    ld1         {v3.s}[1], [x3], x5
    \first      v18.8h, v3.8b, v0.8b
.if \x == 4
    ld1         {v4.s}[0], [x4], x5
    ld1         {v4.s}[1], [x4], x5
    \first      v19.8h, v4.8b, v0.8b
.endif
.endm

.macro SAD_X_8 x, first=uabal
    ld1         {v0.8b}, [x0], x7
    ld1         {v1.8b}, [x1], x5
    \first      v16.8h, v1.8b, v0.8b
    ld1         {v2.8b}, [x2], x5
    ld1         {v5.8b}, [x0], x7       // second source row
    \first      v17.8h, v2.8b, v0.8b
    ld1         {v3.8b}, [x3], x5
    ld1         {v1.8b}, [x1], x5
    \first      v18.8h, v3.8b, v0.8b
    uabal       v16.8h, v1.8b, v5.8b
    ld1         {v2.8b}, [x2], x5
    ld1         {v3.8b}, [x3], x5
    uabal       v17.8h, v2.8b, v5.8b
    uabal       v18.8h, v3.8b, v5.8b
.if \x == 4
    ld1         {v4.8b}, [x4], x5
    \first      v19.8h, v4.8b, v0.8b
    ld1         {v4.8b}, [x4], x5
    uabal       v19.8h, v4.8b, v5.8b
.endif
.endm

.macro SAD_X_16 x, first=uabal
    ld1         {v0.16b}, [x0], x7
    ld1         {v1.16b}, [x1], x5
    \first      v16.8h, v1.8b,  v0.8b
    \first\()2  v20.8h, v1.16b, v0.16b
    ld1         {v2.16b}, [x2], x5
    ld1         {v5.16b}, [x0], x7      // second source row
    \first      v17.8h, v2.8b,  v0.8b
    \first\()2  v21.8h, v2.16b, v0.16b
    ld1         {v3.16b}, [x3], x5
    ld1         {v1.16b}, [x1], x5
    \first      v18.8h, v3.8b,  v0.8b
    \first\()2  v22.8h, v3.16b, v0.16b
    uabal       v16.8h, v1.8b,  v5.8b
    uabal2      v20.8h, v1.16b, v5.16b
    ld1         {v2.16b}, [x2], x5
    ld1         {v3.16b}, [x3], x5
    uabal       v17.8h, v2.8b,  v5.8b
    uabal2      v21.8h, v2.16b, v5.16b
    uabal       v18.8h, v3.8b,  v5.8b
    uabal2      v22.8h, v3.16b, v5.16b
.if \x == 4
    ld1         {v4.16b}, [x4], x5
    \first      v19.8h, v4.8b,  v0.8b
    \first\()2  v23.8h, v4.16b, v0.16b
    ld1         {v4.16b}, [x4], x5
    uabal       v19.8h, v4.8b,  v5.8b
    uabal2      v23.8h, v4.16b, v5.16b
.endif
.endm
// Emit x264_pixel_sad_x\x _\w x \h _neon: \x simultaneous SADs against
// candidates; the \x 32-bit sums are stored to the array pointed to by
// the last argument (moved into x6).
.macro SAD_X_FUNC x, w, h
function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    // 3-candidate variant has one argument less; shift stride/result
    // pointers into the slots the shared code expects.
    mov         x6, x5
    mov         x5, x4
.endif
    mov         x7, #FENC_STRIDE

    SAD_X_\w \x, uabdl

.rept \h / 2 - 1
    SAD_X_\w \x
.endr

.if \w > 8
    // fold the high-half accumulators into the low-half ones
    add         v16.8h, v16.8h, v20.8h
    add         v17.8h, v17.8h, v21.8h
    add         v18.8h, v18.8h, v22.8h
.if \x == 4
    add         v19.8h, v19.8h, v23.8h
.endif
.endif
// add up the sads
    uaddlv      s0, v16.8h
    uaddlv      s1, v17.8h
    uaddlv      s2, v18.8h

    stp         s0, s1, [x6], #8
.if \x == 3
    str         s2, [x6]
.else
    uaddlv      s3, v19.8h
    stp         s2, s3, [x6]
.endif
    ret
endfunc
.endm

SAD_X_FUNC  3, 4,  4
SAD_X_FUNC  3, 4,  8
SAD_X_FUNC  3, 8,  4
SAD_X_FUNC  3, 8,  8
SAD_X_FUNC  3, 8,  16
SAD_X_FUNC  3, 16, 8
SAD_X_FUNC  3, 16, 16

SAD_X_FUNC  4, 4,  4
SAD_X_FUNC  4, 4,  8
SAD_X_FUNC  4, 8,  4
SAD_X_FUNC  4, 8,  8
SAD_X_FUNC  4, 8,  16
SAD_X_FUNC  4, 16, 8
SAD_X_FUNC  4, 16, 16
// Vertical SAD: sum of absolute differences between successive 16-byte
// rows of one buffer (x0, stride x1), over w2 rows.  Result in w0.
function x264_pixel_vsad_neon, export=1
    subs        w2, w2, #2
    ld1         {v0.16b}, [x0], x1
    ld1         {v1.16b}, [x0], x1
    uabdl       v6.8h, v0.8b,  v1.8b
    uabdl2      v7.8h, v0.16b, v1.16b
    b.le        2f
1:
    // process two more rows per iteration, alternating v0/v1 so the
    // previous row is always still live
    subs        w2, w2, #2
    ld1         {v0.16b}, [x0], x1
    uabal       v6.8h, v1.8b,  v0.8b
    uabal2      v7.8h, v1.16b, v0.16b
    ld1         {v1.16b}, [x0], x1
    b.lt        2f
    uabal       v6.8h, v0.8b,  v1.8b
    uabal2      v7.8h, v0.16b, v1.16b
    b.gt        1b
2:
    add         v5.8h, v6.8h, v7.8h
    uaddlv      s0, v5.8h
    fmov        w0, s0
    ret
endfunc
// Absolute sum of (signed) differences over an 8-wide block:
// |sum(pix1 - pix2)| for w4 rows; pointers/strides in x0/x1 and x2/x3.
function x264_pixel_asd8_neon, export=1
    sub         w4, w4, #2
    ld1         {v0.8b}, [x0], x1
    ld1         {v1.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    usubl       v16.8h, v0.8b, v1.8b
1:
    subs        w4, w4, #2
    ld1         {v4.8b}, [x0], x1
    ld1         {v5.8b}, [x2], x3
    usubl       v17.8h, v2.8b, v3.8b
    usubl       v18.8h, v4.8b, v5.8b
    add         v16.8h, v16.8h, v17.8h
    ld1         {v2.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    add         v16.8h, v16.8h, v18.8h
    b.gt        1b
    usubl       v17.8h, v2.8b, v3.8b
    add         v16.8h, v16.8h, v17.8h
    saddlv      s0, v16.8h              // signed sum of differences
    abs         v0.2s, v0.2s            // absolute value taken last
    fmov        w0, s0
    ret
endfunc
// SSD helpers: accumulate squared differences into v0 (and v1 for the
// 16-wide case).  START_* loads the first rows and begins the sum,
// SSD_* handles the middle rows, END_* finishes without over-reading.

.macro SSD_START_4
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    smull       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_4
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.s}[0], [x0], x1
    ld1         {v17.s}[0], [x2], x3
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_END_4
    usubl       v2.8h, v16.8b, v17.8b
    smlal       v0.4s, v2.4h, v2.4h
.endm

.macro SSD_START_8
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x2], x3
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.8b}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    ld1         {v17.8b}, [x2], x3
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_8
    usubl       v2.8h, v16.8b, v17.8b
    ld1         {v16.8b}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    ld1         {v17.8b}, [x2], x3
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_END_8
    usubl       v2.8h, v16.8b, v17.8b
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v0.4s, v2.8h, v2.8h
.endm

.macro SSD_START_16
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x2], x3
    usubl       v2.8h, v16.8b,  v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    ld1         {v16.16b}, [x0], x1
    smull       v0.4s, v2.4h, v2.4h
    smull2      v1.4s, v2.8h, v2.8h
    ld1         {v17.16b}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
.endm

.macro SSD_16
    usubl       v2.8h, v16.8b,  v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    ld1         {v16.16b}, [x0], x1
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v1.4s, v2.8h, v2.8h
    ld1         {v17.16b}, [x2], x3
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
.endm

.macro SSD_END_16
    usubl       v2.8h, v16.8b,  v17.8b
    usubl2      v3.8h, v16.16b, v17.16b
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v1.4s, v2.8h, v2.8h
    smlal       v0.4s, v3.4h, v3.4h
    smlal2      v1.4s, v3.8h, v3.8h
    add         v0.4s, v0.4s, v1.4s
.endm
// Emit x264_pixel_ssd_\w x \h _neon: sum of squared differences over a
// \w x \h block; result in w0.
.macro SSD_FUNC w h
function x264_pixel_ssd_\w\()x\h\()_neon, export=1
    SSD_START_\w
.rept \h - 2
    SSD_\w
.endr
    SSD_END_\w

    addv        s0, v0.4s               // horizontal sum of the 4 lanes
    mov         w0, v0.s[0]
    ret
endfunc
.endm

SSD_FUNC  4,  4
SSD_FUNC  4,  8
SSD_FUNC  4,  16
SSD_FUNC  8,  4
SSD_FUNC  8,  8
SSD_FUNC  8,  16
SSD_FUNC  16, 8
SSD_FUNC  16, 16
// SSD over interleaved NV12 chroma: ld2 de-interleaves the two planes,
// which are accumulated separately (v6 = plane 0, v7 = plane 1, both
// 64-bit) and stored to *x6 / *x7 at the end.
// x0/x1 and x2/x3: pixel pointer/stride pairs; w4 = width, w5 = height.
function x264_pixel_ssd_nv12_core_neon, export=1
    sxtw        x8, w4
    add         x8, x8, #8
    and         x8, x8, #~15            // width rounded up to 16
    movi        v6.2d, #0
    movi        v7.2d, #0
    sub         x1, x1, x8, lsl #1      // stride minus bytes consumed per row
    sub         x3, x3, x8, lsl #1
1:
    subs        w8, w4, #16
    ld2         {v0.8b,v1.8b},   [x0], #16
    ld2         {v2.8b,v3.8b},   [x2], #16
    ld2         {v24.8b,v25.8b}, [x0], #16
    ld2         {v26.8b,v27.8b}, [x2], #16

    usubl       v16.8h, v0.8b,  v2.8b
    usubl       v17.8h, v1.8b,  v3.8b
    smull       v20.4s, v16.4h, v16.4h
    smull       v21.4s, v17.4h, v17.4h
    usubl       v18.8h, v24.8b, v26.8b
    usubl       v19.8h, v25.8b, v27.8b
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h

    b.lt        4f                      // width < 16: drop the extra pair
    b.eq        3f                      // width == 16: flush and next row
2:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    ld2         {v0.8b,v1.8b}, [x0], #16
    ld2         {v2.8b,v3.8b}, [x2], #16
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h

    subs        w8, w8, #16
    usubl       v16.8h, v0.8b, v2.8b
    usubl       v17.8h, v1.8b, v3.8b
    smlal       v20.4s, v16.4h, v16.4h
    smlal       v21.4s, v17.4h, v17.4h
    ld2         {v24.8b,v25.8b}, [x0], #16
    ld2         {v26.8b,v27.8b}, [x2], #16
    smlal2      v20.4s, v16.8h, v16.8h
    smlal2      v21.4s, v17.8h, v17.8h
    b.lt        4f

    usubl       v18.8h, v24.8b, v26.8b
    usubl       v19.8h, v25.8b, v27.8b
    b.gt        2b
3:
    smlal       v20.4s, v18.4h, v18.4h
    smlal       v21.4s, v19.4h, v19.4h
    smlal2      v20.4s, v18.8h, v18.8h
    smlal2      v21.4s, v19.8h, v19.8h
4:
    subs        w5, w5, #1
    uaddw       v6.2d, v6.2d, v20.2s    // widen the row sums to 64 bit
    uaddw       v7.2d, v7.2d, v21.2s
    add         x0, x0, x1
    add         x2, x2, x3
    uaddw2      v6.2d, v6.2d, v20.4s
    uaddw2      v7.2d, v7.2d, v21.4s
    b.gt        1b

    addp        v6.2d, v6.2d, v7.2d
    st1         {v6.d}[0], [x6]         // plane-0 SSD
    st1         {v6.d}[1], [x7]         // plane-1 SSD

    ret
endfunc
// Emit x264_pixel_var_8x\h _neon: accumulates the pixel sum in v0 and
// the sum of squares in v1/v2, then tail-calls x264_var_end which
// packs both into the return value.
.macro pixel_var_8 h
function x264_pixel_var_8x\h\()_neon, export=1
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x0], x1
    mov         x2, \h - 4              // 4 rows handled outside the loop
    umull       v1.8h, v16.8b, v16.8b
    uxtl        v0.8h, v16.8b
    umull       v2.8h, v17.8b, v17.8b
    uaddw       v0.8h, v0.8h, v17.8b
    ld1         {v18.8b}, [x0], x1
    uaddlp      v1.4s, v1.8h
    uaddlp      v2.4s, v2.8h
    ld1         {v19.8b}, [x0], x1

1:  subs        x2, x2, #4              // 4 rows per iteration
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v24.8h, v18.8b, v18.8b
    ld1         {v20.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v19.8b
    umull       v25.8h, v19.8b, v19.8b
    uadalp      v1.4s, v24.8h
    ld1         {v21.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v20.8b
    umull       v26.8h, v20.8b, v20.8b
    uadalp      v2.4s, v25.8h
    ld1         {v18.8b}, [x0], x1
    uaddw       v0.8h, v0.8h, v21.8b
    umull       v27.8h, v21.8b, v21.8b
    uadalp      v1.4s, v26.8h
    ld1         {v19.8b}, [x0], x1
    uadalp      v2.4s, v27.8h
    b.gt        1b

    // final two rows loaded by the last loop iteration
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v28.8h, v18.8b, v18.8b
    uaddw       v0.8h, v0.8h, v19.8b
    umull       v29.8h, v19.8b, v19.8b
    uadalp      v1.4s, v28.8h
    uadalp      v2.4s, v29.8h

    b           x264_var_end
endfunc
.endm

pixel_var_8  8
pixel_var_8  16
// 16x16 variance: same accumulator layout as pixel_var_8 (v0 = pixel
// sum, v1/v2 = sum of squares).  No ret — falls through into
// x264_var_end immediately below, which must stay adjacent.
function x264_pixel_var_16x16_neon, export=1
    ld1         {v16.16b}, [x0], x1
    ld1         {v17.16b}, [x0], x1
    mov         x2, #14                 // 2 rows before, 14 in/after the loop
    umull       v1.8h, v16.8b,  v16.8b
    umull2      v2.8h, v16.16b, v16.16b
    uxtl        v0.8h, v16.8b
    uaddlp      v1.4s, v1.8h
    uaddlp      v2.4s, v2.8h
    uaddw2      v0.8h, v0.8h, v16.16b

1:  subs        x2, x2, #2
    ld1         {v18.16b}, [x0], x1
    uaddw       v0.8h, v0.8h, v17.8b
    umull       v3.8h, v17.8b, v17.8b
    uaddw2      v0.8h, v0.8h, v17.16b
    umull2      v4.8h, v17.16b, v17.16b
    uadalp      v1.4s, v3.8h
    uadalp      v2.4s, v4.8h

    ld1         {v17.16b}, [x0], x1
    uaddw       v0.8h, v0.8h, v18.8b
    umull       v5.8h, v18.8b, v18.8b
    uaddw2      v0.8h, v0.8h, v18.16b
    umull2      v6.8h, v18.16b, v18.16b
    uadalp      v1.4s, v5.8h
    uadalp      v2.4s, v6.8h
    b.gt        1b

    uaddw       v0.8h, v0.8h, v17.8b
    umull       v3.8h, v17.8b, v17.8b
    uaddw2      v0.8h, v0.8h, v17.16b
    umull2      v4.8h, v17.16b, v17.16b
    uadalp      v1.4s, v3.8h
    uadalp      v2.4s, v4.8h
endfunc
// Shared variance tail: expects v0 = pixel sum (8h), v1+v2 = sum of
// squares (4s).  Returns sum in the low 32 bits of x0 and the sum of
// squares in the high 32 bits.
function x264_var_end
    add         v1.4s, v1.4s, v2.4s
    uaddlv      s0, v0.8h
    uaddlv      d1, v1.4s
    mov         w0, v0.s[0]
    mov         x1, v1.d[0]
    orr         x0, x0, x1, lsl #32
    ret
endfunc
// Emit x264_pixel_var2_8x\h _neon: variance of the difference of two
// 8-wide blocks (x0/x1 and x2/x3).  The sum of squared differences is
// stored to *x4; the return value is ssd - sum^2 >> shift.
.macro pixel_var2_8 h
function x264_pixel_var2_8x\h\()_neon, export=1
    ld1         {v16.8b}, [x0], x1
    ld1         {v18.8b}, [x2], x3
    ld1         {v17.8b}, [x0], x1
    ld1         {v19.8b}, [x2], x3
    mov         x5, \h - 4
    usubl       v6.8h, v16.8b, v18.8b
    usubl       v7.8h, v17.8b, v19.8b
    ld1         {v16.8b}, [x0], x1
    ld1         {v18.8b}, [x2], x3
    smull       v2.4s, v6.4h, v6.4h     // v2/v3: sum of squared diffs
    smull2      v3.4s, v6.8h, v6.8h
    add         v0.8h, v6.8h, v7.8h     // v0: sum of diffs
    smlal       v2.4s, v7.4h, v7.4h
    smlal2      v3.4s, v7.8h, v7.8h

    usubl       v6.8h, v16.8b, v18.8b

1:  subs        x5, x5, #2
    ld1         {v17.8b}, [x0], x1
    ld1         {v19.8b}, [x2], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    usubl       v7.8h, v17.8b, v19.8b
    add         v0.8h, v0.8h, v6.8h
    ld1         {v16.8b}, [x0], x1
    ld1         {v18.8b}, [x2], x3
    smlal       v2.4s, v7.4h, v7.4h
    smlal2      v3.4s, v7.8h, v7.8h
    usubl       v6.8h, v16.8b, v18.8b
    add         v0.8h, v0.8h, v7.8h
    b.gt        1b

    ld1         {v17.8b}, [x0], x1
    ld1         {v19.8b}, [x2], x3
    smlal       v2.4s, v6.4h, v6.4h
    smlal2      v3.4s, v6.8h, v6.8h
    usubl       v7.8h, v17.8b, v19.8b
    add         v0.8h, v0.8h, v6.8h
    smlal       v2.4s, v7.4h, v7.4h
    add         v0.8h, v0.8h, v7.8h
    smlal2      v3.4s, v7.8h, v7.8h

    saddlv      s0, v0.8h
    add         v2.4s, v2.4s, v3.4s
    mov         w0, v0.s[0]
    addv        s1, v2.4s
    sxtw        x0, w0
    mov         w1, v1.s[0]
    mul         x0, x0, x0              // sum^2
    str         w1, [x4]                // *ssd = sum of squared diffs
    sub         x0, x1, x0, lsr #6 + (\h >> 4)

    ret
endfunc
.endm

pixel_var2_8  8
pixel_var2_8  16
// 4x4 SATD: 4x4 hadamard transform of the difference block, summing
// absolute coefficients.  x0/x1 and x2/x3: pixel pointer/stride pairs.
function x264_pixel_satd_4x4_neon, export=1
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1

    usubl       v0.8h, v0.8b, v1.8b
    usubl       v1.8h, v2.8b, v3.8b
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h

    zip1        v0.2d, v2.2d, v3.2d
    zip2        v1.2d, v2.2d, v3.2d
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h

    trn1        v0.8h, v2.8h, v3.8h
    trn2        v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v2.8h, v3.8h, v0.8h, v1.8h

    trn1        v0.4s, v2.4s, v3.4s
    trn2        v1.4s, v2.4s, v3.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    umax        v0.8h, v0.8h, v1.8h     // |a+b| and |a-b| -> max = |a|+|b|

    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc
// 4x8 SATD: load eight 4-pixel rows packed pairwise into v0..v7, then
// branch to the shared 4x8/8x4 tail.
function x264_pixel_satd_4x8_neon, export=1
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    b           x264_satd_4x8_8x4_end_neon
endfunc
// 8x4 SATD: load four 8-pixel row pairs into v0..v7.  No ret/branch —
// deliberately falls through into x264_satd_4x8_8x4_end_neon, which
// must immediately follow this function.
function x264_pixel_satd_8x4_neon, export=1
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
endfunc
// Shared SATD tail for the 4x8 and 8x4 cases: expects the pixel rows
// in v0..v7, performs the hadamard transform and returns the sum in w0.
function x264_satd_4x8_8x4_end_neon
    usubl       v0.8h, v0.8b, v1.8b
    usubl       v1.8h, v2.8b, v3.8b
    usubl       v2.8h, v4.8b, v5.8b
    usubl       v3.8h, v6.8b, v7.8b

    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h

    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h

    trn1        v0.8h, v4.8h, v5.8h
    trn2        v1.8h, v4.8h, v5.8h
    trn1        v2.8h, v6.8h, v7.8h
    trn2        v3.8h, v6.8h, v7.8h

    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h

    trn1        v0.4s, v16.4s, v18.4s
    trn2        v1.4s, v16.4s, v18.4s
    trn1        v2.4s, v17.4s, v19.4s
    trn2        v3.4s, v17.4s, v19.4s
    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    umax        v0.8h, v0.8h, v1.8h
    umax        v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret
endfunc
// 8x8 SATD: one call to the shared 8x8 helper, then reduce the four
// partial-sum vectors it leaves in v0..v3.  x30 is saved in x4 because
// the helper is reached via bl.
function x264_pixel_satd_8x8_neon, export=1
    mov         x4, x30

    bl          x264_satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc
// 8x16 SATD: two 8x8 helper calls; partial sums are kept in v30/v31
// between the calls (the helper clobbers v0..v3).
function x264_pixel_satd_8x16_neon, export=1
    mov         x4, x30

    bl          x264_satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v0.8h, v1.8h

    bl          x264_satd_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v31.8h, v0.8h, v1.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc
// Widening butterfly: \sum = a + b, \sub = a - b (both widened).
.macro SUMSUBL_AB sum, sub, a, b
    uaddl       \sum, \a, \b
    usubl       \sub, \a, \b
.endm

// Load an 8x8 block from both inputs, compute the eight widened row
// differences in v16..v23 and start the first butterfly stage
// (v0..v3) while the last rows are still loading.
.macro load_diff_fly_8x8
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    usubl       v17.8h, v2.8b, v3.8b
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1
    usubl       v18.8h, v4.8b, v5.8b
    ld1         {v1.8b}, [x2], x3
    ld1         {v0.8b}, [x0], x1
    usubl       v19.8h, v6.8b, v7.8b
    ld1         {v3.8b}, [x2], x3
    ld1         {v2.8b}, [x0], x1
    usubl       v20.8h, v0.8b, v1.8b
    ld1         {v5.8b}, [x2], x3
    ld1         {v4.8b}, [x0], x1
    usubl       v21.8h, v2.8b, v3.8b
    ld1         {v7.8b}, [x2], x3
    ld1         {v6.8b}, [x0], x1

    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h

    usubl       v22.8h, v4.8b, v5.8b
    usubl       v23.8h, v6.8b, v7.8b
.endm
// Two butterflies in one shot.
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
    SUMSUB_AB   \s1, \d1, \a, \b
    SUMSUB_AB   \s2, \d2, \c, \d
.endm

// 4-point vertical hadamard across four row registers, via two
// butterfly stages through the \t temporaries.
.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm
// Local 8x8 SATD helper: loads/diffs the block, then deliberately
// falls through into x264_satd_8x4v_8x8h_neon below (no ret here);
// the two functions must stay adjacent.
function x264_satd_8x8_neon
    load_diff_fly_8x8
endfunc
// one vertical hadamard pass and two horizontal
// In: butterfly stage in v0..v3 plus row diffs v16..v23 (as left by
// load_diff_fly_8x8 / x264_satd_16x4_neon).  Out: four per-lane
// partial-sum vectors in v0..v3 for the caller to reduce.
function x264_satd_8x4v_8x8h_neon
    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h

    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h

    transpose   v0.8h,  v1.8h,  v16.8h, v17.8h
    transpose   v2.8h,  v3.8h,  v18.8h, v19.8h
    transpose   v4.8h,  v5.8h,  v20.8h, v21.8h
    transpose   v6.8h,  v7.8h,  v22.8h, v23.8h

    SUMSUB_AB   v16.8h, v17.8h, v0.8h,  v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h,  v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h,  v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h,  v7.8h

    transpose   v0.4s,  v2.4s,  v16.4s, v18.4s
    transpose   v1.4s,  v3.4s,  v17.4s, v19.4s
    transpose   v4.4s,  v6.4s,  v20.4s, v22.4s
    transpose   v5.4s,  v7.4s,  v21.4s, v23.4s

    abs         v0.8h, v0.8h
    abs         v1.8h, v1.8h
    abs         v2.8h, v2.8h
    abs         v3.8h, v3.8h
    abs         v4.8h, v4.8h
    abs         v5.8h, v5.8h
    abs         v6.8h, v6.8h
    abs         v7.8h, v7.8h

    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h

    ret
endfunc
// 16x8 SATD: two 16x4 helper calls, partial sums carried in v30/v31.
function x264_pixel_satd_16x8_neon, export=1
    mov         x4, x30

    bl          x264_satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h

    bl          x264_satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h

    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc
// 16x16 SATD: four 16x4 helper calls, partial sums carried in v30/v31.
function x264_pixel_satd_16x16_neon, export=1
    mov         x4, x30

    bl          x264_satd_16x4_neon
    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h

    bl          x264_satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h

    bl          x264_satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h

    bl          x264_satd_16x4_neon
    add         v0.8h, v0.8h, v1.8h
    add         v1.8h, v2.8h, v3.8h
    add         v30.8h, v30.8h, v0.8h
    add         v31.8h, v31.8h, v1.8h

    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc
// Local helper: diff a 16x4 strip into v16..v23 (low/high byte halves
// kept separate), start the first butterfly and tail-call the shared
// hadamard body.
function x264_satd_16x4_neon
    ld1         {v1.16b}, [x2], x3
    ld1         {v0.16b}, [x0], x1
    ld1         {v3.16b}, [x2], x3
    ld1         {v2.16b}, [x0], x1
    usubl       v16.8h, v0.8b,  v1.8b
    usubl2      v20.8h, v0.16b, v1.16b
    ld1         {v5.16b}, [x2], x3
    ld1         {v4.16b}, [x0], x1
    usubl       v17.8h, v2.8b,  v3.8b
    usubl2      v21.8h, v2.16b, v3.16b
    ld1         {v7.16b}, [x2], x3
    ld1         {v6.16b}, [x0], x1

    usubl       v18.8h, v4.8b,  v5.8b
    usubl2      v22.8h, v4.16b, v5.16b
    usubl       v19.8h, v6.8b,  v7.8b
    usubl2      v23.8h, v6.16b, v7.16b

    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h

    b           x264_satd_8x4v_8x8h_neon
endfunc
// 4x16 SATD: sixteen 4-pixel rows packed into v16..v23 diffs, shared
// hadamard helper does the transform.
function x264_pixel_satd_4x16_neon, export=1
    mov         x4, x30
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    usubl       v16.8h, v0.8b, v1.8b
    usubl       v17.8h, v2.8b, v3.8b
    usubl       v18.8h, v4.8b, v5.8b
    usubl       v19.8h, v6.8b, v7.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v0.s}[0], [x0], x1
    ld1         {v3.s}[0], [x2], x3
    ld1         {v2.s}[0], [x0], x1
    ld1         {v5.s}[0], [x2], x3
    ld1         {v4.s}[0], [x0], x1
    ld1         {v7.s}[0], [x2], x3
    ld1         {v6.s}[0], [x0], x1
    ld1         {v1.s}[1], [x2], x3
    ld1         {v0.s}[1], [x0], x1
    ld1         {v3.s}[1], [x2], x3
    ld1         {v2.s}[1], [x0], x1
    ld1         {v5.s}[1], [x2], x3
    ld1         {v4.s}[1], [x0], x1
    ld1         {v7.s}[1], [x2], x3
    ld1         {v6.s}[1], [x0], x1
    usubl       v20.8h, v0.8b, v1.8b
    usubl       v21.8h, v2.8b, v3.8b
    usubl       v22.8h, v4.8b, v5.8b
    usubl       v23.8h, v6.8b, v7.8b

    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h

    bl          x264_satd_8x4v_8x8h_neon

    add         v30.8h, v0.8h, v1.8h
    add         v31.8h, v2.8h, v3.8h
    add         v0.8h, v30.8h, v31.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    ret         x4
endfunc
// 8x8 SA8D: helper leaves per-lane sums in v0/v1; the result is
// rounded ((sum + 1) >> 1).
function x264_pixel_sa8d_8x8_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_8x8_neon
    add         v0.8h, v0.8h, v1.8h
    uaddlv      s0, v0.8h
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1
    ret         x4
endfunc
// 16x16 SA8D: four 8x8 sub-blocks (top-left, bottom-left, then rewind
// 16 rows and shift 8 columns right for the right half), accumulated
// in v30/v31 at 32-bit precision.
function x264_pixel_sa8d_16x16_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    sub         x0, x0, x1, lsl #4      // back up 16 rows
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #8              // move to the right 8x16 half
    add         x2, x2, #8
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    bl          pixel_sa8d_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    add         v0.4s, v30.4s, v31.4s
    addv        s0, v0.4s
    mov         w0, v0.s[0]
    add         w0, w0, #1
    lsr         w0, w0, #1              // rounded >> 1
    ret         x4
endfunc
// Emit the 8x8 SA8D helper; with \satd == "satd_" it additionally
// computes the 4x4-hadamard SATD of the same block (left in v26/v27)
// from the intermediate vertical-pass registers.
// Out: per-lane SA8D sums in v0/v1 (and SATD sums in v26/v27).
.macro sa8d_satd_8x8 satd=
function pixel_sa8d_\satd\()8x8_neon
    load_diff_fly_8x8

    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h

    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
    // 4x4 SATD of the same block, from the vertical-pass registers
    transpose   v0.8h,  v1.8h,  v16.8h, v17.8h
    transpose   v2.8h,  v3.8h,  v18.8h, v19.8h
    transpose   v4.8h,  v5.8h,  v20.8h, v21.8h
    transpose   v6.8h,  v7.8h,  v22.8h, v23.8h

    SUMSUB_AB   v24.8h, v25.8h, v0.8h, v1.8h
    SUMSUB_AB   v26.8h, v27.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h,  v1.8h,  v4.8h, v5.8h
    SUMSUB_AB   v2.8h,  v3.8h,  v6.8h, v7.8h

    transpose   v4.4s,  v6.4s,  v24.4s, v26.4s
    transpose   v5.4s,  v7.4s,  v25.4s, v27.4s
    transpose   v24.4s, v26.4s, v0.4s,  v2.4s
    transpose   v25.4s, v27.4s, v1.4s,  v3.4s

    abs         v0.8h, v4.8h
    abs         v1.8h, v5.8h
    abs         v2.8h, v6.8h
    abs         v3.8h, v7.8h
    abs         v4.8h, v24.8h
    abs         v5.8h, v25.8h
    abs         v6.8h, v26.8h
    abs         v7.8h, v27.8h

    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h

    add         v26.8h, v0.8h, v1.8h   // SATD partial sums
    add         v27.8h, v2.8h, v3.8h
.endif

    SUMSUB_AB   v0.8h, v16.8h, v16.8h, v20.8h
    SUMSUB_AB   v1.8h, v17.8h, v17.8h, v21.8h
    SUMSUB_AB   v2.8h, v18.8h, v18.8h, v22.8h
    SUMSUB_AB   v3.8h, v19.8h, v19.8h, v23.8h

    transpose   v20.8h, v21.8h, v16.8h, v17.8h
    transpose   v4.8h,  v5.8h,  v0.8h,  v1.8h
    transpose   v22.8h, v23.8h, v18.8h, v19.8h
    transpose   v6.8h,  v7.8h,  v2.8h,  v3.8h

    SUMSUB_AB   v2.8h,  v3.8h,  v20.8h, v21.8h
    SUMSUB_AB   v24.8h, v25.8h, v4.8h,  v5.8h
    SUMSUB_AB   v0.8h,  v1.8h,  v22.8h, v23.8h
    SUMSUB_AB   v4.8h,  v5.8h,  v6.8h,  v7.8h

    transpose   v20.4s, v22.4s, v2.4s,  v0.4s
    transpose   v21.4s, v23.4s, v3.4s,  v1.4s
    transpose   v16.4s, v18.4s, v24.4s, v4.4s
    transpose   v17.4s, v19.4s, v25.4s, v5.4s

    SUMSUB_AB   v0.8h, v2.8h, v20.8h, v22.8h
    SUMSUB_AB   v1.8h, v3.8h, v21.8h, v23.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h

    transpose   v16.2d, v20.2d, v0.2d, v4.2d
    transpose   v17.2d, v21.2d, v1.2d, v5.2d
    transpose   v18.2d, v22.2d, v2.2d, v6.2d
    transpose   v19.2d, v23.2d, v3.2d, v7.2d

    abs         v16.8h, v16.8h
    abs         v20.8h, v20.8h
    abs         v17.8h, v17.8h
    abs         v21.8h, v21.8h
    abs         v18.8h, v18.8h
    abs         v22.8h, v22.8h
    abs         v19.8h, v19.8h
    abs         v23.8h, v23.8h

    umax        v16.8h, v16.8h, v20.8h
    umax        v17.8h, v17.8h, v21.8h
    umax        v18.8h, v18.8h, v22.8h
    umax        v19.8h, v19.8h, v23.8h

    add         v0.8h, v16.8h, v17.8h
    add         v1.8h, v18.8h, v19.8h

    ret
endfunc
.endm

sa8d_satd_8x8
sa8d_satd_8x8 satd_
// 16x16 combined SA8D+SATD: four 8x8 sub-blocks; SA8D partials in
// v30/v31, SATD partials (v26/v27 from the helper) in v28/v29.
// Returns SATD in the high 32 bits of x0 and rounded SA8D in the low.
function x264_pixel_sa8d_satd_16x16_neon, export=1
    mov         x4, x30
    bl          pixel_sa8d_satd_8x8_neon
    uaddlp      v30.4s, v0.8h
    uaddlp      v31.4s, v1.8h
    uaddlp      v28.4s, v26.8h
    uaddlp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    sub         x0, x0, x1, lsl #4      // rewind 16 rows, move 8 right
    sub         x2, x2, x3, lsl #4
    add         x0, x0, #8
    add         x2, x2, #8
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    bl          pixel_sa8d_satd_8x8_neon
    uadalp      v30.4s, v0.8h
    uadalp      v31.4s, v1.8h
    uadalp      v28.4s, v26.8h
    uadalp      v29.4s, v27.8h
    add         v0.4s, v30.4s, v31.4s  // sa8d
    add         v1.4s, v28.4s, v29.4s  // satd
    addv        s0, v0.4s
    addv        s1, v1.4s
    urshr       v0.4s, v0.4s, #1       // sa8d = (sa8d + 1) >> 1
    fmov        w0, s0
    fmov        w1, s1
    add         x0, x0, x1, lsl #32
    ret         x4
endfunc
// Emit x264_pixel_hadamard_ac_\w x \h _neon: repeated 8x8 helper calls
// covering the block; the helper accumulates ac4 into v28 and ac8 into
// v29.  Returns (ac4 >> 1) in the low 32 bits and (ac8 >> 2) in the
// high 32 bits of x0.
.macro HADAMARD_AC w h
function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
    movrel      x5, mask_ac_4_8
    mov         x4, x30
    ld1         {v30.8h,v31.8h}, [x5]  // v30 = ac4 mask, v31 = ac8 mask
    movi        v28.16b, #0
    movi        v29.16b, #0

    bl          x264_hadamard_ac_8x8_neon
.if \h > 8
    bl          x264_hadamard_ac_8x8_neon
.endif
.if \w > 8
    sub         x0, x0, x1, lsl #3     // back up 8 rows, right 8 columns
    add         x0, x0, #8
    bl          x264_hadamard_ac_8x8_neon
.endif
.if \w * \h == 256
    sub         x0, x0, x1, lsl #4     // 16x16 only: back up 16 rows
    bl          x264_hadamard_ac_8x8_neon
.endif

    addv        s1, v29.4s
    addv        s0, v28.4s
    mov         w1, v1.s[0]
    mov         w0, v0.s[0]
    lsr         w1, w1, #2
    lsr         w0, w0, #1
    orr         x0, x0, x1, lsl #32
    ret         x4
endfunc
.endm

HADAMARD_AC  8, 8
HADAMARD_AC  8, 16
HADAMARD_AC  16, 8
HADAMARD_AC  16, 16
// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
// 8x8 hadamard AC helper: accumulates the 4x4-transform AC energy into
// v28 and the 8x8-transform AC energy into v29, advancing x0 by 8 rows.
function x264_hadamard_ac_8x8_neon
    ld1         {v16.8b}, [x0], x1
    ld1         {v17.8b}, [x0], x1
    ld1         {v18.8b}, [x0], x1
    ld1         {v19.8b}, [x0], x1
    SUMSUBL_AB  v0.8h, v1.8h, v16.8b, v17.8b
    ld1         {v20.8b}, [x0], x1
    ld1         {v21.8b}, [x0], x1
    SUMSUBL_AB  v2.8h, v3.8h, v18.8b, v19.8b
    ld1         {v22.8b}, [x0], x1
    ld1         {v23.8b}, [x0], x1
    SUMSUBL_AB  v4.8h, v5.8h, v20.8b, v21.8b
    SUMSUBL_AB  v6.8h, v7.8h, v22.8b, v23.8b

    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h

    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h

    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h

    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s

    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h

    // 4x4 stage: sum |coeffs|, masking out the four DC terms (v30)
    abs         v0.8h, v16.8h
    abs         v4.8h, v20.8h
    abs         v1.8h, v17.8h
    abs         v5.8h, v21.8h
    abs         v2.8h, v18.8h
    abs         v6.8h, v22.8h
    abs         v3.8h, v19.8h
    abs         v7.8h, v23.8h

    add         v0.8h, v0.8h, v4.8h
    add         v1.8h, v1.8h, v5.8h
    and         v0.16b, v0.16b, v30.16b
    add         v2.8h, v2.8h, v6.8h
    add         v3.8h, v3.8h, v7.8h
    add         v0.8h, v0.8h, v2.8h
    add         v1.8h, v1.8h, v3.8h
    uadalp      v28.4s, v0.8h
    uadalp      v28.4s, v1.8h

    // final 8x8 butterfly stage
    SUMSUB_AB   v6.8h, v7.8h, v23.8h, v19.8h
    SUMSUB_AB   v4.8h, v5.8h, v22.8h, v18.8h
    SUMSUB_AB   v2.8h, v3.8h, v21.8h, v17.8h
    SUMSUB_AB   v1.8h, v0.8h, v16.8h, v20.8h

    transpose   v16.2d, v17.2d, v6.2d, v7.2d
    transpose   v18.2d, v19.2d, v4.2d, v5.2d
    transpose   v20.2d, v21.2d, v2.2d, v3.2d

    abs         v16.8h, v16.8h
    abs         v17.8h, v17.8h
    abs         v18.8h, v18.8h
    abs         v19.8h, v19.8h
    abs         v20.8h, v20.8h
    abs         v21.8h, v21.8h

    transpose   v7.2d, v6.2d, v1.2d, v0.2d

    umax        v3.8h, v16.8h, v17.8h
    umax        v2.8h, v18.8h, v19.8h
    umax        v1.8h, v20.8h, v21.8h

    SUMSUB_AB   v4.8h, v5.8h, v7.8h, v6.8h

    add         v2.8h, v2.8h, v3.8h
    add         v2.8h, v2.8h, v1.8h
    and         v4.16b, v4.16b, v31.16b // mask the single 8x8 DC term
    add         v2.8h, v2.8h, v2.8h
    abs         v5.8h, v5.8h
    abs         v4.8h, v4.8h
    add         v2.8h, v2.8h, v5.8h
    add         v2.8h, v2.8h, v4.8h
    uadalp      v29.4s, v2.8h
    ret
endfunc
// SSIM 4x4x2 core: over four 8-pixel rows of each input, accumulate
// s1 = sum(a), s2 = sum(b), ss = sum(a^2)+sum(b^2), s12 = sum(a*b),
// and store the four pair-reduced 2s vectors to [x4] with st4.
function x264_pixel_ssim_4x4x2_core_neon, export=1
    ld1         {v0.8b}, [x0], x1
    ld1         {v2.8b}, [x2], x3
    umull       v16.8h, v0.8b, v0.8b
    umull       v17.8h, v0.8b, v2.8b
    umull       v18.8h, v2.8b, v2.8b

    ld1         {v28.8b}, [x0], x1
    ld1         {v29.8b}, [x2], x3
    umull       v20.8h, v28.8b, v28.8b
    umull       v21.8h, v28.8b, v29.8b
    umull       v22.8h, v29.8b, v29.8b

    uaddlp      v16.4s, v16.8h          // v16: ss, v17: s12
    uaddlp      v17.4s, v17.8h
    uaddl       v0.8h, v0.8b, v28.8b    // v0: s1, v1: s2
    uadalp      v16.4s, v18.8h
    uaddl       v1.8h, v2.8b, v29.8b

    ld1         {v26.8b}, [x0], x1
    ld1         {v27.8b}, [x2], x3
    umull       v23.8h, v26.8b, v26.8b
    umull       v24.8h, v26.8b, v27.8b
    umull       v25.8h, v27.8b, v27.8b

    uadalp      v16.4s, v20.8h
    uaddw       v0.8h, v0.8h, v26.8b
    uadalp      v17.4s, v21.8h
    uaddw       v1.8h, v1.8h, v27.8b
    uadalp      v16.4s, v22.8h

    ld1         {v28.8b}, [x0], x1
    ld1         {v29.8b}, [x2], x3
    umull       v20.8h, v28.8b, v28.8b
    umull       v21.8h, v28.8b, v29.8b
    umull       v22.8h, v29.8b, v29.8b

    uadalp      v16.4s, v23.8h
    uaddw       v0.8h, v0.8h, v28.8b
    uadalp      v17.4s, v24.8h
    uaddw       v1.8h, v1.8h, v29.8b
    uadalp      v16.4s, v25.8h

    uadalp      v16.4s, v20.8h
    uadalp      v17.4s, v21.8h
    uadalp      v16.4s, v22.8h

    uaddlp      v0.4s, v0.8h
    uaddlp      v1.4s, v1.8h

    addp        v0.4s, v0.4s, v0.4s
    addp        v1.4s, v1.4s, v1.4s
    addp        v2.4s, v16.4s, v16.4s
    addp        v3.4s, v17.4s, v17.4s

    st4         {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
    ret
endfunc
// SSIM tail: combine up to 4 groups of sums from x0/x1, apply the SSIM
// formula with the standard constants, mask off unused lanes when
// w2 < 4 (via the `mask` constant), and return the summed SSIM in s0.
function x264_pixel_ssim_end4_neon, export=1
    mov         x5, #4
    ld1         {v16.4s,v17.4s}, [x0], #32
    ld1         {v18.4s,v19.4s}, [x1], #32
    mov         w4, #0x99bb
    subs        x2, x5, w2, uxtw        // lanes to mask off; flags used below
    mov         w3, #416                // ssim_c1 = .01*.01*255*255*64
    movk        w4, #0x03, lsl #16      // ssim_c2 = .03*.03*255*255*64*63
    add         v0.4s, v16.4s, v18.4s
    add         v1.4s, v17.4s, v19.4s
    add         v0.4s, v0.4s, v1.4s
    ld1         {v20.4s,v21.4s}, [x0], #32
    ld1         {v22.4s,v23.4s}, [x1], #32
    add         v2.4s, v20.4s, v22.4s
    add         v3.4s, v21.4s, v23.4s
    add         v1.4s, v1.4s, v2.4s
    ld1         {v16.4s}, [x0], #16
    ld1         {v18.4s}, [x1], #16
    add         v16.4s, v16.4s, v18.4s
    add         v2.4s, v2.4s, v3.4s
    add         v3.4s, v3.4s, v16.4s

    dup         v30.4s, w3              // ssim_c1
    dup         v31.4s, w4              // ssim_c2

    transpose   v4.4s, v5.4s, v0.4s, v1.4s
    transpose   v6.4s, v7.4s, v2.4s, v3.4s
    transpose   v0.2d, v2.2d, v4.2d, v6.2d
    transpose   v1.2d, v3.2d, v5.2d, v7.2d

    mul         v16.4s, v0.4s, v1.4s    // s1*s2
    mul         v0.4s, v0.4s, v0.4s
    mla         v0.4s, v1.4s, v1.4s     // s1*s1 + s2*s2

    shl         v3.4s, v3.4s, #7
    shl         v2.4s, v2.4s, #6
    add         v1.4s, v16.4s, v16.4s

    sub         v2.4s, v2.4s, v0.4s     // vars
    sub         v3.4s, v3.4s, v1.4s     // covar*2
    add         v0.4s, v0.4s, v30.4s
    add         v2.4s, v2.4s, v31.4s
    add         v1.4s, v1.4s, v30.4s
    add         v3.4s, v3.4s, v31.4s

    scvtf       v0.4s, v0.4s
    scvtf       v2.4s, v2.4s
    scvtf       v1.4s, v1.4s
    scvtf       v3.4s, v3.4s

    fmul        v0.4s, v0.4s, v2.4s
    fmul        v1.4s, v1.4s, v3.4s

    fdiv        v0.4s, v1.4s, v0.4s

    b.eq        1f                      // all 4 lanes valid: skip masking
    movrel      x3, mask
    add         x3, x3, x2, lsl #2
    ld1         {v29.4s}, [x3]
    and         v0.16b, v0.16b, v29.16b
1:
    faddp       v0.4s, v0.4s, v0.4s
    faddp       s0, v0.2s
    ret
endfunc