/*****************************************************************************
 * dct-a.S: aarch64 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2016 x264 project
 *
 * Authors: David Conrad <[email protected]>
 *          Janne Grunau <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "asm.S"

const scan4x4_frame, align=4
    .byte 0,1, 8,9, 2,3, 4,5
    .byte 10,11, 16,17, 24,25, 18,19
    .byte 12,13, 6,7, 14,15, 20,21
    .byte 26,27, 28,29, 22,23, 30,31
endconst

const scan4x4_field, align=4
    .byte 0,1, 2,3, 8,9, 4,5
    .byte 6,7, 10,11, 12,13, 14,15
endconst

const sub4x4_frame, align=4
    .byte 0, 1, 4, 8
    .byte 5, 2, 3, 6
    .byte 9, 12, 13, 10
    .byte 7, 11, 14, 15
endconst

const sub4x4_field, align=4
    .byte 0, 4, 1, 8
    .byte 12, 5, 9, 13
    .byte 2, 6, 10, 14
    .byte 3, 7, 11, 15
endconst

// sum = a + (b>>shift)   sub = (a>>shift) - b
.macro SUMSUB_SHR shift sum sub a b t0 t1
    sshr \t0, \b, #\shift
    sshr \t1, \a, #\shift
    add \sum, \a, \t0
    sub \sub, \t1, \b
.endm

// sum = (a>>shift) + b   sub = a - (b>>shift)
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
    sshr \t0, \a, #\shift
    sshr \t1, \b, #\shift
    add \sum, \t0, \b
    sub \sub, \a, \t1
.endm

// a += 1.5*ma   b -= 1.5*mb
.macro SUMSUB_15 a b ma mb t0 t1
    sshr \t0, \ma, #1
    sshr \t1, \mb, #1
    add \t0, \t0, \ma
    add \t1, \t1, \mb
    add \a, \a, \t0
    sub \b, \b, \t1
.endm

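// x264_dct4x4dc_neon / x264_idct4x4dc_neon transform the 4x4 block of DC
// coefficients at x0 in place: a 4x4 Walsh-Hadamard transform applied to
// rows and then columns (the two transpose passes in between).  The
// forward version also applies the (x+1)>>1 scaling of the C reference,
// folded into srhadd and the +1-biased shsub pairs.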
function x264_dct4x4dc_neon, export=1
    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    movi v31.4h, #1
    SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
    SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
    transpose v4.4h, v6.4h, v0.4h, v2.4h
    transpose v5.4h, v7.4h, v1.4h, v3.4h
    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
    transpose v4.2s, v5.2s, v0.2s, v1.2s
    transpose v6.2s, v7.2s, v2.2s, v3.2s
    add v16.4h, v4.4h, v31.4h
    add v17.4h, v6.4h, v31.4h
    srhadd v0.4h, v4.4h, v5.4h
    shsub v1.4h, v16.4h, v5.4h
    shsub v2.4h, v17.4h, v7.4h
    srhadd v3.4h, v6.4h, v7.4h
    st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
endfunc

function x264_idct4x4dc_neon, export=1
    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
    SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
    transpose v4.4h, v6.4h, v0.4h, v2.4h
    transpose v5.4h, v7.4h, v1.4h, v3.4h
    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
    transpose v4.2s, v5.2s, v0.2s, v1.2s
    transpose v6.2s, v7.2s, v2.2s, v3.2s
    SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
    SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
    st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
endfunc

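// One pass of the 4-point H.264 forward transform; \v4-\v7 are the
// inputs d0-d3, \v0-\v3 the outputs:
//   \v0 =   d0 +   d1 +   d2 +   d3
//   \v1 = 2*d0 +   d1 -   d2 - 2*d3
//   \v2 =   d0 -   d1 -   d2 +   d3
//   \v3 =   d0 - 2*d1 + 2*d2 -   d3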
.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
    SUMSUB_AB \v1, \v6, \v5, \v6
    SUMSUB_AB \v3, \v7, \v4, \v7
    add \v0, \v3, \v1
    add \v4, \v7, \v7
    add \v5, \v6, \v6
    sub \v2, \v3, \v1
    add \v1, \v4, \v6
    sub \v3, \v7, \v5
.endm

function x264_sub4x4_dct_neon, export=1
    mov x3, #FENC_STRIDE
    mov x4, #FDEC_STRIDE
    ld1 {v0.s}[0], [x1], x3
    ld1 {v1.s}[0], [x2], x4
    ld1 {v2.s}[0], [x1], x3
    usubl v16.8h, v0.8b, v1.8b
    ld1 {v3.s}[0], [x2], x4
    ld1 {v4.s}[0], [x1], x3
    usubl v17.8h, v2.8b, v3.8b
    ld1 {v5.s}[0], [x2], x4
    ld1 {v6.s}[0], [x1], x3
    usubl v18.8h, v4.8b, v5.8b
    ld1 {v7.s}[0], [x2], x4
    usubl v19.8h, v6.8b, v7.8b

    DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
    DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
    ret
endfunc

function x264_sub8x4_dct_neon
    ld1 {v0.8b}, [x1], x3
    ld1 {v1.8b}, [x2], x4
    usubl v16.8h, v0.8b, v1.8b
    ld1 {v2.8b}, [x1], x3
    ld1 {v3.8b}, [x2], x4
    usubl v17.8h, v2.8b, v3.8b
    ld1 {v4.8b}, [x1], x3
    ld1 {v5.8b}, [x2], x4
    usubl v18.8h, v4.8b, v5.8b
    ld1 {v6.8b}, [x1], x3
    ld1 {v7.8b}, [x2], x4
    usubl v19.8h, v6.8b, v7.8b

    DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7

    SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
    SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
    add v22.8h, v19.8h, v19.8h
    add v21.8h, v18.8h, v18.8h
    add v0.8h, v16.8h, v17.8h
    sub v1.8h, v16.8h, v17.8h

    add v2.8h, v22.8h, v18.8h
    sub v3.8h, v19.8h, v21.8h

    zip1 v4.2d, v0.2d, v2.2d
    zip2 v6.2d, v0.2d, v2.2d
    zip1 v5.2d, v1.2d, v3.2d
    zip2 v7.2d, v1.2d, v3.2d

    st1 {v4.8h}, [x0], #16
    st1 {v5.8h}, [x0], #16
    st1 {v6.8h}, [x0], #16
    st1 {v7.8h}, [x0], #16
    ret
endfunc

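// The 8x8 and 16x16 wrappers are built on x264_sub8x4_dct_neon: the
// return address is stashed in x5 so the final 8x4 block can be reached
// with a tail call (b) instead of bl+ret.  The 16x16 version visits the
// four 8x8 quadrants in the order top-left, top-right, bottom-left,
// bottom-right, rewinding x1/x2 with the sub instructions between
// quadrants.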
function x264_sub8x8_dct_neon, export=1
    mov x5, x30
    mov x3, #FENC_STRIDE
    mov x4, #FDEC_STRIDE
    bl x264_sub8x4_dct_neon
    mov x30, x5
    b x264_sub8x4_dct_neon
endfunc

function x264_sub16x16_dct_neon, export=1
    mov x5, x30
    mov x3, #FENC_STRIDE
    mov x4, #FDEC_STRIDE
    bl x264_sub8x4_dct_neon
    bl x264_sub8x4_dct_neon
    sub x1, x1, #8*FENC_STRIDE-8
    sub x2, x2, #8*FDEC_STRIDE-8
    bl x264_sub8x4_dct_neon
    bl x264_sub8x4_dct_neon
    sub x1, x1, #8
    sub x2, x2, #8
    bl x264_sub8x4_dct_neon
    bl x264_sub8x4_dct_neon
    sub x1, x1, #8*FENC_STRIDE-8
    sub x2, x2, #8*FDEC_STRIDE-8
    bl x264_sub8x4_dct_neon
    mov x30, x5
    b x264_sub8x4_dct_neon
endfunc


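// One pass of the 8-point forward transform used by the 8x8 DCT.  The
// trailing comments (s34/d34 ... a0-a7) name the intermediates of the
// scalar reference; the sshr/add pairs build the 1.5*x (x + (x>>1))
// terms and SUMSUB_SHR/SUMSUB_SHR2 the final x>>1 / x>>2 combinations.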
.macro DCT8_1D type
    SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h   // s34/d34
    SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h   // s25/d25
    SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h   // s16/d16
    SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h   // s07/d07

    SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2
    SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3

    SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5
    sshr v23.8h, v21.8h, #1
    sshr v18.8h, v16.8h, #1
    add v23.8h, v23.8h, v21.8h
    add v18.8h, v18.8h, v16.8h
    sub v30.8h, v30.8h, v23.8h
    sub v29.8h, v29.8h, v18.8h

    SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7
    sshr v22.8h, v20.8h, #1
    sshr v19.8h, v17.8h, #1
    add v22.8h, v22.8h, v20.8h
    add v19.8h, v19.8h, v17.8h
    add v22.8h, v28.8h, v22.8h
    add v31.8h, v31.8h, v19.8h

    SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h
    SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h
    SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h
    SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h
.endm

function x264_sub8x8_dct8_neon, export=1
    mov x3, #FENC_STRIDE
    mov x4, #FDEC_STRIDE
    ld1 {v16.8b}, [x1], x3
    ld1 {v17.8b}, [x2], x4
    ld1 {v18.8b}, [x1], x3
    ld1 {v19.8b}, [x2], x4
    usubl v0.8h, v16.8b, v17.8b
    ld1 {v20.8b}, [x1], x3
    ld1 {v21.8b}, [x2], x4
    usubl v1.8h, v18.8b, v19.8b
    ld1 {v22.8b}, [x1], x3
    ld1 {v23.8b}, [x2], x4
    usubl v2.8h, v20.8b, v21.8b
    ld1 {v24.8b}, [x1], x3
    ld1 {v25.8b}, [x2], x4
    usubl v3.8h, v22.8b, v23.8b
    ld1 {v26.8b}, [x1], x3
    ld1 {v27.8b}, [x2], x4
    usubl v4.8h, v24.8b, v25.8b
    ld1 {v28.8b}, [x1], x3
    ld1 {v29.8b}, [x2], x4
    usubl v5.8h, v26.8b, v27.8b
    ld1 {v30.8b}, [x1], x3
    ld1 {v31.8b}, [x2], x4
    usubl v6.8h, v28.8b, v29.8b
    usubl v7.8h, v30.8b, v31.8b

    DCT8_1D row
    transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
    DCT8_1D col

    st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
    st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
    ret
endfunc

function x264_sub16x16_dct8_neon, export=1
    mov x7, x30
    bl X(x264_sub8x8_dct8_neon)
    sub x1, x1, #FENC_STRIDE*8 - 8
    sub x2, x2, #FDEC_STRIDE*8 - 8
    bl X(x264_sub8x8_dct8_neon)
    sub x1, x1, #8
    sub x2, x2, #8
    bl X(x264_sub8x8_dct8_neon)
    mov x30, x7
    sub x1, x1, #FENC_STRIDE*8 - 8
    sub x2, x2, #FDEC_STRIDE*8 - 8
    b X(x264_sub8x8_dct8_neon)
endfunc


// First part of IDCT (minus final SUMSUB_BA)
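// Given inputs \d0-\d3 it leaves
//   \d4 = d0 + d2        \d5 = d0 - d2
//   \d6 = d1 + (d3>>1)   \d7 = (d1>>1) - d3
// and the caller completes the pass with SUMSUB_AB on the (\d4,\d6)
// and (\d5,\d7) pairs.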
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
    SUMSUB_AB \d4, \d5, \d0, \d2
    sshr \d7, \d1, #1
    sshr \d6, \d3, #1
    sub \d7, \d7, \d3
    add \d6, \d6, \d1
.endm

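// add4x4_idct/add8x4_idct (and the 8x8/16x16 wrappers) run the inverse
// transform on the coefficients at x1, round with srshr #6, add the
// result to the predicted pixels at x0 (stride FDEC_STRIDE in x2) and
// clip to 0..255 with sqxtun.  In the 4x4 version rows 2 and 3 stay
// swapped from the transpose until the final stores (v0,v1,v3,v2) put
// them back in order.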
function x264_add4x4_idct_neon, export=1
    mov x2, #FDEC_STRIDE
    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]

    IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    ld1 {v28.s}[0], [x0], x2
    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h

    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19

    IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
    ld1 {v29.s}[0], [x0], x2
    SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h

    srshr v0.4h, v0.4h, #6
    srshr v1.4h, v1.4h, #6
    ld1 {v31.s}[0], [x0], x2
    srshr v2.4h, v2.4h, #6
    srshr v3.4h, v3.4h, #6
    ld1 {v30.s}[0], [x0], x2

    sub x0, x0, x2, lsl #2
    uaddw v0.8h, v0.8h, v28.8b
    uaddw v1.8h, v1.8h, v29.8b
    uaddw v2.8h, v2.8h, v30.8b
    uaddw v3.8h, v3.8h, v31.8b
    sqxtun v0.8b, v0.8h
    sqxtun v1.8b, v1.8h
    sqxtun v2.8b, v2.8h
    sqxtun v3.8b, v3.8h

    st1 {v0.s}[0], [x0], x2
    st1 {v1.s}[0], [x0], x2
    st1 {v3.s}[0], [x0], x2
    st1 {v2.s}[0], [x0], x2
    ret
endfunc

function x264_add8x4_idct_neon, export=1
    ld1 {v0.8h,v1.8h}, [x1], #32
    ld1 {v2.8h,v3.8h}, [x1], #32
    transpose v20.2d, v21.2d, v0.2d, v2.2d
    transpose v22.2d, v23.2d, v1.2d, v3.2d
    IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
    SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
    SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h

    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7

    IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
    SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
    SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h

    srshr v0.8h, v0.8h, #6
    ld1 {v28.8b}, [x0], x2
    srshr v1.8h, v1.8h, #6
    ld1 {v29.8b}, [x0], x2
    srshr v2.8h, v2.8h, #6
    ld1 {v30.8b}, [x0], x2
    srshr v3.8h, v3.8h, #6
    ld1 {v31.8b}, [x0], x2

    sub x0, x0, x2, lsl #2
    uaddw v0.8h, v0.8h, v28.8b
    uaddw v1.8h, v1.8h, v29.8b
    uaddw v2.8h, v2.8h, v30.8b
    uaddw v3.8h, v3.8h, v31.8b

    sqxtun v0.8b, v0.8h
    sqxtun v1.8b, v1.8h
    st1 {v0.8b}, [x0], x2
    sqxtun v2.8b, v2.8h
    st1 {v1.8b}, [x0], x2
    sqxtun v3.8b, v3.8h
    st1 {v2.8b}, [x0], x2
    st1 {v3.8b}, [x0], x2
    ret
endfunc

function x264_add8x8_idct_neon, export=1
    mov x2, #FDEC_STRIDE
    mov x5, x30
    bl X(x264_add8x4_idct_neon)
    mov x30, x5
    b X(x264_add8x4_idct_neon)
endfunc

function x264_add16x16_idct_neon, export=1
    mov x2, #FDEC_STRIDE
    mov x5, x30
    bl X(x264_add8x4_idct_neon)
    bl X(x264_add8x4_idct_neon)
    sub x0, x0, #8*FDEC_STRIDE-8
    bl X(x264_add8x4_idct_neon)
    bl X(x264_add8x4_idct_neon)
    sub x0, x0, #8
    bl X(x264_add8x4_idct_neon)
    bl X(x264_add8x4_idct_neon)
    sub x0, x0, #8*FDEC_STRIDE-8
    bl X(x264_add8x4_idct_neon)
    mov x30, x5
    b X(x264_add8x4_idct_neon)
endfunc

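// One pass of the 8-point inverse transform.  In the row pass the last
// two coefficient vectors (v22/v23) are loaded from x1 inside the macro
// (.ifc \type, row); the column pass works entirely on the registers
// left by the preceding transpose.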
.macro IDCT8_1D type
    SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2
.ifc \type, row
    ld1 {v22.8h,v23.8h}, [x1], #32
.endif
    SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h  // a6/a4
    SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h
    SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h    // a7/a1
    SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h
    SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h    // a5/a3

    SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h  // b3/b5
    SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7

    SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6
    SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4

    SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h
    SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h
    SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h
    SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h
.endm

function x264_add8x8_idct8_neon, export=1
    mov x2, #FDEC_STRIDE
    ld1 {v16.8h,v17.8h}, [x1], #32
    ld1 {v18.8h,v19.8h}, [x1], #32
    ld1 {v20.8h,v21.8h}, [x1], #32

    IDCT8_1D row

    transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31

    IDCT8_1D col

    ld1 {v0.8b}, [x0], x2
    srshr v16.8h, v16.8h, #6
    ld1 {v1.8b}, [x0], x2
    srshr v17.8h, v17.8h, #6
    ld1 {v2.8b}, [x0], x2
    srshr v18.8h, v18.8h, #6
    ld1 {v3.8b}, [x0], x2
    srshr v19.8h, v19.8h, #6
    ld1 {v4.8b}, [x0], x2
    srshr v20.8h, v20.8h, #6
    ld1 {v5.8b}, [x0], x2
    srshr v21.8h, v21.8h, #6
    ld1 {v6.8b}, [x0], x2
    srshr v22.8h, v22.8h, #6
    ld1 {v7.8b}, [x0], x2
    srshr v23.8h, v23.8h, #6
    sub x0, x0, x2, lsl #3

    uaddw v16.8h, v16.8h, v0.8b
    uaddw v17.8h, v17.8h, v1.8b
    uaddw v18.8h, v18.8h, v2.8b
    sqxtun v0.8b, v16.8h
    sqxtun v1.8b, v17.8h
    sqxtun v2.8b, v18.8h
    uaddw v19.8h, v19.8h, v3.8b
    st1 {v0.8b}, [x0], x2
    uaddw v20.8h, v20.8h, v4.8b
    st1 {v1.8b}, [x0], x2
    uaddw v21.8h, v21.8h, v5.8b
    st1 {v2.8b}, [x0], x2
    sqxtun v3.8b, v19.8h
    sqxtun v4.8b, v20.8h
    uaddw v22.8h, v22.8h, v6.8b
    uaddw v23.8h, v23.8h, v7.8b
    st1 {v3.8b}, [x0], x2
    sqxtun v5.8b, v21.8h
    st1 {v4.8b}, [x0], x2
    sqxtun v6.8b, v22.8h
    sqxtun v7.8b, v23.8h
    st1 {v5.8b}, [x0], x2
    st1 {v6.8b}, [x0], x2
    st1 {v7.8b}, [x0], x2
    ret
endfunc

function x264_add16x16_idct8_neon, export=1
    mov x7, x30
    bl X(x264_add8x8_idct8_neon)
    sub x0, x0, #8*FDEC_STRIDE-8
    bl X(x264_add8x8_idct8_neon)
    sub x0, x0, #8
    bl X(x264_add8x8_idct8_neon)
    sub x0, x0, #8*FDEC_STRIDE-8
    mov x30, x7
    b X(x264_add8x8_idct8_neon)
endfunc

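// DC-only inverse transform: each 4x4 block of the 8x8 region gets one
// rounded DC value added to its predicted pixels.  Because the pixels
// are unsigned bytes, a signed DC is applied as a saturating add of its
// positive part followed by a saturating subtract of its negated part
// (uqadd/uqsub on the sqxtun-clamped copies).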
function x264_add8x8_idct_dc_neon, export=1
    mov x2, #FDEC_STRIDE
    ld1 {v16.4h}, [x1]
    ld1 {v0.8b}, [x0], x2
    srshr v16.4h, v16.4h, #6
    ld1 {v1.8b}, [x0], x2
    dup v20.8h, v16.h[0]
    dup v21.8h, v16.h[1]
    ld1 {v2.8b}, [x0], x2
    dup v22.8h, v16.h[2]
    dup v23.8h, v16.h[3]
    ld1 {v3.8b}, [x0], x2
    trn1 v20.2d, v20.2d, v21.2d
    ld1 {v4.8b}, [x0], x2
    trn1 v21.2d, v22.2d, v23.2d
    ld1 {v5.8b}, [x0], x2
    neg v22.8h, v20.8h
    ld1 {v6.8b}, [x0], x2
    neg v23.8h, v21.8h
    ld1 {v7.8b}, [x0], x2

    sub x0, x0, #8*FDEC_STRIDE

    sqxtun v20.8b, v20.8h
    sqxtun v21.8b, v21.8h
    sqxtun v22.8b, v22.8h
    sqxtun v23.8b, v23.8h

    uqadd v0.8b, v0.8b, v20.8b
    uqadd v1.8b, v1.8b, v20.8b
    uqadd v2.8b, v2.8b, v20.8b
    uqadd v3.8b, v3.8b, v20.8b
    uqadd v4.8b, v4.8b, v21.8b
    uqadd v5.8b, v5.8b, v21.8b
    uqadd v6.8b, v6.8b, v21.8b
    uqadd v7.8b, v7.8b, v21.8b
    uqsub v0.8b, v0.8b, v22.8b
    uqsub v1.8b, v1.8b, v22.8b
    uqsub v2.8b, v2.8b, v22.8b
    uqsub v3.8b, v3.8b, v22.8b
    uqsub v4.8b, v4.8b, v23.8b
    uqsub v5.8b, v5.8b, v23.8b
    uqsub v6.8b, v6.8b, v23.8b
    uqsub v7.8b, v7.8b, v23.8b

    st1 {v0.8b}, [x0], x2
    st1 {v1.8b}, [x0], x2
    st1 {v2.8b}, [x0], x2
    st1 {v3.8b}, [x0], x2
    st1 {v4.8b}, [x0], x2
    st1 {v5.8b}, [x0], x2
    st1 {v6.8b}, [x0], x2
    st1 {v7.8b}, [x0], x2
    ret
endfunc

.macro ADD16x4_IDCT_DC dc
    ld1 {v4.16b}, [x0], x3
    dup v24.8h, \dc[0]
    dup v25.8h, \dc[1]
    ld1 {v5.16b}, [x0], x3
    dup v26.8h, \dc[2]
    dup v27.8h, \dc[3]
    ld1 {v6.16b}, [x0], x3
    trn1 v24.2d, v24.2d, v25.2d
    ld1 {v7.16b}, [x0], x3
    trn1 v25.2d, v26.2d, v27.2d
    neg v26.8h, v24.8h
    neg v27.8h, v25.8h

    sqxtun v20.8b, v24.8h
    sqxtun v21.8b, v26.8h
    sqxtun2 v20.16b, v25.8h
    sqxtun2 v21.16b, v27.8h

    uqadd v4.16b, v4.16b, v20.16b
    uqadd v5.16b, v5.16b, v20.16b
    uqadd v6.16b, v6.16b, v20.16b
    uqadd v7.16b, v7.16b, v20.16b

    uqsub v4.16b, v4.16b, v21.16b
    uqsub v5.16b, v5.16b, v21.16b
    uqsub v6.16b, v6.16b, v21.16b
    st1 {v4.16b}, [x2], x3
    uqsub v7.16b, v7.16b, v21.16b
    st1 {v5.16b}, [x2], x3
    st1 {v6.16b}, [x2], x3
    st1 {v7.16b}, [x2], x3
.endm

function x264_add16x16_idct_dc_neon, export=1
    mov x2, x0
    mov x3, #FDEC_STRIDE

    ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
    srshr v0.4h, v0.4h, #6
    srshr v1.4h, v1.4h, #6

    ADD16x4_IDCT_DC v0.h
    srshr v2.4h, v2.4h, #6
    ADD16x4_IDCT_DC v1.h
    srshr v3.4h, v3.4h, #6
    ADD16x4_IDCT_DC v2.h
    ADD16x4_IDCT_DC v3.h
    ret
endfunc

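// sub4x4x2_dct_dc accumulates the fenc-fdec differences of an 8x4 strip
// into one vector: lanes 0-3 hold the column sums of the left 4x4 block,
// lanes 4-7 those of the right one.  The dct_dc functions below reduce
// these to one DC sum per 4x4 block and combine the blocks with the
// SUMSUB_AB/addp passes.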
.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
    ld1 {\t0\().8b}, [x1], x3
    ld1 {\t1\().8b}, [x2], x4
    ld1 {\t2\().8b}, [x1], x3
    ld1 {\t3\().8b}, [x2], x4
    usubl \t0\().8h, \t0\().8b, \t1\().8b
    ld1 {\t4\().8b}, [x1], x3
    ld1 {\t5\().8b}, [x2], x4
    usubl \t1\().8h, \t2\().8b, \t3\().8b
    ld1 {\t6\().8b}, [x1], x3
    ld1 {\t7\().8b}, [x2], x4
    add \dst\().8h, \t0\().8h, \t1\().8h
    usubl \t2\().8h, \t4\().8b, \t5\().8b
    usubl \t3\().8h, \t6\().8b, \t7\().8b
    add \dst\().8h, \dst\().8h, \t2\().8h
    add \dst\().8h, \dst\().8h, \t3\().8h
.endm

function x264_sub8x8_dct_dc_neon, export=1
    mov x3, #FENC_STRIDE
    mov x4, #FDEC_STRIDE

    sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31

    transpose v2.2d, v3.2d, v0.2d, v1.2d
    SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
    transpose v2.2d, v3.2d, v0.2d, v1.2d
    SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
    transpose v2.2d, v3.2d, v0.2d, v1.2d

    addp v0.8h, v2.8h, v3.8h
    addp v0.8h, v0.8h, v0.8h

    st1 {v0.4h}, [x0]
    ret
endfunc

function x264_sub8x16_dct_dc_neon, export=1
    mov x3, #FENC_STRIDE
    mov x4, #FDEC_STRIDE
    sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
    sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31

    addp v4.8h, v0.8h, v2.8h
    addp v5.8h, v1.8h, v3.8h

    transpose v2.4s, v3.4s, v4.4s, v5.4s
    SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h

    transpose v2.4s, v3.4s, v0.4s, v1.4s
    SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h

    transpose v2.2d, v3.2d, v0.2d, v1.2d
    SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h

    trn1 v2.2d, v0.2d, v1.2d
    trn2 v3.2d, v1.2d, v0.2d

    addp v0.8h, v2.8h, v3.8h

    st1 {v0.8h}, [x0]
    ret
endfunc

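// zigzag_interleave_8x8_cavlc: split the 64 coefficients of an 8x8 block
// into four 4x4 blocks for CAVLC, block i taking every 4th coefficient
// (the ld4 deinterleave), and store a coded-block flag per 4x4 block to
// nnz[0], nnz[1], nnz[8] and nnz[9] (x3 = 7 advances to the second row
// of the nnz table).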
function x264_zigzag_interleave_8x8_cavlc_neon, export=1
    mov x3, #7
    movi v31.4s, #1
    ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
    ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
    umax v16.8h, v0.8h, v4.8h
    umax v17.8h, v1.8h, v5.8h
    umax v18.8h, v2.8h, v6.8h
    umax v19.8h, v3.8h, v7.8h
    st1 {v0.8h}, [x0], #16
    st1 {v4.8h}, [x0], #16
    umaxp v16.8h, v16.8h, v17.8h
    umaxp v18.8h, v18.8h, v19.8h
    st1 {v1.8h}, [x0], #16
    st1 {v5.8h}, [x0], #16
    umaxp v16.8h, v16.8h, v18.8h
    st1 {v2.8h}, [x0], #16
    st1 {v6.8h}, [x0], #16
    cmhi v16.4s, v16.4s, v31.4s
    st1 {v3.8h}, [x0], #16
    and v16.16b, v16.16b, v31.16b
    st1 {v7.8h}, [x0], #16
    st1 {v16.b}[0], [x2], #1
    st1 {v16.b}[4], [x2], x3
    st1 {v16.b}[8], [x2], #1
    st1 {v16.b}[12], [x2]
    ret
endfunc

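// The scan functions are table-driven: the scan4x4/scan8x8 consts hold
// byte offsets (two per 16-bit coefficient) and tbl applies the whole
// permutation at once.  scan4x4_frame, for instance, picks source
// coefficients 0,4,1,2, 5,8,12,9, ... to match the scalar
// zigzag_scan_4x4_frame.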
function x264_zigzag_scan_4x4_frame_neon, export=1
    movrel x2, scan4x4_frame
    ld1 {v0.16b,v1.16b}, [x1]
    ld1 {v16.16b,v17.16b}, [x2]
    tbl v2.16b, {v0.16b,v1.16b}, v16.16b
    tbl v3.16b, {v0.16b,v1.16b}, v17.16b
    st1 {v2.16b,v3.16b}, [x0]
    ret
endfunc

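// zigzag_sub_4x4: subtract the 4x4 prediction (fdec, x2) from the source
// block (fenc, x1), zigzag-scan the residual to x0, and copy the source
// pixels into fdec as the reconstruction.  Returns 1 in w0 if any
// coefficient is non-zero; the "ac" variants also move the DC
// coefficient to [x3] and clear it in the scanned output.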
.macro zigzag_sub_4x4 f ac
function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1
    mov x9, #FENC_STRIDE
    mov x4, #FDEC_STRIDE
    movrel x5, sub4x4_\f
    mov x6, x2
    ld1 {v0.s}[0], [x1], x9
    ld1 {v0.s}[1], [x1], x9
    ld1 {v0.s}[2], [x1], x9
    ld1 {v0.s}[3], [x1], x9
    ld1 {v16.16b}, [x5]
    ld1 {v1.s}[0], [x2], x4
    ld1 {v1.s}[1], [x2], x4
    ld1 {v1.s}[2], [x2], x4
    ld1 {v1.s}[3], [x2], x4
    tbl v2.16b, {v0.16b}, v16.16b
    tbl v3.16b, {v1.16b}, v16.16b
    st1 {v0.s}[0], [x6], x4
    usubl v4.8h, v2.8b, v3.8b
.ifc \ac, ac
    dup h7, v4.h[0]
    ins v4.h[0], wzr
    fmov w5, s7
    strh w5, [x3]
.endif
    usubl2 v5.8h, v2.16b, v3.16b
    st1 {v0.s}[1], [x6], x4
    umax v6.8h, v4.8h, v5.8h
    umaxv h6, v6.8h
    st1 {v0.s}[2], [x6], x4
    fmov w7, s6
    st1 {v0.s}[3], [x6], x4
    cmp w7, #0
    st1 {v4.8h,v5.8h}, [x0]
    cset w0, ne
    ret
endfunc
.endm

zigzag_sub_4x4 field
zigzag_sub_4x4 field, ac
zigzag_sub_4x4 frame
zigzag_sub_4x4 frame, ac

function x264_zigzag_scan_4x4_field_neon, export=1
    movrel x2, scan4x4_field
    ld1 {v0.8h,v1.8h}, [x1]
    ld1 {v16.16b}, [x2]
    tbl v0.16b, {v0.16b}, v16.16b
    st1 {v0.8h,v1.8h}, [x0]
    ret
endfunc

function x264_zigzag_scan_8x8_frame_neon, export=1
    movrel x2, scan8x8_frame
    ld1 {v0.8h,v1.8h}, [x1], #32
    ld1 {v2.8h,v3.8h}, [x1], #32
    ld1 {v4.8h,v5.8h}, [x1], #32
    ld1 {v6.8h,v7.8h}, [x1]
    ld1 {v16.16b,v17.16b}, [x2], #32
    ld1 {v18.16b,v19.16b}, [x2], #32
    ld1 {v20.16b,v21.16b}, [x2], #32
    ld1 {v22.16b,v23.16b}, [x2], #32
    tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
    tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
    tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
    tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
    tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
    tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
    tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
    mov v25.h[6], v4.h[0]
    mov v25.h[7], v5.h[0]
    mov v26.h[0], v4.h[1]
    mov v27.h[4], v7.h[0]
    mov v28.h[7], v4.h[4]
    mov v29.h[7], v3.h[6]
    mov v30.h[0], v2.h[7]
    mov v30.h[1], v3.h[7]
    st1 {v24.8h,v25.8h}, [x0], #32
    st1 {v26.8h,v27.8h}, [x0], #32
    st1 {v28.8h,v29.8h}, [x0], #32
    st1 {v30.8h,v31.8h}, [x0]
    ret
endfunc

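// Z(z) expands coefficient index z into the two byte offsets tbl needs
// for a 16-bit lane; T(x,y) maps a position in the 8x8 block to that
// index.  The (x-3), (x-0), (x-4) re-definitions bias each section of
// the table to the window of four source registers used by the matching
// tbl above (tbl can only address 64 bytes); the few entries that fall
// outside every window are patched afterwards with the mov v*.h[]
// instructions.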
#define Z(z) 2*(z), 2*(z)+1
#define T(x,y) Z(x*8+y)
const scan8x8_frame, align=5
    .byte T(0,0), T(1,0), T(0,1), T(0,2)
    .byte T(1,1), T(2,0), T(3,0), T(2,1)
    .byte T(1,2), T(0,3), T(0,4), T(1,3)
    .byte T(2,2), T(3,1), T(4,0), T(5,0)
    .byte T(4,1), T(3,2), T(2,3), T(1,4)
    .byte T(0,5), T(0,6), T(1,5), T(2,4)
#undef T
#define T(x,y) Z((x-3)*8+y)
    .byte T(3,3), T(4,2), T(5,1), T(6,0)
    .byte T(7,0), T(6,1), T(5,2), T(4,3)
#undef T
#define T(x,y) Z((x-0)*8+y)
    .byte T(3,4), T(2,5), T(1,6), T(0,7)
    .byte T(1,7), T(2,6), T(3,5), T(4,4)
#undef T
#define T(x,y) Z((x-4)*8+y)
    .byte T(5,3), T(6,2), T(7,1), T(7,2)
    .byte T(6,3), T(5,4), T(4,5), T(3,6)
    .byte T(2,7), T(3,7), T(4,6), T(5,5)
    .byte T(6,4), T(7,3), T(7,4), T(6,5)
    .byte T(5,6), T(4,7), T(5,7), T(6,6)
    .byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst

function x264_zigzag_scan_8x8_field_neon, export=1
    movrel x2, scan8x8_field
    ld1 {v0.8h,v1.8h}, [x1], #32
    ld1 {v2.8h,v3.8h}, [x1], #32
    ld1 {v4.8h,v5.8h}, [x1], #32
    ld1 {v6.8h,v7.8h}, [x1]
    ld1 {v16.16b,v17.16b}, [x2], #32
    ld1 {v18.16b,v19.16b}, [x2], #32
    ld1 {v20.16b,v21.16b}, [x2], #32
    ld1 {v22.16b}, [x2]
    ext v31.16b, v7.16b, v7.16b, #4
    tbl v24.16b, {v0.16b,v1.16b}, v16.16b
    tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
    tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
    tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
    tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
    tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
    ext v31.16b, v6.16b, v31.16b, #12
    st1 {v24.8h,v25.8h}, [x0], #32
    st1 {v26.8h,v27.8h}, [x0], #32
    st1 {v28.8h,v29.8h}, [x0], #32
    st1 {v30.8h,v31.8h}, [x0]
    ret
endfunc

.macro zigzag_sub8x8 f
function x264_zigzag_sub_8x8_\f\()_neon, export=1
    movrel x4, sub8x8_\f
    mov x5, #FENC_STRIDE
    mov x6, #FDEC_STRIDE
    mov x7, x2
    ld1 {v0.d}[0], [x1], x5
    ld1 {v0.d}[1], [x1], x5
    ld1 {v1.d}[0], [x1], x5
    ld1 {v1.d}[1], [x1], x5
    ld1 {v2.d}[0], [x1], x5
    ld1 {v2.d}[1], [x1], x5
    ld1 {v3.d}[0], [x1], x5
    ld1 {v3.d}[1], [x1]
    ld1 {v4.d}[0], [x2], x6
    ld1 {v4.d}[1], [x2], x6
    ld1 {v5.d}[0], [x2], x6
    ld1 {v5.d}[1], [x2], x6
    ld1 {v6.d}[0], [x2], x6
    ld1 {v6.d}[1], [x2], x6
    ld1 {v7.d}[0], [x2], x6
    ld1 {v7.d}[1], [x2]
    ld1 {v16.16b,v17.16b}, [x4], #32
    ld1 {v18.16b,v19.16b}, [x4], #32
    tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
    tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
    tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
    tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
    tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
    tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
    tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
    usubl v4.8h, v24.8b, v28.8b
    usubl2 v5.8h, v24.16b, v28.16b
    usubl v6.8h, v25.8b, v29.8b
    usubl2 v7.8h, v25.16b, v29.16b
    usubl v16.8h, v26.8b, v30.8b
    usubl2 v17.8h, v26.16b, v30.16b
    usubl v18.8h, v27.8b, v31.8b
    usubl2 v19.8h, v27.16b, v31.16b
    umax v20.8h, v4.8h, v5.8h
    umax v21.8h, v6.8h, v7.8h
    umax v22.8h, v16.8h, v17.8h
    umax v23.8h, v18.8h, v19.8h
    umax v20.8h, v20.8h, v21.8h
    umax v21.8h, v22.8h, v23.8h
    umax v20.8h, v20.8h, v21.8h
    umaxv h22, v20.8h
    st1 {v0.d}[0], [x7], x6
    st1 {v0.d}[1], [x7], x6
    st1 {v1.d}[0], [x7], x6
    st1 {v1.d}[1], [x7], x6
    st1 {v2.d}[0], [x7], x6
    st1 {v2.d}[1], [x7], x6
    st1 {v3.d}[0], [x7], x6
    st1 {v3.d}[1], [x7]
    st1 {v4.8h,v5.8h}, [x0], #32
    st1 {v6.8h,v7.8h}, [x0], #32
    st1 {v16.8h,v17.8h}, [x0], #32
    st1 {v18.8h,v19.8h}, [x0]
    fmov w9, s22
    cmp w9, #0
    cset w0, ne
    ret
endfunc
.endm

zigzag_sub8x8 field
zigzag_sub8x8 frame

#undef T
#define T(x,y) Z(x*8+y)
const scan8x8_field, align=5
    .byte T(0,0), T(0,1), T(0,2), T(1,0)
    .byte T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6)
    .byte T(0,7), T(1,4), T(2,1), T(3,0)
#undef T
#define T(x,y) Z((x-1)*8+y)
    .byte T(2,2), T(1,5), T(1,6), T(1,7)
    .byte T(2,3), T(3,1), T(4,0), T(3,2)
#undef T
#define T(x,y) Z((x-2)*8+y)
    .byte T(2,4), T(2,5), T(2,6), T(2,7)
    .byte T(3,3), T(4,1), T(5,0), T(4,2)
#undef T
#define T(x,y) Z((x-3)*8+y)
    .byte T(3,4), T(3,5), T(3,6), T(3,7)
    .byte T(4,3), T(5,1), T(6,0), T(5,2)
#undef T
#define T(x,y) Z((x-4)*8+y)
    .byte T(4,4), T(4,5), T(4,6), T(4,7)
    .byte T(5,3), T(6,1), T(6,2), T(5,4)
#undef T
#define T(x,y) Z((x-5)*8+y)
    .byte T(5,5), T(5,6), T(5,7), T(6,3)
    .byte T(7,0), T(7,1), T(6,4), T(6,5)
endconst


#undef T
#define T(y,x) x*8+y
const sub8x8_frame, align=5
    .byte T(0,0), T(1,0), T(0,1), T(0,2)
    .byte T(1,1), T(2,0), T(3,0), T(2,1)
    .byte T(1,2), T(0,3), T(0,4), T(1,3)
    .byte T(2,2), T(3,1), T(4,0), T(5,0)
    .byte T(4,1), T(3,2), T(2,3), T(1,4)
    .byte T(0,5), T(0,6), T(1,5), T(2,4)
    .byte T(3,3), T(4,2), T(5,1), T(6,0)
    .byte T(7,0), T(6,1), T(5,2), T(4,3)
    .byte T(3,4), T(2,5), T(1,6), T(0,7)
    .byte T(1,7), T(2,6), T(3,5), T(4,4)
    .byte T(5,3), T(6,2), T(7,1), T(7,2)
    .byte T(6,3), T(5,4), T(4,5), T(3,6)
    .byte T(2,7), T(3,7), T(4,6), T(5,5)
    .byte T(6,4), T(7,3), T(7,4), T(6,5)
    .byte T(5,6), T(4,7), T(5,7), T(6,6)
    .byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst

const sub8x8_field, align=5
    .byte T(0,0), T(0,1), T(0,2), T(1,0)
    .byte T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6)
    .byte T(0,7), T(1,4), T(2,1), T(3,0)
    .byte T(2,2), T(1,5), T(1,6), T(1,7)
    .byte T(2,3), T(3,1), T(4,0), T(3,2)
    .byte T(2,4), T(2,5), T(2,6), T(2,7)
    .byte T(3,3), T(4,1), T(5,0), T(4,2)
    .byte T(3,4), T(3,5), T(3,6), T(3,7)
    .byte T(4,3), T(5,1), T(6,0), T(5,2)
    .byte T(4,4), T(4,5), T(4,6), T(4,7)
    .byte T(5,3), T(6,1), T(6,2), T(5,4)
    .byte T(5,5), T(5,6), T(5,7), T(6,3)
    .byte T(7,0), T(7,1), T(6,4), T(6,5)
    .byte T(6,6), T(6,7), T(7,2), T(7,3)
    .byte T(7,4), T(7,5), T(7,6), T(7,7)
endconst