/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <[email protected]>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

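	/*
	 * PMULL and PMULL2 carry out a 64x64->128 bit carryless
	 * (polynomial) multiplication on the low and high 64-bit lanes
	 * of their operands, respectively.  Throughout this file, a full
	 * 128x128 bit GF(2^128) multiply is assembled from three such
	 * multiplies using Karatsuba's identity: with a = a1:a0 and
	 * b = b1:b0,
	 *
	 *   a.b = (a1.b1 << 128) ^ (a0.b0)
	 *	    ^ ((a1^a0).(b1^b0) ^ a1.b1 ^ a0.b0) << 64
	 *
	 * which is why the code below computes the three products tagged
	 * 'a1 * b1', 'a0 * b0' and '(a1 + a0)(b1 + b0)', and later folds
	 * XH and XL into the middle term XM.
	 */
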
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

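	/*
	 * The __pmull_p8 paths above emulate a 64x64->128 bit polynomial
	 * multiply using only the 8x8->16 bit PMULL that baseline ARMv8
	 * provides.  The multiplicand is rotated by 1..3 bytes (A1..A3)
	 * and multiplied against B, while A is multiplied against the
	 * precomputed 1..4 byte rotations of B (B1..B4); the misaligned
	 * partial products are then masked, shifted into place and XORed
	 * together, following the well-known NEON vmull.p8 decomposition.
	 */
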
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

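	/*
	 * The key structure at x3 holds the hash key H followed by higher
	 * powers of H (loaded into HH, HH3 and HH4 above), which is what
	 * lets the p64 path fold four blocks per iteration.  SHASH2 and
	 * HH34 cache the XOR of the two 64-bit halves of each power, i.e.
	 * the (b1 ^ b0) operand of the Karatsuba middle multiply.
	 */
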
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

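	/*
	 * perm1..perm3 (and the fourth vector built in T1) are tbl index
	 * vectors that rotate each 64-bit lane of a 128-bit register left
	 * by 1..4 bytes: the 16-byte analogue of the ext-based rotations
	 * that __pmull_p8 applies to 8-byte values.
	 */
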
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

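	/*
	 * GHASH works in GF(2^128) modulo g(x) = x^128 + x^7 + x^2 + x + 1
	 * on bit-reflected operands.  MASK (0xe1 << 57, set up in
	 * __pmull_pre_p64) encodes the low-order terms of g(x) in that
	 * reflected representation, so the two PMULLs by MASK above fold
	 * the 256-bit Karatsuba result back into 128 bits.
	 */
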
	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

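	/*
	 * Here the multiplies by the reduction constant are replaced by
	 * shifts and XORs: the left shifts by 63, 62 and 57 bits (i.e.
	 * 64 - 1, 64 - 2 and 64 - 7) and the right shifts totalling 1, 2
	 * and 7 bits correspond to the x, x^2 and x^7 terms of g(x).
	 */
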
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

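	/*
	 * Flow of the update routine: an optional head block is hashed
	 * first; on the p64 path, blocks are then consumed four at a time
	 * (using the powers of H loaded by __pmull_pre_p64) once the
	 * remaining count is a multiple of 4, with a single reduction per
	 * group; any leading odd blocks, and the whole input on the p8
	 * path, take the one-block path at 2:/3:.
	 */
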
	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

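	/*
	 * Illustrative sketch only (not part of this file): the C glue is
	 * expected to pick one of the two entry points based on CPU
	 * features and to call it with the NEON unit claimed.
	 * 'have_pmull' is a stand-in for whatever feature test the glue
	 * actually uses.
	 *
	 *	static void ghash_do_update(int blocks, u64 dg[2],
	 *				    const char *src,
	 *				    struct ghash_key *key,
	 *				    const char *head)
	 *	{
	 *		kernel_neon_begin();
	 *		if (have_pmull)
	 *			pmull_ghash_update_p64(blocks, dg, src,
	 *					       key, head);
	 *		else
	 *			pmull_ghash_update_p8(blocks, dg, src,
	 *					      key, head);
	 *		kernel_neon_end();
	 *	}
	 */
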
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

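	/*
	 * Each AES round key is 16 bytes, so \rk + (\rounds << 4) points
	 * at the final round key.  Backing up 32 bytes from there makes
	 * KK, KL and KM always hold the last three round keys, whatever
	 * the key length; K0-K5 hold the first six.
	 */
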
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

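	/*
	 * The round count is 10, 12 or 14 for AES-128/192/256, so bit 2
	 * distinguishes AES-128 (clear) from the longer key sizes, and
	 * bit 1 then separates AES-192 (clear) from AES-256 (set).  The
	 * final round is aese without aesmc, followed by an eor with the
	 * last round key, since AES omits MixColumns in its last round.
	 */
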
	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f			// tag only?

	ldr		w8, [x5, #12]		// load lower counter
CPU_LE(	rev		w8, w8	)

0:	mov		w9, #4			// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4		// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f			// output tag?

	ld1		{INP3.16b}, [x10]	// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)	// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]		// store tag
	.else
	ldp		x11, x12, [sp, #40]	// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]	// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]	// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b	// compare tags
	mvn		XL.16b, XL.16b		// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
	sminv		b0, XL.16b		// signed minimum across XL
	smov		w0, v0.b[0]		// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8	)
	str		w8, [x5, #12]		// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

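	/*
	 * Structure of the combined GCM routine above: the round keys and
	 * the powers of H are loaded once; the main loop at 0: processes
	 * up to four blocks per iteration (GHASH before encryption when
	 * decrypting, after it when encrypting); partial trailing blocks
	 * are assembled and stored via .Lpermute_table; the tail at 3:
	 * hashes the lengths block, encrypts the initial counter block to
	 * produce the tag, and on decryption compares it against the
	 * supplied tag in constant time using cmeq/sminv.
	 */
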
	/*
	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[],
	 *			  u8 ctr[], u32 const rk[], int rounds,
	 *			  u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[],
	 *			  u8 ctr[], u32 const rk[], int rounds,
	 *			  u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

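	/*
	 * Illustrative sketch only (not part of this file): per AAPCS64,
	 * the first eight arguments above arrive in x0-x7 and the tag
	 * pointer is passed on the stack.  A hypothetical caller:
	 *
	 *	kernel_neon_begin();
	 *	pmull_gcm_encrypt(nbytes, dst, src, key, dg, ctr, rk,
	 *			  nrounds, tag);
	 *	kernel_neon_end();
	 *
	 * pmull_gcm_decrypt() additionally returns the result of the
	 * constant-time tag comparison in w0 (see 'return b0' above).
	 */
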
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f		// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f		// 2 blocks?
	tbz		w9, #1, 2f		// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

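	/*
	 * Aggregated GHASH: the running digest is folded into the oldest
	 * block, each block is multiplied by the matching power of H (HH4
	 * down to SHASH), and the partial products are summed so that a
	 * single reduction covers up to four blocks.  w9 holds the number
	 * of blocks (1-4) that actually take part.
	 */
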
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

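	/*
	 * CTR keystream generation: the caller pre-advanced w8 past this
	 * group of blocks, so the four counter values are derived by
	 * stepping back 4..1 from it, byte-swapped to the big-endian
	 * format GCM uses, and inserted into lane 3 (the last word of the
	 * counter block) of each copy of the IV before the four blocks
	 * are encrypted in parallel and XORed into INP0-INP3.
	 */
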
	.section	".rodata", "a"
	.align		6
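	/*
	 * 64-byte table made of two identical 32-byte halves, each 16
	 * bytes of 0xff followed by the byte indices 0x0-0xf.  Loading a
	 * 16-byte window at a variable offset yields a tbl/tbx vector
	 * that shifts data into place or masks it off (out-of-range 0xff
	 * indices produce zero bytes), which is how the partial blocks
	 * and truncated tags above are handled.
	 */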
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous