/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <[email protected]>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

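	/*
	 * PMULL and PMULL2 carry out a 64x64->128 bit carryless
	 * (polynomial) multiplication on the low and high 64-bit lanes
	 * of their operands, respectively.  Throughout this file, a full
	 * 128x128 bit GF(2^128) multiply is assembled from three such
	 * multiplies using Karatsuba's identity: with a = a1:a0 and
	 * b = b1:b0,
	 *
	 *   a.b = (a1.b1 << 128) ^ (a0.b0)
	 *	    ^ ((a1^a0).(b1^b0) ^ a1.b1 ^ a0.b0) << 64
	 *
	 * which is why the code below computes the three products tagged
	 * 'a1 * b1', 'a0 * b0' and '(a1 + a0)(b1 + b0)', and later folds
	 * XH and XL into the middle term XM.
	 */
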
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

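	/*
	 * The __pmull_p8 paths above emulate a 64x64->128 bit polynomial
	 * multiply using only the 8x8->16 bit PMULL that baseline ARMv8
	 * provides.  The multiplicand is rotated by 1..3 bytes (A1..A3)
	 * and multiplied against B, while A is multiplied against the
	 * precomputed 1..4 byte rotations of B (B1..B4); the misaligned
	 * partial products are then masked, shifted into place and XORed
	 * together, following the well-known NEON vmull.p8 decomposition.
	 */
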
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

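	/*
	 * The key structure at x3 holds the hash key H followed by higher
	 * powers of H (loaded into HH, HH3 and HH4 above), which is what
	 * lets the p64 path fold four blocks per iteration.  SHASH2 and
	 * HH34 cache the XOR of the two 64-bit halves of each power, i.e.
	 * the (b1 ^ b0) operand of the Karatsuba middle multiply.
	 */
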
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

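	/*
	 * perm1..perm3 (and the fourth vector built in T1) are tbl index
	 * vectors that rotate each 64-bit lane of a 128-bit register left
	 * by 1..4 bytes: the 16-byte analogue of the ext-based rotations
	 * that __pmull_p8 applies to 8-byte values.
	 */
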
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

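	/*
	 * GHASH works in GF(2^128) modulo g(x) = x^128 + x^7 + x^2 + x + 1
	 * on bit-reflected operands.  MASK (0xe1 << 57, set up in
	 * __pmull_pre_p64) encodes the low-order terms of g(x) in that
	 * reflected representation, so the two PMULLs by MASK above fold
	 * the 256-bit Karatsuba result back into 128 bits.
	 */
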
	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

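	/*
	 * Here the multiplies by the reduction constant are replaced by
	 * shifts and XORs: the left shifts by 63, 62 and 57 bits (i.e.
	 * 64 - 1, 64 - 2 and 64 - 7) and the right shifts totalling 1, 2
	 * and 7 bits correspond to the x, x^2 and x^7 terms of g(x).
	 */
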
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

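	/*
	 * Flow of the update routine: an optional head block is hashed
	 * first; on the p64 path, blocks are then consumed four at a time
	 * (using the powers of H loaded by __pmull_pre_p64) once the
	 * remaining count is a multiple of 4, with a single reduction per
	 * group; any leading odd blocks, and the whole input on the p8
	 * path, take the one-block path at 2:/3:.
	 */
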
	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

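	/*
	 * Illustrative sketch only (not part of this file): the C glue is
	 * expected to pick one of the two entry points based on CPU
	 * features and to call it with the NEON unit claimed.
	 * 'have_pmull' is a stand-in for whatever feature test the glue
	 * actually uses.
	 *
	 *	static void ghash_do_update(int blocks, u64 dg[2],
	 *				    const char *src,
	 *				    struct ghash_key *key,
	 *				    const char *head)
	 *	{
	 *		kernel_neon_begin();
	 *		if (have_pmull)
	 *			pmull_ghash_update_p64(blocks, dg, src,
	 *					       key, head);
	 *		else
	 *			pmull_ghash_update_p8(blocks, dg, src,
	 *					      key, head);
	 *		kernel_neon_end();
	 *	}
	 */
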
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

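	/*
	 * Each AES round key is 16 bytes, so \rk + (\rounds << 4) points
	 * at the final round key.  Backing up 32 bytes from there makes
	 * KK, KL and KM always hold the last three round keys, whatever
	 * the key length; K0-K5 hold the first six.
	 */
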
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

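	/*
	 * The round count is 10, 12 or 14 for AES-128/192/256, so bit 2
	 * distinguishes AES-128 (clear) from the longer key sizes, and
	 * bit 1 then separates AES-192 (clear) from AES-256 (set).  The
	 * final round is aese without aesmc, followed by an eor with the
	 * last round key, since AES omits MixColumns in its last round.
	 */
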
	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f			// tag only?

	ldr		w8, [x5, #12]		// load lower counter
CPU_LE(	rev		w8, w8	)

0:	mov		w9, #4			// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4		// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f			// output tag?

	ld1		{INP3.16b}, [x10]	// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)	// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]		// store tag
	.else
	ldp		x11, x12, [sp, #40]	// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]	// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]	// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b	// compare tags
	mvn		XL.16b, XL.16b		// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
	sminv		b0, XL.16b		// signed minimum across XL
	smov		w0, v0.b[0]		// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8	)
	str		w8, [x5, #12]		// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

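	/*
	 * Structure of the combined GCM routine above: the round keys and
	 * the powers of H are loaded once; the main loop at 0: processes
	 * up to four blocks per iteration (GHASH before encryption when
	 * decrypting, after it when encrypting); partial trailing blocks
	 * are assembled and stored via .Lpermute_table; the tail at 3:
	 * hashes the lengths block, encrypts the initial counter block to
	 * produce the tag, and on decryption compares it against the
	 * supplied tag in constant time using cmeq/sminv.
	 */
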
	/*
	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[],
	 *			  u8 ctr[], u32 const rk[], int rounds,
	 *			  u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[],
	 *			  u8 ctr[], u32 const rk[], int rounds,
	 *			  u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

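	/*
	 * Illustrative sketch only (not part of this file): per AAPCS64,
	 * the first eight arguments above arrive in x0-x7 and the tag
	 * pointer is passed on the stack.  A hypothetical caller:
	 *
	 *	kernel_neon_begin();
	 *	pmull_gcm_encrypt(nbytes, dst, src, key, dg, ctr, rk,
	 *			  nrounds, tag);
	 *	kernel_neon_end();
	 *
	 * pmull_gcm_decrypt() additionally returns the result of the
	 * constant-time tag comparison in w0 (see 'return b0' above).
	 */
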
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f		// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f		// 2 blocks?
	tbz		w9, #1, 2f		// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

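	/*
	 * Aggregated GHASH: the running digest is folded into the oldest
	 * block, each block is multiplied by the matching power of H (HH4
	 * down to SHASH), and the partial products are summed so that a
	 * single reduction covers up to four blocks.  w9 holds the number
	 * of blocks (1-4) that actually take part.
	 */
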
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

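	/*
	 * CTR keystream generation: the caller pre-advanced w8 past this
	 * group of blocks, so the four counter values are derived by
	 * stepping back 4..1 from it, byte-swapped to the big-endian
	 * format GCM uses, and inserted into lane 3 (the last word of the
	 * counter block) of each copy of the IV before the four blocks
	 * are encrypted in parallel and XORed into INP0-INP3.
	 */
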
	.section	".rodata", "a"
	.align		6
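	/*
	 * 64-byte table made of two identical 32-byte halves, each 16
	 * bytes of 0xff followed by the byte indices 0x0-0xf.  Loading a
	 * 16-byte window at a variable offset yields a tbl/tbx vector
	 * that shifts data into place or masks it off (out-of-range 0xff
	 * indices produce zero bytes), which is how the partial blocks
	 * and truncated tags above are handled.
	 */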
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous