CoCalc -- aes-neonbs-core.S

GitHub Repository: torvalds/linux
Path: blob/master/arch/arm/crypto/aes-neonbs-core.S
²⁹²⁶⁶ views
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		neon

	/*
	 * Scalar aliases used by the cipher cores: aesbs_encrypt8 and
	 * aesbs_decrypt8 count the remaining rounds in 'rounds' and walk the
	 * converted key schedule through 'bskey'.
	 */
	rounds		.req	ip
	bskey		.req	r4

	/*
	 * qNl / qNh name the low / high 64-bit D half of qN (d(2N) / d(2N+1)).
	 * Macros build these names by pasting an 'l' or 'h' suffix onto a Q
	 * register argument (see __tbl, __ldr and next_ctr).
	 */
	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15
	q8l		.req	d16
	q8h		.req	d17
	q9l		.req	d18
	q9h		.req	d19
	q10l		.req	d20
	q10h		.req	d21
	q11l		.req	d22
	q11h		.req	d23
	q12l		.req	d24
	q12h		.req	d25
	q13l		.req	d26
	q13h		.req	d27
	q14l		.req	d28
	q14h		.req	d29
	q15l		.req	d30
	q15h		.req	d31

	/*
	 * __tbl out, tbl, in[, tmp]
	 *
	 * Permute the 16 bytes of Q register \tbl with the byte indices held
	 * in Q register \in and write the result to Q register \out.  VTBL
	 * produces one D register at a time, so the low and high halves of
	 * \out are looked up separately, each against the whole of \tbl.
	 *
	 * If \out and \tbl are the same register, the first lookup would
	 * overwrite half of the table.  In that case the table is first copied
	 * to \tmp and the second lookup reads the copy; omitting \tmp then is
	 * an assembly-time error.
	 */
	.macro		__tbl, out, tbl, in, tmp
	.ifc		\out, \tbl
	.ifb		\tmp
	.error		__tbl needs temp register if out == tbl
	.endif
	vmov		\tmp, \out
	.endif
	vtbl.8		\out\()l, {\tbl}, \in\()l
	.ifc		\out, \tbl
	vtbl.8		\out\()h, {\tmp}, \in\()h
	.else
	vtbl.8		\out\()h, {\tbl}, \in\()h
	.endif
	.endm

	/*
	 * __ldr out, sym
	 *
	 * Load the 16-byte constant at label \sym into Q register \out as two
	 * 64-bit VLDR loads: \sym into the low half, \sym + 8 into the high
	 * half.
	 */
	.macro		__ldr, out, sym
	vldr		\out\()l, \sym
	vldr		\out\()h, \sym + 8
	.endm

	/*
	 * in_bs_ch b0-b7
	 *
	 * In-place, XOR-only transform of the eight registers \b0-\b7; no
	 * scratch registers are needed.  It is the first stage of the sbox
	 * macro and feeds inv_gf256 (presumably the input "basis change" of
	 * the bit-sliced S-box circuit).
	 */
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	veor		\b2, \b2, \b1
	veor		\b5, \b5, \b6
	veor		\b3, \b3, \b0
	veor		\b6, \b6, \b2
	veor		\b5, \b5, \b0
	veor		\b6, \b6, \b3
	veor		\b3, \b3, \b7
	veor		\b7, \b7, \b5
	veor		\b3, \b3, \b4
	veor		\b4, \b4, \b5
	veor		\b2, \b2, \b7
	veor		\b3, \b3, \b1
	veor		\b1, \b1, \b5
	.endm

	/*
	 * out_bs_ch b0-b7
	 *
	 * In-place, XOR-only transform of the eight registers \b0-\b7; no
	 * scratch registers are needed.  It is the last stage of the sbox
	 * macro, applied to the output of inv_gf256 (presumably the output
	 * "basis change" of the bit-sliced S-box circuit).
	 */
	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	veor		\b0, \b0, \b6
	veor		\b1, \b1, \b4
	veor		\b4, \b4, \b6
	veor		\b2, \b2, \b0
	veor		\b6, \b6, \b1
	veor		\b1, \b1, \b5
	veor		\b5, \b5, \b3
	veor		\b3, \b3, \b7
	veor		\b7, \b7, \b5
	veor		\b2, \b2, \b5
	veor		\b4, \b4, \b7
	.endm

	/*
	 * inv_in_bs_ch b6, b1, b2, b4, b7, b0, b3, b5
	 *
	 * In-place, XOR-only transform used as the first stage of inv_sbox.
	 * Note that the parameter names are deliberately not in numeric
	 * order: the Nth argument is bound to the name listed Nth above.
	 */
	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	veor		\b1, \b1, \b7
	veor		\b4, \b4, \b7
	veor		\b7, \b7, \b5
	veor		\b1, \b1, \b3
	veor		\b2, \b2, \b5
	veor		\b3, \b3, \b7
	veor		\b6, \b6, \b1
	veor		\b2, \b2, \b0
	veor		\b5, \b5, \b3
	veor		\b4, \b4, \b6
	veor		\b0, \b0, \b6
	veor		\b1, \b1, \b4
	.endm

	/*
	 * inv_out_bs_ch b6, b5, b0, b3, b7, b1, b4, b2
	 *
	 * In-place, XOR-only transform used as the last stage of inv_sbox.
	 * As with inv_in_bs_ch, the parameter names are deliberately not in
	 * numeric order.
	 */
	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	veor		\b1, \b1, \b5
	veor		\b2, \b2, \b7
	veor		\b3, \b3, \b1
	veor		\b4, \b4, \b5
	veor		\b7, \b7, \b5
	veor		\b3, \b3, \b4
	veor		\b5, \b5, \b0
	veor		\b3, \b3, \b7
	veor		\b6, \b6, \b2
	veor		\b2, \b2, \b1
	veor		\b6, \b6, \b3
	veor		\b3, \b3, \b0
	veor		\b5, \b5, \b6
	.endm

	/*
	 * mul_gf4 x0, x1, y0, y1, t0, t1
	 *
	 * Bitwise two-bit product (a GF(2^2) multiply, going by the name) of
	 * (x0, x1) by (y0, y1), computed in place:
	 *
	 *	x0' = ((x0 ^ x1) & y1) ^ (x1 & y0)
	 *	x1' = (x1 & y0) ^ ((y0 ^ y1) & x0)
	 *
	 * \y0 and \y1 are preserved; \t0 and \t1 are clobbered.
	 */
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	veor		\t0, \y0, \y1
	vand		\t0, \t0, \x0
	veor		\x0, \x0, \x1
	vand		\t1, \x1, \y0
	vand		\x0, \x0, \y1
	veor		\x1, \t1, \t0
	veor		\x0, \x0, \t1
	.endm

	/*
	 * mul_gf4_n_gf4 x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	 *
	 * Two interleaved two-bit products, both computed in place:
	 *
	 *	x0' = ((x0 ^ x1) & y1) ^ ((y0 ^ y1) & x0)
	 *	x1' = (x1 & y0) ^ ((x0 ^ x1) & y1)
	 *
	 *	x2' = ((x2 ^ x3) & y3) ^ (x3 & y2)	(same form as mul_gf4)
	 *	x3' = (x3 & y2) ^ ((y2 ^ y3) & x2)	(same form as mul_gf4)
	 *
	 * The first product uses a different formula from mul_gf4; that is
	 * presumably the "_n" variant the name refers to.  The y registers are
	 * preserved; \t0 and \t1 are clobbered.
	 */
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	veor		\t0, \y0, \y1
	veor		\t1, \y2, \y3
	vand		\t0, \t0, \x0
	vand		\t1, \t1, \x2
	veor		\x0, \x0, \x1
	veor		\x2, \x2, \x3
	vand		\x1, \x1, \y0
	vand		\x3, \x3, \y2
	vand		\x0, \x0, \y1
	vand		\x2, \x2, \y3
	veor		\x1, \x1, \x0
	veor		\x2, \x2, \x3
	veor		\x0, \x0, \t0
	veor		\x3, \x3, \t1
	.endm

	/*
	 * mul_gf16_2 x0-x7, y0-y3, t0-t3
	 *
	 * Multiply two four-bit values, (x0..x3) and (x4..x7), by the same
	 * four-bit multiplier (y0..y3), in place (GF(2^4) products, going by
	 * the name).  Each product is assembled from mul_gf4 and
	 * mul_gf4_n_gf4.
	 *
	 * \y0 and \y1 are temporarily replaced by y0 ^ y2 and y1 ^ y3 and are
	 * restored by the second pair of XORs, so all y registers hold their
	 * original values on exit.  \t0-\t3 are clobbered.
	 */
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	veor		\t0, \x0, \x2
	veor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	veor		\y0, \y0, \y2
	veor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	veor		\x0, \x0, \t0
	veor		\x2, \x2, \t0
	veor		\x1, \x1, \t1
	veor		\x3, \x3, \t1
	veor		\t0, \x4, \x6
	veor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	veor		\y0, \y0, \y2		// restore original y0
	veor		\y1, \y1, \y3		// restore original y1
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	veor		\x4, \x4, \t0
	veor		\x6, \x6, \t0
	veor		\x5, \x5, \t1
	veor		\x7, \x7, \t1
	.endm

	/*
	 * inv_gf256 x0-x7, t0-t3, s0-s3
	 *
	 * Non-linear core shared by sbox and inv_sbox (the GF(2^8) inversion,
	 * going by the name).  It works in place on \x0-\x7.
	 *
	 * The straight-line part only reads the x registers and builds four
	 * intermediate values in the scratch registers.  Those four values
	 * are then passed as the common multiplier to mul_gf16_2, which
	 * updates both halves (x0..x3 and x4..x7) of the input.  All eight
	 * scratch registers \t0-\t3 and \s0-\s3 are clobbered.
	 */
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	veor		\t3, \x4, \x6
	veor		\t0, \x5, \x7
	veor		\t1, \x1, \x3
	veor		\s1, \x7, \x6
	veor		\s0, \x0, \x2
	veor		\s3, \t3, \t0
	vorr		\t2, \t0, \t1
	vand		\s2, \t3, \s0
	vorr		\t3, \t3, \s0
	veor		\s0, \s0, \t1
	vand		\t0, \t0, \t1
	veor		\t1, \x3, \x2
	vand		\s3, \s3, \s0
	vand		\s1, \s1, \t1
	veor		\t1, \x4, \x5
	veor		\s0, \x1, \x0
	veor		\t3, \t3, \s1
	veor		\t2, \t2, \s1
	vand		\s1, \t1, \s0
	vorr		\t1, \t1, \s0
	veor		\t3, \t3, \s3
	veor		\t0, \t0, \s1
	veor		\t2, \t2, \s2
	veor		\t1, \t1, \s3
	veor		\t0, \t0, \s2
	vand		\s0, \x7, \x3
	veor		\t1, \t1, \s2
	vand		\s1, \x6, \x2
	vand		\s2, \x5, \x1
	vorr		\s3, \x4, \x0
	veor		\t3, \t3, \s0
	veor		\t1, \t1, \s2
	veor		\s0, \t0, \s3
	veor		\t2, \t2, \s1
	vand		\s2, \t3, \t1
	veor		\s1, \t2, \s2
	veor		\s3, \s0, \s2
	vbsl		\s1, \t1, \s0
	vmvn		\t0, \s0
	vbsl		\s0, \s1, \s3
	vbsl		\t0, \s1, \s3
	vbsl		\s3, \t3, \t2
	veor		\t3, \t3, \t2
	vand		\s2, \s0, \s3
	veor		\t1, \t1, \t0
	veor		\s2, \s2, \t3
	// multiplier = (s3, s2, s1, t1); s0, t0, t2, t3 become scratch
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

	/*
	 * sbox b0-b7, t0-t3, s0-s3
	 *
	 * Bit-sliced S-box on the eight slices \b0-\b7, in place, built from
	 * in_bs_ch, inv_gf256 and out_bs_ch.  The stages are chained through
	 * register permutations rather than moves, so the results do not end
	 * up in the input order: aesbs_encrypt8 continues with the slices in
	 * the order b0, b1, b4, b6, b3, b7, b2, b5.
	 * \t0-\t3 and \s0-\s3 are clobbered.
	 */
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
	.endm

	/*
	 * inv_sbox b0-b7, t0-t3, s0-s3
	 *
	 * Bit-sliced inverse S-box on the eight slices \b0-\b7, in place,
	 * built from inv_in_bs_ch, inv_gf256 and inv_out_bs_ch.  As with sbox,
	 * the results end up permuted: aesbs_decrypt8 continues with the
	 * slices in the order b0, b1, b6, b4, b2, b7, b3, b5.
	 * \t0-\t3 and \s0-\s3 are clobbered.
	 */
	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.endm

	/*
	 * shift_rows x0-x7, t0-t3, mask
	 *
	 * Combined round-key addition and byte permutation for encryption:
	 * for each slice, load the matching 16 bytes of the bit-sliced round
	 * key from [bskey], XOR them with \xN and permute the bytes of the
	 * result with the index vector \mask (via __tbl), leaving the result
	 * in \xN.
	 *
	 * bskey is advanced by 128 bytes (one bit-sliced round key) and must
	 * be 32-byte aligned (:256 alignment hint).  \t0-\t3 are clobbered.
	 * The loads are interleaved with the arithmetic.
	 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				    t0, t1, t2, t3, mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t0, \t0, \x0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t1, \t1, \x1
	__tbl		\x0, \t0, \mask
	veor		\t2, \t2, \x2
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t3, \t3, \x3
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t0, \t0, \x4
	veor		\t1, \t1, \x5
	__tbl		\x4, \t0, \mask
	veor		\t2, \t2, \x6
	__tbl		\x5, \t1, \mask
	veor		\t3, \t3, \x7
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.endm

	/*
	 * inv_shift_rows x0-x7, t0-t3, mask
	 *
	 * Permute the bytes of each slice \x0-\x7 in place with the index
	 * vector \mask.  Unlike shift_rows, no round key is added here; on the
	 * decryption path that is done by inv_mix_cols.  Since source and
	 * destination are the same register, __tbl needs the temporaries
	 * \t0-\t3, which are clobbered.
	 */
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
					t0, t1, t2, t3, mask
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.endm

	/*
	 * mix_cols x0-x7, t0-t7[, inv]
	 *
	 * Bit-sliced MixColumns on the slices \x0-\x7, using only byte
	 * rotations of each 128-bit slice (vext.8 by 12 and by 8 bytes) and
	 * XORs.  \t0-\t7 are clobbered.
	 *
	 * The final XORs also reshuffle the slices: most results are written
	 * to a different register from the one that held the corresponding
	 * input (e.g. \x7 receives the value derived from \x5).  Which
	 * shuffle is applied depends on \inv:
	 *   - blank:     forward direction, used directly by aesbs_encrypt8;
	 *   - non-blank: used by inv_mix_cols as its final step.
	 *
	 * No scalar instruction is involved, so the condition flags survive
	 * the macro; aesbs_encrypt8 relies on that.
	 */
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	veor		\x0, \x0, \t0
	vext.8		\t2, \x2, \x2, #12
	veor		\x1, \x1, \t1
	vext.8		\t3, \x3, \x3, #12
	veor		\x2, \x2, \t2
	vext.8		\t4, \x4, \x4, #12
	veor		\x3, \x3, \t3
	vext.8		\t5, \x5, \x5, #12
	veor		\x4, \x4, \t4
	vext.8		\t6, \x6, \x6, #12
	veor		\x5, \x5, \t5
	vext.8		\t7, \x7, \x7, #12
	veor		\x6, \x6, \t6
	veor		\t1, \t1, \x0
	veor.8		\x7, \x7, \t7
	vext.8		\x0, \x0, \x0, #8
	veor		\t2, \t2, \x1
	veor		\t0, \t0, \x7
	veor		\t1, \t1, \x7
	vext.8		\x1, \x1, \x1, #8
	veor		\t5, \t5, \x4
	veor		\x0, \x0, \t0
	veor		\t6, \t6, \x5
	veor		\x1, \x1, \t1
	vext.8		\t0, \x4, \x4, #8
	veor		\t4, \t4, \x3
	vext.8		\t1, \x5, \x5, #8
	veor		\t7, \t7, \x6
	vext.8		\x4, \x3, \x3, #8
	veor		\t3, \t3, \x2
	vext.8		\x5, \x7, \x7, #8
	veor		\t4, \t4, \x7
	vext.8		\x3, \x6, \x6, #8
	veor		\t3, \t3, \x7
	vext.8		\x6, \x2, \x2, #8
	veor		\x7, \t1, \t5
	.ifb		\inv
	// forward direction
	veor		\x2, \t0, \t4
	veor		\x4, \x4, \t3
	veor		\x5, \x5, \t7
	veor		\x3, \x3, \t6
	veor		\x6, \x6, \t2
	.else
	// called from inv_mix_cols: different output placement
	veor		\t3, \t3, \x4
	veor		\x5, \x5, \t7
	veor		\x2, \x3, \t6
	veor		\x3, \t0, \t4
	veor		\x4, \x6, \t2
	vmov		\x6, \t3
	.endif
	.endm

	/*
	 * inv_mix_cols x0-x7, t0-t7
	 *
	 * Decryption counterpart of shift_rows + mix_cols:
	 *   1. load the 128-byte bit-sliced round key at [bskey] and XOR it
	 *      into \x0-\x7;
	 *   2. apply a pre-transform made of 8-byte rotations (vext.8 #8) and
	 *      XORs;
	 *   3. finish with mix_cols in its 'inv' flavour.
	 *
	 * bskey must be 32-byte aligned (:256 alignment hint).  The first
	 * three loads post-increment it by 32 each (96 in total) and the
	 * explicit subtraction of 224 leaves it 128 bytes below its entry
	 * value, i.e. the key schedule is walked backwards one round key per
	 * invocation.
	 * \t0-\t7 are clobbered.  Only NEON instructions and a flag-preserving
	 * SUB are used, so the condition flags survive the macro;
	 * aesbs_decrypt8 relies on that.
	 */
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\x0, \x0, \t0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\x1, \x1, \t1
	vld1.8		{\t4-\t5}, [bskey, :256]!
	veor		\x2, \x2, \t2
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224	// net effect: bskey -= 128
	veor		\x3, \x3, \t3
	veor		\x4, \x4, \t4
	veor		\x5, \x5, \t5
	veor		\x6, \x6, \t6
	veor		\x7, \x7, \t7
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	veor		\t0, \t0, \x0
	vext.8		\t1, \x1, \x1, #8
	veor		\t6, \t6, \x6
	vext.8		\t2, \x2, \x2, #8
	veor		\t7, \t7, \x7
	vext.8		\t3, \x3, \x3, #8
	veor		\t1, \t1, \x1
	vext.8		\t4, \x4, \x4, #8
	veor		\t2, \t2, \x2
	vext.8		\t5, \x5, \x5, #8
	veor		\t3, \t3, \x3
	veor		\t4, \t4, \x4
	veor		\t5, \t5, \x5
	veor		\x0, \x0, \t6
	veor		\x1, \x1, \t6
	veor		\x2, \x2, \t0
	veor		\x4, \x4, \t2
	veor		\x3, \x3, \t1
	veor		\x1, \x1, \t7
	veor		\x2, \x2, \t7
	veor		\x4, \x4, \t6
	veor		\x5, \x5, \t3
	veor		\x3, \x3, \t6
	veor		\x6, \x6, \t4
	veor		\x4, \x4, \t7
	veor		\x5, \x5, \t7
	veor		\x7, \x7, \t5
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

	/*
	 * swapmove_2x a0, b0, a1, b1, n, mask, t0, t1
	 *
	 * Two interleaved "swapmove" steps.  For each pair (a, b):
	 *
	 *	t  = ((b >> n) ^ a) & mask
	 *	a ^= t
	 *	b ^= t << n
	 *
	 * i.e. the bits of a selected by \mask are exchanged with the bits of
	 * b selected by (\mask << n).  The shifts operate on 64-bit lanes.
	 * \n is an immediate shift count; \t0 and \t1 are clobbered.
	 */
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	veor		\t0, \t0, \a0
	veor		\t1, \t1, \a1
	vand		\t0, \t0, \mask
	vand		\t1, \t1, \mask
	veor		\a0, \a0, \t0
	vshl.s64	\t0, \t0, #\n
	veor		\a1, \a1, \t1
	vshl.s64	\t1, \t1, #\n
	veor		\b0, \b0, \t0
	veor		\b1, \b1, \t1
	.endm

	/*
	 * bitslice x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	 *
	 * Convert eight registers between the normal and the bit-sliced
	 * representation, in place, using three rounds of swapmove_2x with
	 * shift/mask pairs 1/0x55, 2/0x33 and 4/0x0f.  The same macro is used
	 * for both directions (on entry to and exit from aesbs_encrypt8 and
	 * aesbs_decrypt8).
	 *
	 * Note that the parameters are named in descending order: the first
	 * argument is x7.  \t0-\t3 are clobbered (\t0/\t1 hold the masks,
	 * \t2/\t3 are swapmove temporaries).
	 */
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	vmov.i8		\t0, #0x55
	vmov.i8		\t1, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	vmov.i8		\t0, #0x0f		// 0x55 no longer needed
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm

	.align		4
	// byte permutation applied to every inner round key before slicing
M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 *
	 * Convert an expanded AES key schedule into the layout consumed by
	 * aesbs_encrypt8 / aesbs_decrypt8.
	 *
	 * In:  r0 = out (16-byte aligned for the first and last store, and
	 *      such that the 128-byte groups are 32-byte aligned),
	 *      r1 = rk, r2 = rounds.
	 *
	 * Output layout:
	 *   - round 0 key, unmodified				(16 bytes)
	 *   - for each of the rounds - 1 inner round keys: eight 16-byte
	 *     vectors, vector i being 0xff in every byte position where bit i
	 *     of the (M0-permuted) key byte is set, with the vectors for bits
	 *     0, 1, 5 and 6 inverted				(128 bytes each)
	 *   - last round key XORed with 0x63 in every byte	(16 bytes)
	 *
	 * Bits 0, 1, 5 and 6 are exactly the bits set in 0x63.
	 *
	 * Clobbers r0-r2, flags and q0-q15 (no NEON register is saved here).
	 */
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8,  #0x01		// bit masks
	vmov.i8		q9,  #0x02
	vmov.i8		q10, #0x04
	vmov.i8		q11, #0x08
	vmov.i8		q12, #0x10
	vmov.i8		q13, #0x20
	__ldr		q14, M0

	sub		r2, r2, #1		// rounds - 1 keys get bit-sliced
	vst1.8		{q7}, [r0, :128]!	// save round 0 key

.Lkey_loop:
	__tbl		q7, q15, q14		// q7 = permuted current round key
	vmov.i8		q6, #0x40		// masks 0x40/0x80 are rebuilt on
	vmov.i8		q15, #0x80		// each pass: q6/q15 get reused

	vtst.8		q0, q7, q8
	vtst.8		q1, q7, q9
	vtst.8		q2, q7, q10
	vtst.8		q3, q7, q11
	vtst.8		q4, q7, q12
	vtst.8		q5, q7, q13
	vtst.8		q6, q7, q6
	vtst.8		q7, q7, q15
	vld1.32		{q15}, [r1]!		// load next round key
	vmvn		q0, q0			// invert slices 0, 1, 5, 6
	vmvn		q1, q1
	vmvn		q5, q5
	vmvn		q6, q6

	subs		r2, r2, #1
	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!
	bne		.Lkey_loop

	vmov.i8		q7, #0x63		// compose .L63
	veor		q15, q15, q7		// q15 = last round key here
	vst1.8		{q15}, [r0, :128]
	bx		lr
ENDPROC(aesbs_convert_key)

	.align		4
	// byte permutation applied to the input blocks before bit-slicing
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01

	/*
	 * aesbs_encrypt8 - encrypt eight blocks in parallel (internal helper,
	 * not a C-callable function).
	 *
	 * In:  q0-q7  = eight input blocks
	 *      bskey  = key schedule in aesbs_convert_key format
	 *      rounds = number of AES rounds
	 * Out: the results for input blocks 0-7 are returned in
	 *      q0, q1, q4, q6, q3, q7, q2, q5 (in that order).
	 * Clobbers q8-q15, bskey, rounds and the flags.
	 *
	 * Flow: XOR with the round 0 key and permute by M0SR, bit-slice, then
	 * per round sbox -> mix_cols -> shift_rows (which also adds the next
	 * round key).  The loop is entered at .Lenc_sbox.  The last pass
	 * through shift_rows uses SRM0 instead of SR as its permutation, and
	 * the last round skips mix_cols.
	 */
aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key
	__ldr		q8, M0SR

	veor		q10, q0, q9		// xor with round0 key
	veor		q11, q1, q9
	__tbl		q0, q10, q8
	veor		q12, q2, q9
	__tbl		q1, q11, q8
	veor		q13, q3, q9
	__tbl		q2, q12, q8
	veor		q14, q4, q9
	__tbl		q3, q13, q8
	veor		q15, q5, q9
	__tbl		q4, q14, q8
	veor		q10, q6, q9
	__tbl		q5, q15, q8
	veor		q11, q7, q9
	__tbl		q6, q10, q8
	__tbl		q7, q11, q8

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
	b		.Lenc_sbox

	// permutation constants; the branch above jumps over them
	.align		5
SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d

.Lenc_last:
	__ldr		q12, SRM0		// permutation for the final pass
.Lenc_loop:
	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
.Lenc_sbox:
	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
								q13, q14, q15
	subs		rounds, rounds, #1
	bcc		.Lenc_done		// borrow: that was the last round

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
								q13, q14, q15

	beq		.Lenc_last		// flags still from the subs above
	__ldr		q12, SR
	b		.Lenc_loop

.Lenc_done:
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11

	veor		q0, q0, q12
	veor		q1, q1, q12
	veor		q4, q4, q12
	veor		q6, q6, q12
	veor		q3, q3, q12
	veor		q7, q7, q12
	veor		q2, q2, q12
	veor		q5, q5, q12
	bx		lr
ENDPROC(aesbs_encrypt8)

	.align		4
	// byte permutation applied to the input blocks before bit-slicing
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509

	/*
	 * aesbs_decrypt8 - decrypt eight blocks in parallel (internal helper,
	 * not a C-callable function).
	 *
	 * In:  q0-q7  = eight input blocks
	 *      bskey  = start of the key schedule in aesbs_convert_key format
	 *      rounds = number of AES rounds
	 * Out: the results for input blocks 0-7 are returned in
	 *      q0, q1, q6, q4, q2, q7, q3, q5 (in that order).
	 * Clobbers q8-q15, bskey, rounds and the flags.
	 *
	 * The key schedule is consumed back to front.  The first key used is
	 * the 16-byte entry at offset rounds * 128 - 112 (the final entry
	 * written by aesbs_convert_key).  bskey is then moved 128 bytes down
	 * to the last bit-sliced round key, and every inv_mix_cols steps a
	 * further 128 bytes down.  After the loop bskey sits 112 bytes below
	 * the start of the schedule, hence the final add of 112 to reach the
	 * plain round 0 key.
	 */
aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128
	__ldr		q8, M0ISR

	veor		q10, q0, q9		// xor with round0 key
	veor		q11, q1, q9
	__tbl		q0, q10, q8
	veor		q12, q2, q9
	__tbl		q1, q11, q8
	veor		q13, q3, q9
	__tbl		q2, q12, q8
	veor		q14, q4, q9
	__tbl		q3, q13, q8
	veor		q15, q5, q9
	__tbl		q4, q14, q8
	veor		q10, q6, q9
	__tbl		q5, q15, q8
	veor		q11, q7, q9
	__tbl		q6, q10, q8
	__tbl		q7, q11, q8

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
	b		.Ldec_sbox

	// permutation constants; the branch above jumps over them
	.align		5
ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d

.Ldec_last:
	__ldr		q12, ISRM0		// permutation for the final pass
.Ldec_loop:
	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
.Ldec_sbox:
	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
								q13, q14, q15
	subs		rounds, rounds, #1
	bcc		.Ldec_done		// borrow: that was the last round

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
								q13, q14, q15

	beq		.Ldec_last		// flags still from the subs above
	__ldr		q12, ISR
	b		.Ldec_loop

.Ldec_done:
	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11

	veor		q0, q0, q12
	veor		q1, q1, q12
	veor		q6, q6, q12
	veor		q4, q4, q12
	veor		q2, q2, q12
	veor		q7, q7, q12
	veor		q3, q3, q12
	veor		q5, q5, q12
	bx		lr
ENDPROC(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */

	/*
	 * __ecb_crypt do8, o0-o7
	 *
	 * Body shared by both ECB entry points.  \do8 is the 8-block core to
	 * call and \o0-\o7 name the registers in which that core returns
	 * blocks 0-7.
	 *
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds, r5 = blocks (fifth
	 * argument, found at [sp, #16] after the 16-byte push).  r6 is saved
	 * as well although this macro does not use it (presumably to keep
	 * the stack 8-byte aligned).
	 *
	 * Blocks are processed eight at a time.  For a final batch of fewer
	 * than eight blocks, a computed goto jumps into the middle of the
	 * load (and later the store) sequence, so that only the last
	 * (blocks & 7) instructions execute: a short batch occupies the LAST
	 * slots, q7 downwards for input and \o7 downwards for output.  This
	 * relies on every VLD1/VST1 being 4 bytes long, so no instruction may
	 * be added to or removed from those sequences.
	 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	push		{r4-r6, lr}
	ldr		r5, [sp, #16]		// number of blocks

99:	adr		ip, 0f
	and		lr, r5, #7
	cmp		r5, #8
	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q0}, [r1]!
	vld1.8		{q1}, [r1]!
	vld1.8		{q2}, [r1]!
	vld1.8		{q3}, [r1]!
	vld1.8		{q4}, [r1]!
	vld1.8		{q5}, [r1]!
	vld1.8		{q6}, [r1]!
	vld1.8		{q7}, [r1]!

0:	mov		bskey, r2
	mov		rounds, r3
	bl		\do8

	adr		ip, 1f
	and		lr, r5, #7
	cmp		r5, #8
	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	vst1.8		{\o0}, [r0]!
	vst1.8		{\o1}, [r0]!
	vst1.8		{\o2}, [r0]!
	vst1.8		{\o3}, [r0]!
	vst1.8		{\o4}, [r0]!
	vst1.8		{\o5}, [r0]!
	vst1.8		{\o6}, [r0]!
	vst1.8		{\o7}, [r0]!

1:	subs		r5, r5, #8
	bgt		99b			// more blocks left

	pop		{r4-r6, pc}
	.endm

	/*
	 * aesbs_ecb_encrypt: ECB encryption entry point.  The register list
	 * is the order in which aesbs_encrypt8 returns blocks 0-7.
	 */
	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

	/*
	 * aesbs_ecb_decrypt: ECB decryption entry point.  The register list
	 * is the order in which aesbs_decrypt8 returns blocks 0-7.
	 */
	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 *
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds, r5 = blocks, r6 = iv
	 * (the last two are fetched from the caller's stack).
	 *
	 * Per batch of up to eight blocks:
	 *   1. load the ciphertext through lr, leaving r1 untouched;
	 *   2. decrypt with aesbs_decrypt8;
	 *   3. load the IV into q8 and copy it to q9-q15, then re-read the
	 *      ciphertext through r1 into q9-q15 as the chaining values for
	 *      blocks 1-7;
	 *   4. XOR, store, and load the last ciphertext block of the batch
	 *      into q8 as the next IV, which is written back to iv[].
	 *
	 * A short final batch is handled with three computed gotos that
	 * enter the load, reload and XOR/store sequences part-way, so that
	 * the batch occupies the LAST slots.  Copying the IV into q9-q15
	 * beforehand ensures the first block of a short batch is XORed with
	 * the IV whichever slot it lands in.  The jump tables depend on the
	 * exact instruction sizes: 4 bytes per load slot (W(nop) pads the
	 * reload sequence to eight slots) and 8 bytes per XOR/store pair.
	 */
	.align		4
ENTRY(aesbs_cbc_decrypt)
	mov		ip, sp
	push		{r4-r6, lr}
	ldm		ip, {r5-r6}		// load args 4-5

99:	adr		ip, 0f
	and		lr, r5, #7
	cmp		r5, #8
	sub		ip, ip, lr, lsl #2
	mov		lr, r1			// read via lr, r1 is advanced later
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q0}, [lr]!
	vld1.8		{q1}, [lr]!
	vld1.8		{q2}, [lr]!
	vld1.8		{q3}, [lr]!
	vld1.8		{q4}, [lr]!
	vld1.8		{q5}, [lr]!
	vld1.8		{q6}, [lr]!
	vld1.8		{q7}, [lr]

0:	mov		bskey, r2
	mov		rounds, r3
	bl		aesbs_decrypt8

	vld1.8		{q8}, [r6]		// current iv
	vmov		q9, q8
	vmov		q10, q8
	vmov		q11, q8
	vmov		q12, q8
	vmov		q13, q8
	vmov		q14, q8
	vmov		q15, q8

	adr		ip, 1f
	and		lr, r5, #7
	cmp		r5, #8
	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q9}, [r1]!
	vld1.8		{q10}, [r1]!
	vld1.8		{q11}, [r1]!
	vld1.8		{q12}, [r1]!
	vld1.8		{q13}, [r1]!
	vld1.8		{q14}, [r1]!
	vld1.8		{q15}, [r1]!
	W(nop)					// eighth slot of the jump table

1:	adr		ip, 2f
	sub		ip, ip, lr, lsl #3	// 8 bytes per veor/vst1 pair
	movlt		pc, ip			// computed goto if blocks < 8

	veor		q0, q0, q8
	vst1.8		{q0}, [r0]!
	veor		q1, q1, q9
	vst1.8		{q1}, [r0]!
	veor		q6, q6, q10
	vst1.8		{q6}, [r0]!
	veor		q4, q4, q11
	vst1.8		{q4}, [r0]!
	veor		q2, q2, q12
	vst1.8		{q2}, [r0]!
	veor		q7, q7, q13
	vst1.8		{q7}, [r0]!
	veor		q3, q3, q14
	vst1.8		{q3}, [r0]!
	veor		q5, q5, q15
	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	subs		r5, r5, #8
	vst1.8		{q8}, [r6]		// store next round's iv
	bgt		99b

	pop		{r4-r6, pc}
ENDPROC(aesbs_cbc_decrypt)

	/*
	 * next_ctr q
	 *
	 * Materialise the current 128-bit counter into Q register \q and
	 * post-increment the counter.  The counter lives in r7:r8:r9:r10 as
	 * four 32-bit words, r10 being the least significant; the carry is
	 * rippled from r10 up to r7.
	 *
	 * The VMOVs copy the words out before the matching additions modify
	 * them (VMOV does not touch the flags, so it can sit inside the carry
	 * chain).  The final VREV32 byte-swaps each 32-bit word of \q.
	 *
	 * Clobbers the flags.  The macro expands to exactly seven
	 * instructions, which the computed goto in aesbs_ctr_encrypt depends
	 * on.
	 */
	.macro		next_ctr, q
	vmov		\q\()h, r9, r10
	adds		r10, r10, #1
	adcs		r9, r9, #0
	vmov		\q\()l, r7, r8
	adcs		r8, r8, #0
	adc		r7, r7, #0
	vrev32.8	\q, \q
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int bytes, u8 ctr[])
	 *
	 * r0 = out, r1 = in, r2 = rk, r3 = rounds, r5 = bytes, r6 = ctr
	 * (the last two are fetched from the caller's stack).
	 *
	 * The counter block is kept in q0 (as stored in memory) and, one step
	 * ahead and byte-swapped, in r7:r8:r9:r10 for next_ctr.  Each
	 * iteration handles up to 128 bytes:
	 *
	 *   - q1-q7 are first filled with copies of q0.  A computed goto
	 *     then runs only as many next_ctr expansions as there are blocks
	 *     beyond the first, so a short batch occupies the LAST slots and
	 *     the slot of its first block holds a copy of q0.  Each
	 *     next_ctr is 7 instructions = 28 bytes, hence the
	 *     "lr * 2 - lr / 4" arithmetic on lr = 16 * (blocks - 1).
	 *   - After aesbs_encrypt8, matching computed gotos enter the input
	 *     load and output store sequences (4 bytes per slot; ip is
	 *     converted from one table to the other with "2f - 1b").
	 *
	 * Flag protocol after the "cmp r5, #128" block:
	 *   C set   = at least 128 bytes left (not the last iteration)
	 *   Z clear = last iteration AND bytes is not a multiple of 16
	 * r4 = 16 - (bytes & 15).  With Z clear, the final 16-byte load and
	 * store are moved back by r4 so that they end exactly at the end of
	 * the data, and the partial block is fixed up at label 3.
	 *
	 * NOTE(review): for bytes < 16 those backed-up accesses start below
	 * in[] and out[]; the caller presumably guarantees that this memory
	 * is accessible - confirm against the glue code.
	 */
ENTRY(aesbs_ctr_encrypt)
	mov		ip, sp
	push		{r4-r10, lr}

	ldm		ip, {r5, r6}		// load args 4-5
	vld1.8		{q0}, [r6]		// load counter
	vrev32.8	q1, q0
	vmov		r9, r10, d3
	vmov		r7, r8, d2

	adds		r10, r10, #1		// r7-r10 = counter + 1
	adcs		r9, r9, #0
	adcs		r8, r8, #0
	adc		r7, r7, #0

99:	vmov		q1, q0
	sub		lr, r5, #1
	vmov		q2, q0
	adr		ip, 0f
	vmov		q3, q0
	and		lr, lr, #112		// 16 * (blocks - 1), mod 128
	vmov		q4, q0
	cmp		r5, #112
	vmov		q5, q0
	sub		ip, ip, lr, lsl #1
	vmov		q6, q0
	add		ip, ip, lr, lsr #2	// ip = 0f - 28 * (blocks - 1)
	vmov		q7, q0
	movle		pc, ip			// computed goto if bytes <= 112

	next_ctr	q1
	next_ctr	q2
	next_ctr	q3
	next_ctr	q4
	next_ctr	q5
	next_ctr	q6
	next_ctr	q7

0:	mov		bskey, r2
	mov		rounds, r3
	bl		aesbs_encrypt8

	adr		ip, 1f
	sub		lr, r5, #1
	cmp		r5, #128
	bic		lr, lr, #15
	ands		r4, r5, #15		// preserves C flag
	teqcs		r5, r5			// set Z flag if not last iteration
	sub		ip, ip, lr, lsr #2
	rsb		r4, r4, #16
	movcc		pc, ip			// computed goto if bytes < 128

	vld1.8		{q8}, [r1]!
	vld1.8		{q9}, [r1]!
	vld1.8		{q10}, [r1]!
	vld1.8		{q11}, [r1]!
	vld1.8		{q12}, [r1]!
	vld1.8		{q13}, [r1]!
	vld1.8		{q14}, [r1]!
1:	subne		r1, r1, r4		// partial tail: end load at end of input
	vld1.8		{q15}, [r1]!

	add		ip, ip, #2f - 1b	// same slot in the store sequence

	veor		q0, q0, q8
	veor		q1, q1, q9
	veor		q4, q4, q10
	veor		q6, q6, q11
	veor		q3, q3, q12
	veor		q7, q7, q13
	veor		q2, q2, q14
	bne		3f			// partial tail block needs fixing up
	veor		q5, q5, q15

	movcc		pc, ip			// computed goto if bytes < 128

	vst1.8		{q0}, [r0]!
	vst1.8		{q1}, [r0]!
	vst1.8		{q4}, [r0]!
	vst1.8		{q6}, [r0]!
	vst1.8		{q3}, [r0]!
	vst1.8		{q7}, [r0]!
	vst1.8		{q2}, [r0]!
2:	subne		r0, r0, r4		// only taken when coming from 3:
	vst1.8		{q5}, [r0]!

	next_ctr	q0

	subs		r5, r5, #128
	bgt		99b

	vst1.8		{q0}, [r6]		// write back the counter
	pop		{r4-r10, pc}

	/*
	 * Partial final block of (bytes & 15) bytes.  q8/q9 are two adjacent
	 * 16-byte windows of .Lpermute_table starting at offset (bytes & 15).
	 * q8 moves the leading keystream bytes of q5 to the end of the vector
	 * (zero elsewhere), where they line up with the tail of the input in
	 * q15.  q9 (via VTBX) then fills the leading bytes, which overlap the
	 * previous output block, with the corresponding bytes of that block
	 * (q2).
	 */
3:	adr		lr, .Lpermute_table + 16
	cmp		r5, #16			// Z flag remains cleared
	sub		lr, lr, r4
	vld1.8		{q8-q9}, [lr]
	vtbl.8		d16, {q5}, d16
	vtbl.8		d17, {q5}, d17
	veor		q5, q8, q15
	bcc		4f			// have to reload prev if R5 < 16
	vtbx.8		d10, {q2}, d18
	vtbx.8		d11, {q2}, d19
	mov		pc, ip			// branch back to VST sequence

	/*
	 * Fewer than 16 bytes in total: there is no previous output block in
	 * a register, so the bytes preceding the output are re-read from
	 * memory and merged back in.
	 */
4:	sub		r0, r0, r4
	vshr.s8		q9, q9, #7		// create mask for VBIF
	vld1.8		{q8}, [r0]		// reload
	vbif		q5, q8, q9
	vst1.8		{q5}, [r0]
	pop		{r4-r10, pc}
ENDPROC(aesbs_ctr_encrypt)

	/*
	 * Index table for the partial-block handling in aesbs_ctr_encrypt:
	 * 16 bytes of 0xff, the identity indices 0-15, and 16 more bytes of
	 * 0xff.  A 32-byte window taken at offset 1..15 yields two VTBL/VTBX
	 * index vectors that shift a block by that many bytes; 0xff is an
	 * out-of-range index (VTBL writes zero, VTBX leaves the destination
	 * byte unchanged).
	 */
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

	/*
	 * next_tweak out, in, const, tmp
	 *
	 * Compute the next XTS tweak: \out = \in shifted left by one bit as a
	 * 128-bit value, with a conditional XOR of the reduction constant.
	 *
	 *   - \tmp = per 64-bit lane, all ones if the lane's top bit is set,
	 *     masked with \const (low lane 1, high lane 0x87, as composed in
	 *     __xts_prepare8);
	 *   - \out = \in + \in, i.e. each lane shifted left by one;
	 *   - the halves of \tmp are swapped, so that the carry out of the low
	 *     lane (1) lands in the high lane and the reduction value (0x87)
	 *     for a carry out of the high lane lands in the low lane;
	 *   - \out ^= \tmp.
	 *
	 * Expands to exactly five instructions, which the jump table in
	 * __xts_prepare8 depends on.  \tmp is clobbered.
	 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 */

	/*
	 * __xts_prepare8 - load up to eight input blocks, XOR each with its
	 * tweak and save the tweaks for the output pass (internal helper
	 * called from __xts_crypt).
	 *
	 * In:  r1 = in (advanced), r6 = blocks remaining, r7 = iv,
	 *      r8 = 1 - reorder_last_tweak, sp = 16-byte aligned buffer with
	 *      room for eight tweaks.
	 * Out: q0-q7 = input ^ tweak (a short batch occupies the LAST slots),
	 *      tweaks stored consecutively from [sp], next tweak written back
	 *      to iv[].
	 * Clobbers r4, ip, q12-q15 and the flags.
	 *
	 * q15 holds the tweak constant (low lane 1, high lane 0x87).  The
	 * current tweak alternates between q14 and q12; both start out equal
	 * to the IV so that the computed goto may enter at any slot.  Every
	 * slot is 32 bytes long (hence "lsl #5").  The last slot is padded
	 * to the same length by the conditional compare/branch that
	 * implements the tweak reordering.
	 *
	 * Reordering: if this is the final batch (blocks <= 8, flags from the
	 * "cmp r6, #8" are still live) and r8 <= 0, the last block is XORed
	 * with the following tweak and the tweak it would normally have used
	 * is stored as the next IV instead.
	 */
	.align		6
__xts_prepare8:
	vld1.8		{q14}, [r7]		// load iv
	vmov.i32	d30, #0x87		// compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7		// low lane: 0x87 >> 7 = 1
	vmov		q12, q14

	adr		ip, 0f
	and		r4, r6, #7
	cmp		r6, #8
	sub		ip, ip, r4, lsl #5
	mov		r4, sp
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q0}, [r1]!
	next_tweak	q12, q14, q15, q13
	veor		q0, q0, q14
	vst1.8		{q14}, [r4, :128]!

	vld1.8		{q1}, [r1]!
	next_tweak	q14, q12, q15, q13
	veor		q1, q1, q12
	vst1.8		{q12}, [r4, :128]!

	vld1.8		{q2}, [r1]!
	next_tweak	q12, q14, q15, q13
	veor		q2, q2, q14
	vst1.8		{q14}, [r4, :128]!

	vld1.8		{q3}, [r1]!
	next_tweak	q14, q12, q15, q13
	veor		q3, q3, q12
	vst1.8		{q12}, [r4, :128]!

	vld1.8		{q4}, [r1]!
	next_tweak	q12, q14, q15, q13
	veor		q4, q4, q14
	vst1.8		{q14}, [r4, :128]!

	vld1.8		{q5}, [r1]!
	next_tweak	q14, q12, q15, q13
	veor		q5, q5, q12
	vst1.8		{q12}, [r4, :128]!

	vld1.8		{q6}, [r1]!
	next_tweak	q12, q14, q15, q13
	veor		q6, q6, q14
	vst1.8		{q14}, [r4, :128]!

	vld1.8		{q7}, [r1]!
	next_tweak	q14, q12, q15, q13
THUMB(	itt		le		)
	W(cmple)	r8, #0			// final batch: reorder requested?
	ble		1f
0:	veor		q7, q7, q12
	vst1.8		{q12}, [r4, :128]

	vst1.8		{q14}, [r7]		// store next iv
	bx		lr

1:	vswp		q12, q14		// swap last tweak and next iv
	b		0b
ENDPROC(__xts_prepare8)

	/*
	 * __xts_crypt do8, o0-o7
	 *
	 * Body shared by both XTS entry points.  \do8 is the 8-block core to
	 * call and \o0-\o7 name the registers in which that core returns
	 * blocks 0-7.
	 *
	 * On entry r0 = out, r1 = in, r2 = rk, r3 = rounds, and ip holds the
	 * reorder_last_tweak flag (set up by the entry stub before this macro
	 * is expanded).  After the 24-byte push, blocks and iv are found at
	 * [sp, #24] and are kept in r6 and r7; r8 = 1 - flag is what
	 * __xts_prepare8 tests.
	 *
	 * A 16-byte aligned, 128-byte scratch area for the eight tweaks is
	 * carved out below the saved registers; the original sp is kept in r5
	 * and restored before returning.
	 *
	 * Each iteration calls __xts_prepare8 (input ^ tweak, tweaks saved),
	 * then \do8, then reloads the tweaks into q8-q15 and XORs them into
	 * the output while storing it.  A short final batch uses the same
	 * computed-goto scheme as the ECB/CBC routines: 4 bytes per tweak
	 * load, 8 bytes per XOR/store pair.
	 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	push		{r4-r8, lr}
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	rsb		r8, ip, #1
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes
	mov		sp, ip

99:	bl		__xts_prepare8

	mov		bskey, r2
	mov		rounds, r3
	bl		\do8

	adr		ip, 0f
	and		lr, r6, #7
	cmp		r6, #8
	sub		ip, ip, lr, lsl #2
	mov		r4, sp			// saved tweaks start at [sp]
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

0:	adr		ip, 1f
	sub		ip, ip, lr, lsl #3	// 8 bytes per veor/vst1 pair
	movlt		pc, ip			// computed goto if blocks < 8

	veor		\o0, \o0, q8
	vst1.8		{\o0}, [r0]!
	veor		\o1, \o1, q9
	vst1.8		{\o1}, [r0]!
	veor		\o2, \o2, q10
	vst1.8		{\o2}, [r0]!
	veor		\o3, \o3, q11
	vst1.8		{\o3}, [r0]!
	veor		\o4, \o4, q12
	vst1.8		{\o4}, [r0]!
	veor		\o5, \o5, q13
	vst1.8		{\o5}, [r0]!
	veor		\o6, \o6, q14
	vst1.8		{\o6}, [r0]!
	veor		\o7, \o7, q15
	vst1.8		{\o7}, [r0]!

1:	subs		r6, r6, #8
	bgt		99b			// more blocks left

	mov		sp, r5			// drop the tweak buffer
	pop		{r4-r8, pc}
	.endm

ENTRY(aesbs_xts_encrypt)
1036
	mov		ip, #0			// never reorder final tweak
1037
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
1038
ENDPROC(aesbs_xts_encrypt)
1039

1040
ENTRY(aesbs_xts_decrypt)
1041
	ldr		ip, [sp, #8]		// reorder final tweak?
1042
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
1043
ENDPROC(aesbs_xts_decrypt)
1044

1045
Product

Resources

Company