GitHub Repository: torvalds/linux
Path: blob/master/arch/arm/crypto/blake2b-neon-core.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <[email protected]>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction. This is the most efficient way to implement these
	// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
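	//
	// (vtbl.8 replaces each destination byte with the source byte selected
	// by the corresponding index byte. With the index vector
	// {3, 4, 5, 6, 7, 0, 1, 2}, output byte i becomes input byte
	// (i + 3) % 8 of the little-endian 64-bit value, i.e. ror64(x, 24);
	// {2, 3, 4, 5, 6, 7, 0, 1} likewise gives ror64(x, 16).)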
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
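	// (These are the same constants as the SHA-512 initialization vector:
	// the first 64 bits of the fractional parts of the square roots of the
	// first eight primes.)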

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
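//
// For reference, each column/diagonal step below applies the BLAKE2b G
// function to a quadruple (a, b, c, d) of state words and two message
// words x and y:
//
//	a += b + x;  d = ror64(d ^ a, 32);  c += d;  b = ror64(b ^ c, 24);
//	a += b + y;  d = ror64(d ^ a, 16);  c += d;  b = ror64(b ^ c, 63);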
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
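	// (a 64-bit rotate by 32 just swaps the two 32-bit halves of each
	// lane, which is what vrev64.32 does)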
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0. Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
	.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
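	//
	// (ror64(x, 63) is the same as rol64(x, 1); the vshr.u64/vsli.u64 pair
	// below computes (x >> 63) | (x << 1).)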
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers. We use the
	// latter approach, as it performs much better on Cortex-A7.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
	.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
	.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
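// 'block' points to 'nblocks' contiguous 128-byte message blocks, and 'inc'
// is the number of bytes added to the 128-bit block counter t for each block
// processed.
//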
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip
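	// ('bic ip, ip, #31' clears the low five bits, rounding the new stack
	// pointer down to a 32-byte boundary.)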

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'. Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers. It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed. (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds. Each round is provided the order in which it
	// needs to use the message words.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1
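
	// (BLAKE2b uses 12 rounds, and its message schedule repeats with a
	// period of 10, so the last two rounds above reuse the word orders of
	// the first two.)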

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, STATE
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	vmov		r9, r10, d29		// Copy t[1] to (r9, r10)
	adcs		r8, r8, #0		// Carry into high half of t[0]
	adcs		r9, r9, #0		// Carry into low half of t[1]
	adc		r10, r10, #0		// Carry into high half of t[1]
	vmov		d28, r7, r8		// Write back t[0]
	vmov		d29, r9, r10		// Write back t[1]
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)