GitHub Repository: torvalds/linux
Path: blob/master/arch/arm/crypto/blake2b-neon-core.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <[email protected]>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction. This is the most efficient way to implement these
	// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
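	//
	// (vtbl.8 replaces each destination byte with the source byte selected
	// by the corresponding index byte. With the index vector
	// {3, 4, 5, 6, 7, 0, 1, 2}, output byte i becomes input byte
	// (i + 3) % 8 of the little-endian 64-bit value, i.e. ror64(x, 24);
	// {2, 3, 4, 5, 6, 7, 0, 1} likewise gives ror64(x, 16).)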
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
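	// (These are the same constants as the SHA-512 initialization vector:
	// the first 64 bits of the fractional parts of the square roots of the
	// first eight primes.)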

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
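//
// For reference, each column/diagonal step below applies the BLAKE2b G
// function to a quadruple (a, b, c, d) of state words and two message
// words x and y:
//
//	a += b + x;  d = ror64(d ^ a, 32);  c += d;  b = ror64(b ^ c, 24);
//	a += b + y;  d = ror64(d ^ a, 16);  c += d;  b = ror64(b ^ c, 63);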
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
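	// (a 64-bit rotate by 32 just swaps the two 32-bit halves of each
	// lane, which is what vrev64.32 does)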
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0. Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
	.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
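	//
	// (ror64(x, 63) is the same as rol64(x, 1); the vshr.u64/vsli.u64 pair
	// below computes (x >> 63) | (x << 1).)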
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers. We use the
	// latter approach, as it performs much better on Cortex-A7.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
	.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
	.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
	.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
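// 'block' points to 'nblocks' contiguous 128-byte message blocks, and 'inc'
// is the number of bytes added to the 128-bit block counter t for each block
// processed.
//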
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip
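	// ('bic ip, ip, #31' clears the low five bits, rounding the new stack
	// pointer down to a 32-byte boundary.)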

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'. Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers. It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed. (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds. Each round is provided the order in which it
	// needs to use the message words.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1
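
	// (BLAKE2b uses 12 rounds, and its message schedule repeats with a
	// period of 10, so the last two rounds above reuse the word orders of
	// the first two.)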

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, STATE
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block		// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	vmov		r9, r10, d29		// Copy t[1] to (r9, r10)
	adcs		r8, r8, #0		// Carry into high half of t[0]
	adcs		r9, r9, #0		// Carry into low half of t[1]
	adc		r10, r10, #0		// Carry into high half of t[1]
	vmov		d28, r7, r8		// Write back t[0]
	vmov		d29, r9, r10		// Write back t[1]
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)