Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm/blake2s-core.S
29278 views
1
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation.  This is faster
 * than the generic implementations of BLAKE2s and BLAKE2b, but slower
 * than the NEON implementation of BLAKE2b.  There is no NEON
 * implementation of BLAKE2s, since NEON doesn't really help with it.
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <[email protected]>
 */
12
13
#include <linux/linkage.h>
14
#include <asm/assembler.h>
15
16
// Registers used to hold message words temporarily.  There aren't
// enough ARM registers to hold the whole message block, so the words
// have to be loaded on demand.
M_0		.req	r12
M_1		.req	r14
21
22
// The BLAKE2s initialization vector, IV[0..7].  blake2s_compress takes the
// address of this table with 'adr', i.e. PC-relative, so the table has to
// stay within 'adr' range of that code.
.Lblake2s_IV:
	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
26
27
// Load the 32-bit words at [\src + \offset] and [\src + \offset + 4] into
// \a and \b respectively.
//
// When building for ARMv6 or later this is a single ldrd; for older
// architectures it is two separate ldr instructions.  Every use in this file
// passes an even/odd consecutive register pair (r0/r1, r8/r9 or r10/r11) for
// (\a, \b).
//
// NOTE(review): some callers use offsets that are only 4-byte aligned
// (e.g. sp + 20); presumably that is why the ldrd form is limited to
// ARMv6+ -- confirm against the architecture manual.
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm
35
36
// Store \a and \b to the 32-bit words at [\dst + \offset] and
// [\dst + \offset + 4] respectively.
//
// When building for ARMv6 or later this is a single strd; for older
// architectures it is two separate str instructions.  Every use in this file
// passes an even/odd consecutive register pair (r8/r9 or r10/r11) for
// (\a, \b).
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm
44
45
// Convert the little-endian 32-bit value in \a to CPU byte order, in place.
// On big-endian builds (__ARMEB__ defined) this invokes rev_l with \tmp as
// its second argument (rev_l is not defined in this file; presumably it
// comes from <asm/assembler.h> and uses \tmp as scratch).  On little-endian
// builds the macro expands to nothing.
.macro _le32_bswap	a, tmp
#ifdef __ARMEB__
	rev_l		\a, \tmp
#endif
.endm
50
51
// Apply _le32_bswap to each of the eight registers \a..\h in turn, passing
// \tmp as the temporary for every one of them.  Like _le32_bswap, this
// expands to nothing on little-endian builds.
.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
	_le32_bswap	\a, \tmp
	_le32_bswap	\b, \tmp
	_le32_bswap	\c, \tmp
	_le32_bswap	\d, \tmp
	_le32_bswap	\e, \tmp
	_le32_bswap	\f, \tmp
	_le32_bswap	\g, \tmp
	_le32_bswap	\h, \tmp
.endm
61
62
// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals.  s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used.  On entry the 'b' and 'd' registers still need to be rotated right by
// the assembly-time symbols 'brot' and 'drot'; on exit they still need to be
// rotated right by 7 and 8 bits respectively.  See the comment above
// _blake2s_round().
.macro _blake2s_quarterround	a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3

	ldr		M_0, [sp, #32 + 4 * \s0]
	ldr		M_1, [sp, #32 + 4 * \s2]

	// a += b + m[blake2s_sigma[r][2*i + 0]];
	// (the pending rotation of b by 'brot' is applied here)
	add		\a0, \a0, \b0, ror #brot
	add		\a1, \a1, \b1, ror #brot
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 16);
	// (applies the pending 'drot'; the rotate by 16 is deferred)
	eor		\d0, \a0, \d0, ror #drot
	eor		\d1, \a1, \d1, ror #drot

	// c += d;
	add		\c0, \c0, \d0, ror #16
	add		\c1, \c1, \d1, ror #16

	// b = ror32(b ^ c, 12);
	// (applies the pending 'brot'; the rotate by 12 is deferred)
	eor		\b0, \c0, \b0, ror #brot
	eor		\b1, \c1, \b1, ror #brot

	ldr		M_0, [sp, #32 + 4 * \s1]
	ldr		M_1, [sp, #32 + 4 * \s3]

	// a += b + m[blake2s_sigma[r][2*i + 1]];
	add		\a0, \a0, \b0, ror #12
	add		\a1, \a1, \b1, ror #12
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 8);
	// (applies the deferred rotate by 16; the rotate by 8 is deferred)
	eor		\d0, \a0, \d0, ror #16
	eor		\d1, \a1, \d1, ror #16

	// c += d;
	add		\c0, \c0, \d0, ror #8
	add		\c1, \c1, \d1, ror #8

	// b = ror32(b ^ c, 7);
	// (applies the deferred rotate by 12; the rotate by 7 is deferred)
	eor		\b0, \c0, \b0, ror #12
	eor		\b1, \c1, \b1, ror #12
.endm
115
116
// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
// r14 are free to use.  The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// Stack layout, as used by the loads and stores below:
//
//	sp +  0:  scratch for v[8..9]
//	sp +  8:  v[10..11]
//	sp + 16:  v[12..13]
//	sp + 24:  v[14..15]
//	sp + 32:  message block (16 words)
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions.  This is faster than using explicit rotate
// instructions.  To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount.  The rotation amount is then fixed up just in time
// when the values are used.  'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro	_blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15

	// Mix first two columns:
	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
				\s0, \s1, \s2, \s3
	__strd		r8, r9, sp, 0		// spill v[8] and v[9]
	__strd		r10, r11, sp, 16	// store v[12] and v[13]

	// Mix second two columns:
	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
				\s4, \s5, \s6, \s7
	str		r10, [sp, #24]		// store v[14]
	// v[10], v[11], and v[15] are used below, so no need to store them yet.

	// Every quarter-round leaves rows 'b' and 'd' needing rotates of 7 and
	// 8 bits, so that is the pending rotation from here on.
	.set brot, 7
	.set drot, 8

	// Mix first two diagonals:
	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
	ldr		r10, [sp, #16]		// load v[12]
	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
				\s8, \s9, \s10, \s11
	__strd		r8, r9, sp, 8		// store v[10] and v[11]
	str		r11, [sp, #28]		// store v[15]
	str		r10, [sp, #16]		// store v[12]

	// Mix second two diagonals:
	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
				\s12, \s13, \s14, \s15
	__strd		r10, r11, sp, 20	// store v[13] and v[14]
.endm
171
172
//
// void blake2s_compress(struct blake2s_state *state,
//			 const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// 'nblocks' must be at least 1: the loop below processes a block first and
// only then decrements and tests the count.
//
	.align		5
ENTRY(blake2s_compress)
	push		{r0-r2,r4-r11,lr}	// keep this an even number

.Lnext_block:
	// r0 is 'state'
	// r1 is 'block'
	// r3 is 'inc'

	// Load and increment the counter t[0..1].
	__ldrd		r10, r11, r0, 32
	adds		r10, r10, r3
	adc		r11, r11, #0
	__strd		r10, r11, r0, 32

	// _blake2s_round is very short on registers, so copy the message block
	// to the stack to save a register during the rounds.  This also has the
	// advantage that misalignment only needs to be dealt with in one place.
	sub		sp, sp, #64
	mov		r12, sp
	tst		r1, #3
	bne		.Lcopy_block_misaligned
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12!, {r2-r9}
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12, {r2-r9}
.Lcopy_block_done:
	// The saved 'block' argument sits just above the saved 'state', which
	// is directly above the 64-byte message copy.
	str		r1, [sp, #68]		// Update message pointer

	// Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
	mov		r14, r0			// r14 = state
	adr		r12, .Lblake2s_IV
	ldmia		r12!, {r8-r9}		// load IV[0..1]
	__ldrd		r0, r1, r14, 40		// load f[0..1]
	ldm		r12, {r2-r7}		// load IV[2..7]
	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
	push		{r2-r7}			// push v[10..15]
	sub		sp, sp, #8		// leave space for v[8..9]

	// Load h[0..7] == v[0..7].
	ldm		r14, {r0-r7}

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	.set brot, 0
	.set drot, 0
	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// Rows 'b' (v[4..7]) and 'd' (v[12..15]) still carry their pending
	// rotations, so those are applied with 'ror #brot' / 'ror #drot' below.
	//
	ldr		r14, [sp, #96]		// r14 = &h[0]
	add		sp, sp, #8		// v[8..9] are already loaded.
	pop		{r10-r11}		// load v[10..11]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	ldm		r14, {r8-r11}		// load h[0..3]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	stmia		r14!, {r0-r3}		// store new h[0..3]
	ldm		r14, {r0-r3}		// load old h[4..7]
	pop		{r8-r11}		// load v[12..15]
	eor		r0, r0, r4, ror #brot
	eor		r1, r1, r5, ror #brot
	eor		r2, r2, r6, ror #brot
	eor		r3, r3, r7, ror #brot
	eor		r0, r0, r8, ror #drot
	eor		r1, r1, r9, ror #drot
	eor		r2, r2, r10, ror #drot
	eor		r3, r3, r11, ror #drot
	add		sp, sp, #64		// skip copy of message block
	stm		r14, {r0-r3}		// store new h[4..7]

	// Advance to the next block, if there is one.  Note that if there are
	// multiple blocks, then 'inc' (the counter increment amount) must be
	// 64.  So we can simply set it to 64 without re-loading it.
	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
	mov		r3, #64			// set 'inc'
	subs		r2, r2, #1		// nblocks--
	str		r2, [sp, #8]
	bne		.Lnext_block		// nblocks != 0?

	pop		{r0-r2,r4-r11,pc}

	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
	// by r12) using an alternative method.  r2-r9 are free to use.
.Lcopy_block_misaligned:
	mov		r2, #64			// bytes remaining
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	ldr		r3, [r1], #4
	_le32_bswap	r3, r4
#else
	// Assemble the little-endian word one byte at a time; the result is
	// already in CPU order, so no byte swap is needed on this path.
	ldrb		r3, [r1, #0]
	ldrb		r4, [r1, #1]
	ldrb		r5, [r1, #2]
	ldrb		r6, [r1, #3]
	add		r1, r1, #4
	orr		r3, r3, r4, lsl #8
	orr		r3, r3, r5, lsl #16
	orr		r3, r3, r6, lsl #24
#endif
	subs		r2, r2, #4
	str		r3, [r12], #4
	bne		1b
	b		.Lcopy_block_done
ENDPROC(blake2s_compress)
310
311