GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm64/sha256-ce.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
 *
 * Copyright (C) 2014 Linaro Ltd <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	dga		.req	q20
	dgav		.req	v20
	dgb		.req	q21
	dgbv		.req	v21

	t0		.req	v22
	t1		.req	v23

	dg0q		.req	q24
	dg0v		.req	v24
	dg1q		.req	q25
	dg1v		.req	v25
	dg2q		.req	q26
	dg2v		.req	v26

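	/*
	 * dgav/dgbv hold the two halves of the incoming 8-word SHA-256 state.
	 * dg0/dg1 are the working copies updated by the rounds, dg2 saves a
	 * copy of dg0 for use by sha256h2, and t0/t1 alternate as holders of
	 * the current four message schedule words plus round constants.
	 */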
	.macro		add_only, ev, rc, s0
	mov		dg2v.16b, dg0v.16b
	.ifeq		\ev
	add		t1.4s, v\s0\().4s, \rc\().4s
	sha256h		dg0q, dg1q, t0.4s
	sha256h2	dg1q, dg2q, t0.4s
	.else
	.ifnb		\s0
	add		t0.4s, v\s0\().4s, \rc\().4s
	.endif
	sha256h		dg0q, dg1q, t1.4s
	sha256h2	dg1q, dg2q, t1.4s
	.endif
	.endm

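	/*
	 * As add_only, but additionally compute the next group of four
	 * message schedule words into v\s0 (via sha256su0/sha256su1) while
	 * the rounds consume v\s1.
	 */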
	.macro		add_update, ev, rc, s0, s1, s2, s3
	sha256su0	v\s0\().4s, v\s1\().4s
	add_only	\ev, \rc, \s1
	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
	.endm

	/*
	 * The SHA-256 round constants
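	 * (the first 32 bits of the fractional parts of the cube roots of
	 * the first 64 primes, per FIPS 180-4)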
	 */
	.section	".rodata", "a"
	.align		4
.Lsha2_rcon:
	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

	.macro		load_round_constants	tmp
	adr_l		\tmp, .Lsha2_rcon
	ld1		{ v0.4s- v3.4s}, [\tmp], #64
	ld1		{ v4.4s- v7.4s}, [\tmp], #64
	ld1		{ v8.4s-v11.4s}, [\tmp], #64
	ld1		{v12.4s-v15.4s}, [\tmp]
	.endm

	/*
	 * size_t __sha256_ce_transform(struct sha256_block_state *state,
	 *				const u8 *data, size_t nblocks);
	 */
	.text
SYM_FUNC_START(__sha256_ce_transform)

	load_round_constants	x8

	/* load state */
	ld1		{dgav.4s, dgbv.4s}, [x0]

	/* load input */
0:	ld1		{v16.4s-v19.4s}, [x1], #64
	sub		x2, x2, #1

CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)

	add		t0.4s, v16.4s, v0.4s
	mov		dg0v.16b, dgav.16b
	mov		dg1v.16b, dgbv.16b

	add_update	0, v1, 16, 17, 18, 19
	add_update	1, v2, 17, 18, 19, 16
	add_update	0, v3, 18, 19, 16, 17
	add_update	1, v4, 19, 16, 17, 18

	add_update	0, v5, 16, 17, 18, 19
	add_update	1, v6, 17, 18, 19, 16
	add_update	0, v7, 18, 19, 16, 17
	add_update	1, v8, 19, 16, 17, 18

	add_update	0, v9, 16, 17, 18, 19
	add_update	1, v10, 17, 18, 19, 16
	add_update	0, v11, 18, 19, 16, 17
	add_update	1, v12, 19, 16, 17, 18

	add_only	0, v13, 17
	add_only	1, v14, 18
	add_only	0, v15, 19
	add_only	1

	/* update state */
	add		dgav.4s, dgav.4s, dg0v.4s
	add		dgbv.4s, dgbv.4s, dg1v.4s

	/* return early if voluntary preemption is needed */
	cond_yield	1f, x5, x6

	/* handled all input blocks? */
	cbnz		x2, 0b

	/* store new state */
1:	st1		{dgav.4s, dgbv.4s}, [x0]
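	/* return the number of unprocessed blocks (zero unless we yielded) */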
	mov		x0, x2
	ret
SYM_FUNC_END(__sha256_ce_transform)

	.unreq		dga
	.unreq		dgav
	.unreq		dgb
	.unreq		dgbv
	.unreq		t0
	.unreq		t1
	.unreq		dg0q
	.unreq		dg0v
	.unreq		dg1q
	.unreq		dg1v
	.unreq		dg2q
	.unreq		dg2v

	// parameters for sha256_ce_finup2x()
	ctx		.req	x0
	data1		.req	x1
	data2		.req	x2
	len		.req	w3
	out1		.req	x4
	out2		.req	x5

	// other scalar variables
	count		.req	x6
	final_step	.req	w7

	// x8-x9 are used as temporaries.

	// v0-v15 are used to cache the SHA-256 round constants.
	// v16-v19 are used for the message schedule for the first message.
	// v20-v23 are used for the message schedule for the second message.
	// v24-v31 are used for the state and temporaries as given below.
	// *_a are for the first message and *_b for the second.
	state0_a_q	.req	q24
	state0_a	.req	v24
	state1_a_q	.req	q25
	state1_a	.req	v25
	state0_b_q	.req	q26
	state0_b	.req	v26
	state1_b_q	.req	q27
	state1_b	.req	v27
	t0_a		.req	v28
	t0_b		.req	v29
	t1_a_q		.req	q30
	t1_a		.req	v30
	t1_b_q		.req	q31
	t1_b		.req	v31

#define OFFSETOF_BYTECOUNT	32	// offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF		40	// offsetof(struct __sha256_ctx, buf)
// offsetof(struct __sha256_ctx, state) is assumed to be 0.

	// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
	// and m0_b contain the current 4 message schedule words for the first
	// and second message respectively.
	//
	// If not all the message schedule words have been computed yet, then
	// this also computes 4 more message schedule words for each message.
	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
	// the first message, and likewise m1_b-m3_b for the second. After
	// consuming the current value of m0_a, this macro computes the group
	// after m3_a and writes it to m0_a, and likewise for *_b. This means
	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
	// the registers accordingly.
	.macro	do_4rounds_2x	i, k, m0_a, m1_a, m2_a, m3_a, \
				m0_b, m1_b, m2_b, m3_b
	add		t0_a\().4s, \m0_a\().4s, \k\().4s
	add		t0_b\().4s, \m0_b\().4s, \k\().4s
	.if \i < 48
	sha256su0	\m0_a\().4s, \m1_a\().4s
	sha256su0	\m0_b\().4s, \m1_b\().4s
	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
	.endif
	mov		t1_a.16b, state0_a.16b
	mov		t1_b.16b, state0_b.16b
	sha256h		state0_a_q, state1_a_q, t0_a\().4s
	sha256h		state0_b_q, state1_b_q, t0_b\().4s
	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
	.endm

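	// Do 16 rounds (four groups of 4) for both messages, cycling the
	// message schedule registers as described above.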
	.macro	do_16rounds_2x	i, k0, k1, k2, k3
	do_4rounds_2x	\i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
	do_4rounds_2x	\i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
	do_4rounds_2x	\i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
	do_4rounds_2x	\i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
	.endm

//
// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved. On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ce_finup2x)
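	// Allocate a 128-byte stack buffer used to stage data blocks, the
	// saved state, and the generated padding.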
246
sub sp, sp, #128
247
mov final_step, #0
248
load_round_constants x8
249
250
// Load the initial state from ctx->state.
251
ld1 {state0_a.4s-state1_a.4s}, [ctx]
252
253
// Load ctx->bytecount. Take the mod 64 of it to get the number of
254
// bytes that are buffered in ctx->buf. Also save it in a register with
255
// len added to it.
256
ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
257
add count, x8, len, sxtw
258
and x8, x8, #63
259
cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
260
261
// x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
262
// followed by the first 64 - x8 bytes of data. Since len >= 64, we
263
// just load 64 bytes from each of ctx->buf, data1, and data2
264
// unconditionally and rearrange the data as needed.
265
add x9, ctx, #OFFSETOF_BUF
266
ld1 {v16.16b-v19.16b}, [x9]
267
st1 {v16.16b-v19.16b}, [sp]
268
269
ld1 {v16.16b-v19.16b}, [data1], #64
270
add x9, sp, x8
271
st1 {v16.16b-v19.16b}, [x9]
272
ld1 {v16.4s-v19.4s}, [sp]
273
274
ld1 {v20.16b-v23.16b}, [data2], #64
275
st1 {v20.16b-v23.16b}, [x9]
276
ld1 {v20.4s-v23.4s}, [sp]
277
278
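	// The merged first block now in v16-v23 contains 64 - x8 bytes of
	// new data, so rewind the data pointers by x8 and decrease len by
	// 64 - x8 to account for it.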
	sub		len, len, #64
	sub		data1, data1, x8
	sub		data2, data2, x8
	add		len, len, w8
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
	b		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		len, len, #64
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
.Lfinup2x_loop:
	// Load the next two data blocks.
	ld1		{v16.4s-v19.4s}, [data1], #64
	ld1		{v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)
CPU_LE(	rev32		v20.16b, v20.16b	)
CPU_LE(	rev32		v21.16b, v21.16b	)
CPU_LE(	rev32		v22.16b, v22.16b	)
CPU_LE(	rev32		v23.16b, v23.16b	)
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	st1		{state0_a.4s-state1_b.4s}, [sp]

	// Do the SHA-256 rounds on each block.
	do_16rounds_2x	0, v0, v1, v2, v3
	do_16rounds_2x	16, v4, v5, v6, v7
	do_16rounds_2x	32, v8, v9, v10, v11
	do_16rounds_2x	48, v12, v13, v14, v15

	// Add the original state for each block.
	ld1		{v16.4s-v19.4s}, [sp]
	add		state0_a.4s, state0_a.4s, v16.4s
	add		state1_a.4s, state1_a.4s, v17.4s
	add		state0_b.4s, state0_b.4s, v18.4s
	add		state1_b.4s, state1_b.4s, v19.4s

	// Update len and loop back if more blocks remain.
	sub		len, len, #64
	tbz		len, #31, .Lfinup2x_loop	// len >= 0?

	// Check if any final blocks need to be handled.
	// final_step = 2: all done
	// final_step = 1: need to do count-only padding block
	// final_step = 0: need to do the block with 0x80 padding byte
	tbnz		final_step, #1, .Lfinup2x_done
	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
	add		len, len, #64
	cbz		len, .Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64]. Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - len] to get the needed padding block. This
	// code relies on the data buffers being >= 64 bytes in length.
	sub		w8, len, #64		// w8 = len - 64
	add		data1, data1, w8, sxtw	// data1 += len - 64
	add		data2, data2, w8, sxtw	// data2 += len - 64
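	// Materialize the 0x80 padding byte (followed by zeroes) in v16; its
	// placement within the vector depends on endianness.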
CPU_LE(	mov		x9, #0x80	)
CPU_LE(	fmov		d16, x9		)
CPU_BE(	movi		v16.16b, #0	)
CPU_BE(	mov		x9, #0x8000000000000000	)
CPU_BE(	mov		v16.d[1], x9	)
	movi		v17.16b, #0
	stp		q16, q17, [sp, #64]
	stp		q17, q17, [sp, #96]
	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
	cmp		len, #56
	b.ge		1f			// will count spill into its own block?
	lsl		count, count, #3
CPU_LE(	rev		count, count	)
	str		count, [x9, #56]
	mov		final_step, #2		// won't need count-only block
	b		2f
1:
	mov		final_step, #1		// will need count-only block
2:
	ld1		{v16.16b-v19.16b}, [data1]
	st1		{v16.16b-v19.16b}, [sp]
	ld1		{v16.4s-v19.4s}, [x9]
	ld1		{v20.16b-v23.16b}, [data2]
	st1		{v20.16b-v23.16b}, [sp]
	ld1		{v20.4s-v23.4s}, [x9]
	b		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{ 0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	movi		v16.2d, #0
	b		1f
.Lfinup2x_finalize_blockaligned:
	mov		x8, #0x80000000
	fmov		d16, x8
1:
	movi		v17.2d, #0
	movi		v18.2d, #0
	ror		count, count, #29	// ror(lsl(count, 3), 32)
	mov		v19.d[0], xzr
	mov		v19.d[1], count
	mov		v20.16b, v16.16b
	movi		v21.2d, #0
	movi		v22.2d, #0
	mov		v23.16b, v19.16b
	mov		final_step, #2
	b		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
	st1		{state0_a.4s-state1_a.4s}, [out1]
	st1		{state0_b.4s-state1_b.4s}, [out2]
	add		sp, sp, #128
	ret
SYM_FUNC_END(sha256_ce_finup2x)