.text
.arch armv8-a+crypto
dga .req q20
dgav .req v20
dgb .req q21
dgbv .req v21
t0 .req v22
t1 .req v23
dg0q .req q24
dg0v .req v24
dg1q .req q25
dg1v .req v25
dg2q .req q26
dg2v .req v26
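// add_only does one group of four SHA-256 rounds: sha256h/sha256h2 consume
// four W+K words from t0 or t1 (selected by \ev, which alternates between
// calls so the W+K add for the next group, from schedule words \s0 and round
// constants \rc when present, can be issued into the other temporary ahead
// of time). add_update does the same but also extends the message schedule
// by four words.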
.macro add_only, ev, rc, s0
mov dg2v.16b, dg0v.16b
.ifeq \ev
add t1.4s, v\s0\().4s, \rc\().4s
sha256h dg0q, dg1q, t0.4s
sha256h2 dg1q, dg2q, t0.4s
.else
.ifnb \s0
add t0.4s, v\s0\().4s, \rc\().4s
.endif
sha256h dg0q, dg1q, t1.4s
sha256h2 dg1q, dg2q, t1.4s
.endif
.endm
.macro add_update, ev, rc, s0, s1, s2, s3
sha256su0 v\s0\().4s, v\s1\().4s
add_only \ev, \rc, \s1
sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
.endm
.section ".rodata", "a"
.align 4
.Lsha2_rcon:
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.macro load_round_constants tmp
adr_l \tmp, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [\tmp], #64
ld1 { v4.4s- v7.4s}, [\tmp], #64
ld1 { v8.4s-v11.4s}, [\tmp], #64
ld1 {v12.4s-v15.4s}, [\tmp]
.endm
.text
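// Assumed C prototype, inferred from the register usage below:
//
//	size_t __sha256_ce_transform(u32 state[8], const u8 *data,
//	                             size_t nblocks);
//
// x0 = state, x1 = data, x2 = number of 64-byte blocks. Returns the number
// of blocks left unprocessed if cond_yield requested an early return, else 0.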
SYM_FUNC_START(__sha256_ce_transform)
load_round_constants x8
ld1 {dgav.4s, dgbv.4s}, [x0]
0: ld1 {v16.4s-v19.4s}, [x1], #64
sub x2, x2, #1
CPU_LE( rev32 v16.16b, v16.16b )
CPU_LE( rev32 v17.16b, v17.16b )
CPU_LE( rev32 v18.16b, v18.16b )
CPU_LE( rev32 v19.16b, v19.16b )
add t0.4s, v16.4s, v0.4s
mov dg0v.16b, dgav.16b
mov dg1v.16b, dgbv.16b
add_update 0, v1, 16, 17, 18, 19
add_update 1, v2, 17, 18, 19, 16
add_update 0, v3, 18, 19, 16, 17
add_update 1, v4, 19, 16, 17, 18
add_update 0, v5, 16, 17, 18, 19
add_update 1, v6, 17, 18, 19, 16
add_update 0, v7, 18, 19, 16, 17
add_update 1, v8, 19, 16, 17, 18
add_update 0, v9, 16, 17, 18, 19
add_update 1, v10, 17, 18, 19, 16
add_update 0, v11, 18, 19, 16, 17
add_update 1, v12, 19, 16, 17, 18
add_only 0, v13, 17
add_only 1, v14, 18
add_only 0, v15, 19
add_only 1
add dgav.4s, dgav.4s, dg0v.4s
add dgbv.4s, dgbv.4s, dg1v.4s
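// Return early if voluntary preemption is needed; the unprocessed block
// count is returned in x0.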
cond_yield 1f, x5, x6
cbnz x2, 0b
1: st1 {dgav.4s, dgbv.4s}, [x0]
mov x0, x2
ret
SYM_FUNC_END(__sha256_ce_transform)
.unreq dga
.unreq dgav
.unreq dgb
.unreq dgbv
.unreq t0
.unreq t1
.unreq dg0q
.unreq dg0v
.unreq dg1q
.unreq dg1v
.unreq dg2q
.unreq dg2v
// parameters for sha256_ce_finup2x()
ctx .req x0
data1 .req x1
data2 .req x2
len .req w3
out1 .req x4
out2 .req x5
// other scalar variables
count .req x6
final_step .req w7
// x8-x9 are used as temporaries.
// v0-v15 are used to cache the SHA-256 round constants.
// v16-v19 are used for the message schedule for the first message.
// v20-v23 are used for the message schedule for the second message.
// v24-v31 are used for the state and temporaries as given below.
// *_a are for the first message and *_b for the second.
state0_a_q .req q24
state0_a .req v24
state1_a_q .req q25
state1_a .req v25
state0_b_q .req q26
state0_b .req v26
state1_b_q .req q27
state1_b .req v27
t0_a .req v28
t0_b .req v29
t1_a_q .req q30
t1_a .req v30
t1_b_q .req q31
t1_b .req v31
#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
// offsetof(struct __sha256_ctx, state) is assumed to be 0.
// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
// and m0_b contain the current 4 message schedule words for the first
// and second message respectively.
//
// If not all the message schedule words have been computed yet, then
// this also computes 4 more message schedule words for each message.
// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
// the first message, and likewise m1_b-m3_b for the second. After
// consuming the current value of m0_a, this macro computes the group
// after m3_a and writes it to m0_a, and likewise for *_b. This means
// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
// the registers accordingly.
.macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
m0_b, m1_b, m2_b, m3_b
add t0_a\().4s, \m0_a\().4s, \k\().4s
add t0_b\().4s, \m0_b\().4s, \k\().4s
.if \i < 48
sha256su0 \m0_a\().4s, \m1_a\().4s
sha256su0 \m0_b\().4s, \m1_b\().4s
sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
.endif
mov t1_a.16b, state0_a.16b
mov t1_b.16b, state0_b.16b
sha256h state0_a_q, state1_a_q, t0_a\().4s
sha256h state0_b_q, state1_b_q, t0_b\().4s
sha256h2 state1_a_q, t1_a_q, t0_a\().4s
sha256h2 state1_b_q, t1_b_q, t0_b\().4s
.endm
.macro do_16rounds_2x i, k0, k1, k2, k3
do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
.endm
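// For reference, a minimal C intrinsics sketch of what one do_4rounds_2x
// invocation does for a single message (illustrative only; uses the ACLE
// SHA-256 intrinsics from <arm_neon.h>, and the names below are not part of
// this file):
//
//	static void do_4rounds(uint32x4_t *state0, uint32x4_t *state1,
//	                       uint32x4_t *m0, uint32x4_t m1, uint32x4_t m2,
//	                       uint32x4_t m3, uint32x4_t k, int i)
//	{
//		uint32x4_t t0 = vaddq_u32(*m0, k);     // W[i..i+3] + K[i..i+3]
//		if (i < 48)                            // extend the schedule
//			*m0 = vsha256su1q_u32(vsha256su0q_u32(*m0, m1), m2, m3);
//		uint32x4_t t1 = *state0;               // save old state0
//		*state0 = vsha256hq_u32(*state0, *state1, t0);
//		*state1 = vsha256h2q_u32(*state1, t1, t0);
//	}
//
// The assembly interleaves two such computations (the _a and _b registers)
// so the sha256h/sha256h2 dependency chains of the two messages can overlap.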
//
// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
// const u8 *data1, const u8 *data2, int len,
// u8 out1[SHA256_DIGEST_SIZE],
// u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved. On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
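// Example use from C (illustrative; assumes |ctx| already holds the state
// after absorbing any data common to both messages):
//
//	u8 out1[SHA256_DIGEST_SIZE], out2[SHA256_DIGEST_SIZE];
//
//	sha256_ce_finup2x(ctx, msg1, msg2, len, out1, out2);
//
// The result is the same as finalizing two separate copies of |ctx|, one
// updated with the len bytes at |msg1| and one with the len bytes at |msg2|.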
SYM_FUNC_START(sha256_ce_finup2x)
sub sp, sp, #128
mov final_step, #0
load_round_constants x8
// Load the initial state from ctx->state.
ld1 {state0_a.4s-state1_a.4s}, [ctx]
// Load ctx->bytecount. Take the mod 64 of it to get the number of
// bytes that are buffered in ctx->buf. Also save it in a register with
// len added to it.
ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
add count, x8, len, sxtw
and x8, x8, #63
cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
// x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
// followed by the first 64 - x8 bytes of data. Since len >= 64, we
// just load 64 bytes from each of ctx->buf, data1, and data2
// unconditionally and rearrange the data as needed.
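// For example, if x8 = 5, the 5 buffered bytes end up at &sp[0] and the
// first 64 bytes of data1 at &sp[5], so the reload from sp below picks up
// the buffered bytes followed by the first 59 bytes of data1; the few bytes
// that spill past &sp[63] land in the upper half of the stack frame and are
// simply ignored.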
add x9, ctx, #OFFSETOF_BUF
ld1 {v16.16b-v19.16b}, [x9]
st1 {v16.16b-v19.16b}, [sp]
ld1 {v16.16b-v19.16b}, [data1], #64
add x9, sp, x8
st1 {v16.16b-v19.16b}, [x9]
ld1 {v16.4s-v19.4s}, [sp]
ld1 {v20.16b-v23.16b}, [data2], #64
st1 {v20.16b-v23.16b}, [x9]
ld1 {v20.4s-v23.4s}, [sp]
sub len, len, #64
sub data1, data1, x8
sub data2, data2, x8
add len, len, w8
mov state0_b.16b, state0_a.16b
mov state1_b.16b, state1_a.16b
b .Lfinup2x_loop_have_data
.Lfinup2x_enter_loop:
sub len, len, #64
mov state0_b.16b, state0_a.16b
mov state1_b.16b, state1_a.16b
.Lfinup2x_loop:
// Load the next two data blocks.
ld1 {v16.4s-v19.4s}, [data1], #64
ld1 {v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
// Convert the words of the data blocks from big endian.
CPU_LE( rev32 v16.16b, v16.16b )
CPU_LE( rev32 v17.16b, v17.16b )
CPU_LE( rev32 v18.16b, v18.16b )
CPU_LE( rev32 v19.16b, v19.16b )
CPU_LE( rev32 v20.16b, v20.16b )
CPU_LE( rev32 v21.16b, v21.16b )
CPU_LE( rev32 v22.16b, v22.16b )
CPU_LE( rev32 v23.16b, v23.16b )
.Lfinup2x_loop_have_bswapped_data:
// Save the original state for each block.
st1 {state0_a.4s-state1_b.4s}, [sp]
// Do the SHA-256 rounds on each block.
do_16rounds_2x 0, v0, v1, v2, v3
do_16rounds_2x 16, v4, v5, v6, v7
do_16rounds_2x 32, v8, v9, v10, v11
do_16rounds_2x 48, v12, v13, v14, v15
// Add the original state for each block.
ld1 {v16.4s-v19.4s}, [sp]
add state0_a.4s, state0_a.4s, v16.4s
add state1_a.4s, state1_a.4s, v17.4s
add state0_b.4s, state0_b.4s, v18.4s
add state1_b.4s, state1_b.4s, v19.4s
// Update len and loop back if more blocks remain.
sub len, len, #64
tbz len, #31, .Lfinup2x_loop // len >= 0?
// Check if any final blocks need to be handled.
// final_step = 2: all done
// final_step = 1: need to do count-only padding block
// final_step = 0: need to do the block with 0x80 padding byte
tbnz final_step, #1, .Lfinup2x_done
tbnz final_step, #0, .Lfinup2x_finalize_countonly
add len, len, #64
cbz len, .Lfinup2x_finalize_blockaligned
// Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
// To do this, write the padding starting with the 0x80 byte to
// &sp[64]. Then for each message, copy the last 64 data bytes to sp
// and load from &sp[64 - len] to get the needed padding block. This
// code relies on the data buffers being >= 64 bytes in length.
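// For example, if len = 10: data1/data2 are moved back so they point 54
// bytes before the 10 remaining bytes, those 64 bytes are copied to sp, and
// x9 = &sp[54], so the 64-byte load from x9 yields the 10 data bytes
// followed by 0x80 and zeroes (plus the bit count in the last 8 bytes, since
// len < 56).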
sub w8, len, #64 // w8 = len - 64
add data1, data1, w8, sxtw // data1 += len - 64
add data2, data2, w8, sxtw // data2 += len - 64
CPU_LE( mov x9, #0x80 )
CPU_LE( fmov d16, x9 )
CPU_BE( movi v16.16b, #0 )
CPU_BE( mov x9, #0x8000000000000000 )
CPU_BE( mov v16.d[1], x9 )
movi v17.16b, #0
stp q16, q17, [sp, #64]
stp q17, q17, [sp, #96]
sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
cmp len, #56
b.ge 1f // will count spill into its own block?
lsl count, count, #3 // convert byte count to bit count
CPU_LE( rev count, count )
str count, [x9, #56]
mov final_step, #2 // won't need count-only block
b 2f
1:
mov final_step, #1 // will need count-only block
2:
ld1 {v16.16b-v19.16b}, [data1]
st1 {v16.16b-v19.16b}, [sp]
ld1 {v16.4s-v19.4s}, [x9]
ld1 {v20.16b-v23.16b}, [data2]
st1 {v20.16b-v23.16b}, [sp]
ld1 {v20.4s-v23.4s}, [x9]
b .Lfinup2x_loop_have_data
// Prepare a padding block, either:
//
// {0x80, 0, 0, 0, ..., count (as __be64)}
// This is for a block aligned message.
//
// { 0, 0, 0, 0, ..., count (as __be64)}
// This is for a message whose length mod 64 is >= 56.
//
// Pre-swap the endianness of the words.
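// For example, if count = 200 bytes, the bit count is 1600 = 0x640 and its
// __be64 encoding is 00 00 00 00 00 00 06 40. The ror below places the value
// 0x0000064000000000 in v19.d[1], which is exactly what the rev32 in the
// main loop would have produced from those bytes, so the rev32 can be
// skipped for this block.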
.Lfinup2x_finalize_countonly:
movi v16.2d, #0
b 1f
.Lfinup2x_finalize_blockaligned:
mov x8, #0x80000000
fmov d16, x8
1:
movi v17.2d, #0
movi v18.2d, #0
ror count, count, #29 // ror(lsl(count, 3), 32)
mov v19.d[0], xzr
mov v19.d[1], count
mov v20.16b, v16.16b
movi v21.2d, #0
movi v22.2d, #0
mov v23.16b, v19.16b
mov final_step, #2
b .Lfinup2x_loop_have_bswapped_data
.Lfinup2x_done:
// Write the two digests with all bytes in the correct order.
CPU_LE( rev32 state0_a.16b, state0_a.16b )
CPU_LE( rev32 state1_a.16b, state1_a.16b )
CPU_LE( rev32 state0_b.16b, state0_b.16b )
CPU_LE( rev32 state1_b.16b, state1_b.16b )
st1 {state0_a.4s-state1_a.4s}, [out1]
st1 {state0_b.4s-state1_b.4s}, [out2]
add sp, sp, #128
ret
SYM_FUNC_END(sha256_ce_finup2x)