CoCalc -- sha256-ni-asm.S

GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/sha256-ni-asm.S
²⁹²⁷⁸ views
1
/*
2
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
3
 *
4
 * This file is provided under a dual BSD/GPLv2 license.  When using or
5
 * redistributing this file, you may do so under either license.
6
 *
7
 * GPL LICENSE SUMMARY
8
 *
9
 * Copyright(c) 2015 Intel Corporation.
10
 *
11
 * This program is free software; you can redistribute it and/or modify
12
 * it under the terms of version 2 of the GNU General Public License as
13
 * published by the Free Software Foundation.
14
 *
15
 * This program is distributed in the hope that it will be useful, but
16
 * WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * General Public License for more details.
19
 *
20
 * Contact Information:
21
 * 	Sean Gulley <[email protected]>
22
 * 	Tim Chen <[email protected]>
23
 *
24
 * BSD LICENSE
25
 *
26
 * Copyright(c) 2015 Intel Corporation.
27
 *
28
 * Redistribution and use in source and binary forms, with or without
29
 * modification, are permitted provided that the following conditions
30
 * are met:
31
 *
32
 * 	* Redistributions of source code must retain the above copyright
33
 * 	  notice, this list of conditions and the following disclaimer.
34
 * 	* Redistributions in binary form must reproduce the above copyright
35
 * 	  notice, this list of conditions and the following disclaimer in
36
 * 	  the documentation and/or other materials provided with the
37
 * 	  distribution.
38
 * 	* Neither the name of Intel Corporation nor the names of its
39
 * 	  contributors may be used to endorse or promote products derived
40
 * 	  from this software without specific prior written permission.
41
 *
42
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53
 *
54
 */
55

56
#include <linux/linkage.h>
57

58
#define STATE_PTR	%rdi	/* 1st arg */
59
#define DATA_PTR	%rsi	/* 2nd arg */
60
#define NUM_BLKS	%rdx	/* 3rd arg */
61

62
#define SHA256CONSTANTS	%rax
63

64
#define MSG		%xmm0  /* sha256rnds2 implicit operand */
65
#define STATE0		%xmm1
66
#define STATE1		%xmm2
67
#define MSG0		%xmm3
68
#define MSG1		%xmm4
69
#define MSG2		%xmm5
70
#define MSG3		%xmm6
71
#define TMP		%xmm7
72

73
#define SHUF_MASK	%xmm8
74

75
#define ABEF_SAVE	%xmm9
76
#define CDGH_SAVE	%xmm10
77

78
.macro do_4rounds	i, m0, m1, m2, m3
79
.if \i < 16
80
	movdqu		\i*4(DATA_PTR), \m0
81
	pshufb		SHUF_MASK, \m0
82
.endif
83
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
84
	paddd		\m0, MSG
85
	sha256rnds2	STATE0, STATE1
86
.if \i >= 12 && \i < 60
87
	movdqa		\m0, TMP
88
	palignr		$4, \m3, TMP
89
	paddd		TMP, \m1
90
	sha256msg2	\m0, \m1
91
.endif
92
	punpckhqdq	MSG, MSG
93
	sha256rnds2	STATE1, STATE0
94
.if \i >= 4 && \i < 52
95
	sha256msg1	\m0, \m3
96
.endif
97
.endm
98

99
/*
100
 * Intel SHA Extensions optimized implementation of a SHA-256 block function
101
 *
102
 * This function takes a pointer to the current SHA-256 state, a pointer to the
103
 * input data, and the number of 64-byte blocks to process.  Once all blocks
104
 * have been processed, the state is updated with the new state.  This function
105
 * only processes complete blocks.  State initialization, buffering of partial
106
 * blocks, and digest finalization is expected to be handled elsewhere.
107
 *
108
 * void sha256_ni_transform(struct sha256_block_state *state,
109
 *			    const u8 *data, size_t nblocks);
110
 */
111
.text
112
SYM_FUNC_START(sha256_ni_transform)
113

114
	shl		$6, NUM_BLKS		/*  convert to bytes */
115
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
116

117
	/*
118
	 * load initial hash values
119
	 * Need to reorder these appropriately
120
	 * DCBA, HGFE -> ABEF, CDGH
121
	 */
122
	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
123
	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */
124

125
	movdqa		STATE0, TMP
126
	punpcklqdq	STATE1, STATE0			/* FEBA */
127
	punpckhqdq	TMP, STATE1			/* DCHG */
128
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
129
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
130

131
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
132
	lea		K256+32*4(%rip), SHA256CONSTANTS
133

134
.Lloop0:
135
	/* Save hash values for addition after rounds */
136
	movdqa		STATE0, ABEF_SAVE
137
	movdqa		STATE1, CDGH_SAVE
138

139
.irp i, 0, 16, 32, 48
140
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
141
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
142
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
143
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
144
.endr
145

146
	/* Add current hash values with previously saved */
147
	paddd		ABEF_SAVE, STATE0
148
	paddd		CDGH_SAVE, STATE1
149

150
	/* Increment data pointer and loop if more to process */
151
	add		$64, DATA_PTR
152
	cmp		NUM_BLKS, DATA_PTR
153
	jne		.Lloop0
154

155
	/* Write hash values back in the correct order */
156
	movdqa		STATE0, TMP
157
	punpcklqdq	STATE1, STATE0			/* GHEF */
158
	punpckhqdq	TMP, STATE1			/* ABCD */
159
	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
160
	pshufd		$0x1B, STATE1, STATE1		/* DCBA */
161

162
	movdqu		STATE1, 0*16(STATE_PTR)
163
	movdqu		STATE0, 1*16(STATE_PTR)
164

165
	RET
166
SYM_FUNC_END(sha256_ni_transform)
167

168
#undef DIGEST_PTR
169
#undef DATA_PTR
170
#undef NUM_BLKS
171
#undef SHA256CONSTANTS
172
#undef MSG
173
#undef STATE0
174
#undef STATE1
175
#undef MSG0
176
#undef MSG1
177
#undef MSG2
178
#undef MSG3
179
#undef TMP
180
#undef SHUF_MASK
181
#undef ABEF_SAVE
182
#undef CDGH_SAVE
183

184
// parameters for sha256_ni_finup2x()
185
#define CTX		%rdi
186
#define DATA1		%rsi
187
#define DATA2		%rdx
188
#define LEN		%ecx
189
#define LEN8		%cl
190
#define LEN64		%rcx
191
#define OUT1		%r8
192
#define OUT2		%r9
193

194
// other scalar variables
195
#define SHA256CONSTANTS	%rax
196
#define COUNT		%r10
197
#define COUNT32		%r10d
198
#define FINAL_STEP	%r11d
199

200
// rbx is used as a temporary.
201

202
#define MSG		%xmm0	// sha256rnds2 implicit operand
203
#define STATE0_A	%xmm1
204
#define STATE1_A	%xmm2
205
#define STATE0_B	%xmm3
206
#define STATE1_B	%xmm4
207
#define TMP_A		%xmm5
208
#define TMP_B		%xmm6
209
#define MSG0_A		%xmm7
210
#define MSG1_A		%xmm8
211
#define MSG2_A		%xmm9
212
#define MSG3_A		%xmm10
213
#define MSG0_B		%xmm11
214
#define MSG1_B		%xmm12
215
#define MSG2_B		%xmm13
216
#define MSG3_B		%xmm14
217
#define SHUF_MASK	%xmm15
218

219
#define OFFSETOF_STATE		0  // offsetof(struct __sha256_ctx, state)
220
#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
221
#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
222

223
// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
224
// contain the current 4 message schedule words for the first and second message
225
// respectively.
226
//
227
// If not all the message schedule words have been computed yet, then this also
228
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
229
// the next 3 groups of 4 message schedule words for the first message, and
230
// likewise m1_b-m3_b for the second.  After consuming the current value of
231
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
232
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
233
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
234
// cycle through the registers accordingly.
235
.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a,  m0_b, m1_b, m2_b, m3_b
236
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
237
	movdqa		TMP_A, TMP_B
238
	paddd		\m0_a, TMP_A
239
	paddd		\m0_b, TMP_B
240
.if \i < 48
241
	sha256msg1	\m1_a, \m0_a
242
	sha256msg1	\m1_b, \m0_b
243
.endif
244
	movdqa		TMP_A, MSG
245
	sha256rnds2	STATE0_A, STATE1_A
246
	movdqa		TMP_B, MSG
247
	sha256rnds2	STATE0_B, STATE1_B
248
	pshufd 		$0x0E, TMP_A, MSG
249
	sha256rnds2	STATE1_A, STATE0_A
250
	pshufd 		$0x0E, TMP_B, MSG
251
	sha256rnds2	STATE1_B, STATE0_B
252
.if \i < 48
253
	movdqa		\m3_a, TMP_A
254
	movdqa		\m3_b, TMP_B
255
	palignr		$4, \m2_a, TMP_A
256
	palignr		$4, \m2_b, TMP_B
257
	paddd		TMP_A, \m0_a
258
	paddd		TMP_B, \m0_b
259
	sha256msg2	\m3_a, \m0_a
260
	sha256msg2	\m3_b, \m0_b
261
.endif
262
.endm
263

264
//
265
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
266
//			  const u8 *data1, const u8 *data2, int len,
267
//			  u8 out1[SHA256_DIGEST_SIZE],
268
//			  u8 out2[SHA256_DIGEST_SIZE]);
269
//
270
// This function computes the SHA-256 digests of two messages |data1| and
271
// |data2| that are both |len| bytes long, starting from the initial context
272
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
273
//
274
// The instructions for the two SHA-256 operations are interleaved.  On many
275
// CPUs, this is almost twice as fast as hashing each message individually due
276
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
277
//
278
SYM_FUNC_START(sha256_ni_finup2x)
279
	// Allocate 128 bytes of stack space, 16-byte aligned.
280
	push		%rbx
281
	push		%rbp
282
	mov		%rsp, %rbp
283
	sub		$128, %rsp
284
	and		$~15, %rsp
285

286
	// Load the shuffle mask for swapping the endianness of 32-bit words.
287
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
288

289
	// Set up pointer to the round constants.
290
	lea		K256+32*4(%rip), SHA256CONSTANTS
291

292
	// Initially we're not processing the final blocks.
293
	xor		FINAL_STEP, FINAL_STEP
294

295
	// Load the initial state from ctx->state.
296
	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
297
	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
298
	movdqa		STATE0_A, TMP_A
299
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
300
	punpckhqdq	TMP_A, STATE1_A				// DCHG
301
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
302
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH
303

304
	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
305
	// bytes that are buffered in ctx->buf.  Also save it in a register with
306
	// LEN added to it.
307
	mov		LEN, LEN
308
	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
309
	lea		(%rbx, LEN64, 1), COUNT
310
	and		$63, %ebx
311
	jz		.Lfinup2x_enter_loop	// No bytes buffered?
312

313
	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
314
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
315
	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
316
	// unconditionally and rearrange the data as needed.
317

318
	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
319
	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
320
	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
321
	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
322
	movdqa		MSG0_A, 0*16(%rsp)
323
	movdqa		MSG1_A, 1*16(%rsp)
324
	movdqa		MSG2_A, 2*16(%rsp)
325
	movdqa		MSG3_A, 3*16(%rsp)
326

327
	movdqu		0*16(DATA1), MSG0_A
328
	movdqu		1*16(DATA1), MSG1_A
329
	movdqu		2*16(DATA1), MSG2_A
330
	movdqu		3*16(DATA1), MSG3_A
331
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
332
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
333
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
334
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
335
	movdqa		0*16(%rsp), MSG0_A
336
	movdqa		1*16(%rsp), MSG1_A
337
	movdqa		2*16(%rsp), MSG2_A
338
	movdqa		3*16(%rsp), MSG3_A
339

340
	movdqu		0*16(DATA2), MSG0_B
341
	movdqu		1*16(DATA2), MSG1_B
342
	movdqu		2*16(DATA2), MSG2_B
343
	movdqu		3*16(DATA2), MSG3_B
344
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
345
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
346
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
347
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
348
	movdqa		0*16(%rsp), MSG0_B
349
	movdqa		1*16(%rsp), MSG1_B
350
	movdqa		2*16(%rsp), MSG2_B
351
	movdqa		3*16(%rsp), MSG3_B
352

353
	sub		$64, %rbx 	// rbx = buffered - 64
354
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
355
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
356
	add		%ebx, LEN	// LEN += buffered - 64
357
	movdqa		STATE0_A, STATE0_B
358
	movdqa		STATE1_A, STATE1_B
359
	jmp		.Lfinup2x_loop_have_data
360

361
.Lfinup2x_enter_loop:
362
	sub		$64, LEN
363
	movdqa		STATE0_A, STATE0_B
364
	movdqa		STATE1_A, STATE1_B
365
.Lfinup2x_loop:
366
	// Load the next two data blocks.
367
	movdqu		0*16(DATA1), MSG0_A
368
	movdqu		0*16(DATA2), MSG0_B
369
	movdqu		1*16(DATA1), MSG1_A
370
	movdqu		1*16(DATA2), MSG1_B
371
	movdqu		2*16(DATA1), MSG2_A
372
	movdqu		2*16(DATA2), MSG2_B
373
	movdqu		3*16(DATA1), MSG3_A
374
	movdqu		3*16(DATA2), MSG3_B
375
	add		$64, DATA1
376
	add		$64, DATA2
377
.Lfinup2x_loop_have_data:
378
	// Convert the words of the data blocks from big endian.
379
	pshufb		SHUF_MASK, MSG0_A
380
	pshufb		SHUF_MASK, MSG0_B
381
	pshufb		SHUF_MASK, MSG1_A
382
	pshufb		SHUF_MASK, MSG1_B
383
	pshufb		SHUF_MASK, MSG2_A
384
	pshufb		SHUF_MASK, MSG2_B
385
	pshufb		SHUF_MASK, MSG3_A
386
	pshufb		SHUF_MASK, MSG3_B
387
.Lfinup2x_loop_have_bswapped_data:
388

389
	// Save the original state for each block.
390
	movdqa		STATE0_A, 0*16(%rsp)
391
	movdqa		STATE0_B, 1*16(%rsp)
392
	movdqa		STATE1_A, 2*16(%rsp)
393
	movdqa		STATE1_B, 3*16(%rsp)
394

395
	// Do the SHA-256 rounds on each block.
396
.irp i, 0, 16, 32, 48
397
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
398
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
399
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
400
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
401
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
402
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
403
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
404
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
405
.endr
406

407
	// Add the original state for each block.
408
	paddd		0*16(%rsp), STATE0_A
409
	paddd		1*16(%rsp), STATE0_B
410
	paddd		2*16(%rsp), STATE1_A
411
	paddd		3*16(%rsp), STATE1_B
412

413
	// Update LEN and loop back if more blocks remain.
414
	sub		$64, LEN
415
	jge		.Lfinup2x_loop
416

417
	// Check if any final blocks need to be handled.
418
	// FINAL_STEP = 2: all done
419
	// FINAL_STEP = 1: need to do count-only padding block
420
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
421
	cmp		$1, FINAL_STEP
422
	jg		.Lfinup2x_done
423
	je		.Lfinup2x_finalize_countonly
424
	add		$64, LEN
425
	jz		.Lfinup2x_finalize_blockaligned
426

427
	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
428
	// To do this, write the padding starting with the 0x80 byte to
429
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
430
	// and load from &sp[64 - LEN] to get the needed padding block.  This
431
	// code relies on the data buffers being >= 64 bytes in length.
432
	mov		$64, %ebx
433
	sub		LEN, %ebx		// ebx = 64 - LEN
434
	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
435
	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
436
	mov		$0x80, FINAL_STEP   // using FINAL_STEP as a temporary
437
	movd		FINAL_STEP, MSG0_A
438
	pxor		MSG1_A, MSG1_A
439
	movdqa		MSG0_A, 4*16(%rsp)
440
	movdqa		MSG1_A, 5*16(%rsp)
441
	movdqa		MSG1_A, 6*16(%rsp)
442
	movdqa		MSG1_A, 7*16(%rsp)
443
	cmp		$56, LEN
444
	jge		1f	// will COUNT spill into its own block?
445
	shl		$3, COUNT
446
	bswap		COUNT
447
	mov		COUNT, 56(%rsp,%rbx)
448
	mov		$2, FINAL_STEP	// won't need count-only block
449
	jmp		2f
450
1:
451
	mov		$1, FINAL_STEP	// will need count-only block
452
2:
453
	movdqu		0*16(DATA1), MSG0_A
454
	movdqu		1*16(DATA1), MSG1_A
455
	movdqu		2*16(DATA1), MSG2_A
456
	movdqu		3*16(DATA1), MSG3_A
457
	movdqa		MSG0_A, 0*16(%rsp)
458
	movdqa		MSG1_A, 1*16(%rsp)
459
	movdqa		MSG2_A, 2*16(%rsp)
460
	movdqa		MSG3_A, 3*16(%rsp)
461
	movdqu		0*16(%rsp,%rbx), MSG0_A
462
	movdqu		1*16(%rsp,%rbx), MSG1_A
463
	movdqu		2*16(%rsp,%rbx), MSG2_A
464
	movdqu		3*16(%rsp,%rbx), MSG3_A
465

466
	movdqu		0*16(DATA2), MSG0_B
467
	movdqu		1*16(DATA2), MSG1_B
468
	movdqu		2*16(DATA2), MSG2_B
469
	movdqu		3*16(DATA2), MSG3_B
470
	movdqa		MSG0_B, 0*16(%rsp)
471
	movdqa		MSG1_B, 1*16(%rsp)
472
	movdqa		MSG2_B, 2*16(%rsp)
473
	movdqa		MSG3_B, 3*16(%rsp)
474
	movdqu		0*16(%rsp,%rbx), MSG0_B
475
	movdqu		1*16(%rsp,%rbx), MSG1_B
476
	movdqu		2*16(%rsp,%rbx), MSG2_B
477
	movdqu		3*16(%rsp,%rbx), MSG3_B
478
	jmp		.Lfinup2x_loop_have_data
479

480
	// Prepare a padding block, either:
481
	//
482
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
483
	//	This is for a block aligned message.
484
	//
485
	//	{   0, 0, 0, 0, ..., count (as __be64)}
486
	//	This is for a message whose length mod 64 is >= 56.
487
	//
488
	// Pre-swap the endianness of the words.
489
.Lfinup2x_finalize_countonly:
490
	pxor		MSG0_A, MSG0_A
491
	jmp		1f
492

493
.Lfinup2x_finalize_blockaligned:
494
	mov		$0x80000000, %ebx
495
	movd		%ebx, MSG0_A
496
1:
497
	pxor		MSG1_A, MSG1_A
498
	pxor		MSG2_A, MSG2_A
499
	ror		$29, COUNT
500
	movq		COUNT, MSG3_A
501
	pslldq		$8, MSG3_A
502
	movdqa		MSG0_A, MSG0_B
503
	pxor		MSG1_B, MSG1_B
504
	pxor		MSG2_B, MSG2_B
505
	movdqa		MSG3_A, MSG3_B
506
	mov		$2, FINAL_STEP
507
	jmp		.Lfinup2x_loop_have_bswapped_data
508

509
.Lfinup2x_done:
510
	// Write the two digests with all bytes in the correct order.
511
	movdqa		STATE0_A, TMP_A
512
	movdqa		STATE0_B, TMP_B
513
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
514
	punpcklqdq	STATE1_B, STATE0_B
515
	punpckhqdq	TMP_A, STATE1_A			// ABCD
516
	punpckhqdq	TMP_B, STATE1_B
517
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
518
	pshufd		$0xB1, STATE0_B, STATE0_B
519
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
520
	pshufd		$0x1B, STATE1_B, STATE1_B
521
	pshufb		SHUF_MASK, STATE0_A
522
	pshufb		SHUF_MASK, STATE0_B
523
	pshufb		SHUF_MASK, STATE1_A
524
	pshufb		SHUF_MASK, STATE1_B
525
	movdqu		STATE0_A, 1*16(OUT1)
526
	movdqu		STATE0_B, 1*16(OUT2)
527
	movdqu		STATE1_A, 0*16(OUT1)
528
	movdqu		STATE1_B, 0*16(OUT2)
529

530
	mov		%rbp, %rsp
531
	pop		%rbp
532
	pop		%rbx
533
	RET
534
SYM_FUNC_END(sha256_ni_finup2x)
535

536
.section	.rodata.cst256.K256, "aM", @progbits, 256
537
.align 64
538
K256:
539
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
540
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
541
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
542
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
543
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
544
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
545
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
546
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
547
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
548
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
549
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
550
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
551
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
552
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
553
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
554
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
555

556
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
557
.align 16
558
PSHUFFLE_BYTE_FLIP_MASK:
559
	.octa 0x0c0d0e0f08090a0b0405060700010203
560

561
Product

Resources

Company