GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/sha256-ni-asm.S
/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	Sean Gulley <[email protected]>
 *	Tim Chen <[email protected]>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>

#define STATE_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10
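
/*
 * Do 4 rounds of SHA-256 (rounds \i to \i+3).  \m0 holds the current group of
 * 4 message schedule words; for rounds 0-15 it is first loaded from DATA_PTR
 * and byte-swapped.  Where later schedule words are still needed, the macro
 * also advances the message schedule: sha256msg2 (with a palignr/paddd step)
 * completes the next group of 4 schedule words in \m1, and sha256msg1 begins
 * the corresponding computation for a later group in \m3, so the caller
 * rotates the four MSG registers between invocations (see the .irp loop
 * below).
 */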
.macro do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	movdqu		\i*4(DATA_PTR), \m0
	pshufb		SHUF_MASK, \m0
.endif
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1
.endif
	punpckhqdq	MSG, MSG
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3
.endif
.endm

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 block function
 *
 * This function takes a pointer to the current SHA-256 state, a pointer to the
 * input data, and the number of 64-byte blocks to process.  Once all blocks
 * have been processed, the state is updated with the new state.  This function
 * only processes complete blocks.  State initialization, buffering of partial
 * blocks, and digest finalization are expected to be handled elsewhere.
 *
 * void sha256_ni_transform(struct sha256_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
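
/*
 * Hypothetical caller sketch (illustration only, not part of this file).
 * The struct and argument names come from the prototype above; the
 * kernel_fpu_begin()/kernel_fpu_end() bracketing is an assumption about the
 * calling context, needed because the function clobbers XMM registers:
 *
 *	struct sha256_block_state state;	// 8 u32 words, H0..H7
 *
 *	kernel_fpu_begin();			// e.g. from <asm/fpu/api.h>
 *	sha256_ni_transform(&state, data, nblocks);
 *	kernel_fpu_end();
 *
 * Any partial trailing block must be buffered and padded by the caller
 * before a final call, as noted above.
 */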
.text
SYM_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes */
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * Load the initial hash values.  They need to be reordered from the
	 * DCBA, HGFE layout in memory to the ABEF, CDGH layout that the
	 * sha256rnds2 instruction expects.
	 */
	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* FEBA */
	punpckhqdq	TMP, STATE1			/* DCHG */
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	lea		K256+32*4(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* GHEF */
	punpckhqdq	TMP, STATE1			/* ABCD */
	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
	pshufd		$0x1B, STATE1, STATE1		/* DCBA */

	movdqu		STATE1, 0*16(STATE_PTR)
	movdqu		STATE0, 1*16(STATE_PTR)

	RET
SYM_FUNC_END(sha256_ni_transform)

#undef STATE_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for sha256_ni_finup2x()
#define CTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE		0	// offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT	32	// offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF		40	// offsetof(struct __sha256_ctx, buf)

// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
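//
// For example, the .irp loop in sha256_ni_finup2x below invokes this macro as
//
//	do_4rounds_2x  (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, ...
//	do_4rounds_2x  (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, ...
//
// rotating the message registers by one position per invocation.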
.macro do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	pshufd		$0x0E, TMP_A, MSG
	sha256rnds2	STATE1_A, STATE0_A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm

//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
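// Hypothetical caller sketch (illustration only; everything here other than
// the prototype above is an assumption about the calling context).  Hashing
// two same-length messages that share the common prefix context |ctx|, with
// the XMM clobbers bracketed by the kernel's FPU begin/end calls:
//
//	u8 out1[SHA256_DIGEST_SIZE], out2[SHA256_DIGEST_SIZE];
//
//	kernel_fpu_begin();
//	sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
//	kernel_fpu_end();
//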
SYM_FUNC_START(sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from ctx->state.
	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
	// bytes that are buffered in ctx->buf.  Also save it in a register with
	// LEN added to it.
	mov		LEN, LEN	// zero-extend LEN into LEN64
	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.
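	//
	// For example (illustrative numbers): if %ebx = 40, the stores below
	// place ctx->buf[0..63] at sp[0..63] and DATA1[0..63] at sp[40..103];
	// reloading sp[0..63] then yields the 40 buffered bytes followed by
	// the first 24 bytes of DATA1, i.e. message 1's next block.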

	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
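	//
	// For example (illustrative numbers): if LEN = 5, then %ebx = 59
	// below, the last 64 bytes of each message land at sp[0..63], and the
	// padded block is reloaded from &sp[59]: the 5 remaining data bytes,
	// then the 0x80 byte and zero padding from sp[64..], with the bit
	// count in the block's final 8 bytes when it fits (LEN < 56).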
	mov		$64, %ebx
	sub		LEN, %ebx	// ebx = 64 - LEN
	sub		%rbx, DATA1	// DATA1 -= 64 - LEN
	sub		%rbx, DATA2	// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	shl		$3, COUNT
	bswap		COUNT
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block-aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(sha256_ni_finup2x)

// The 64 SHA-256 round constants, K[0] through K[63].
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

// pshufb mask that reverses the bytes within each 32-bit word, for converting
// between the message's big-endian words and the host's little-endian order.
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203