GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <[email protected]>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <[email protected]>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
# (**)	SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, 50-30%, less newer processor is, but slower on
#	contemporary ones, for example almost 2x slower on Atom, and as
#	former are naturally disappearing, SSE2 is deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
	*STDOUT=*OUT;

	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

sub declare_function() {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}

sub end_function() {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}
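
# A minimal reference model of the iteration above (illustrative only,
# never called by this generator): one Poly1305 multiply-and-reduce
# over the prime p = 2^130 - 5, using Math::BigInt in place of the
# 64-bit limb arithmetic.  The limb code computes a value congruent to
# this: s1 = r1 + (r1>>2) folds the 2^130 = 4 (mod p) wrap into the
# multiply, and the trailing mask/shift sequence ("last reduction
# step") adds carry*5 back into the low limbs.
use Math::BigInt;
sub poly1305_iteration_ref {
	my ($h, $r) = @_;			# Math::BigInt accumulator and key
	my $p = Math::BigInt->new(1)->blsft(130)->bsub(5);
	return ($h * $r) % $p;			# h *= r (mod 2^130-5)
}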

########################################################################
# The layout of the opaque area is as follows:
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64
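
# Illustrative only: how the 40-byte opaque area described above could
# be pulled apart in Perl (three little-endian 64-bit hash words
# followed by the two clamped key words).  The helper and its name are
# hypothetical, not part of any API.
sub ctx_unpack_ref {
	my ($blob) = @_;
	my @h = unpack("Q<3", substr($blob,  0, 24));	# h[0..2], base 2^64
	my @r = unpack("Q<2", substr($blob, 24, 16));	# r[0..1]
	return (\@h, \@r);
}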

$code.=<<___;
.text
___
236
$code.=<<___ if (!$kernel);
237
.extern OPENSSL_ia32cap_P
238
239
.globl poly1305_init_x86_64
240
.hidden poly1305_init_x86_64
241
.globl poly1305_blocks_x86_64
242
.hidden poly1305_blocks_x86_64
243
.globl poly1305_emit_x86_64
244
.hidden poly1305_emit_x86_64
245
___
246
&declare_function("poly1305_init_x86_64", 32, 3);
247
$code.=<<___;
248
xor %eax,%eax
249
mov %rax,0($ctx) # initialize hash value
250
mov %rax,8($ctx)
251
mov %rax,16($ctx)
252
253
test $inp,$inp
254
je .Lno_key
255
___
256
$code.=<<___ if (!$kernel);
257
lea poly1305_blocks_x86_64(%rip),%r10
258
lea poly1305_emit_x86_64(%rip),%r11
259
___
260
$code.=<<___ if (!$kernel && $avx);
261
mov OPENSSL_ia32cap_P+4(%rip),%r9
262
lea poly1305_blocks_avx(%rip),%rax
263
lea poly1305_emit_avx(%rip),%rcx
264
bt \$`60-32`,%r9 # AVX?
265
cmovc %rax,%r10
266
cmovc %rcx,%r11
267
___
268
$code.=<<___ if (!$kernel && $avx>1);
269
lea poly1305_blocks_avx2(%rip),%rax
270
bt \$`5+32`,%r9 # AVX2?
271
cmovc %rax,%r10
272
___
273
$code.=<<___ if (!$kernel && $avx>3);
274
mov \$`(1<<31|1<<21|1<<16)`,%rax
275
shr \$32,%r9
276
and %rax,%r9
277
cmp %rax,%r9
278
je .Linit_base2_44
279
___
280
$code.=<<___;
281
mov \$0x0ffffffc0fffffff,%rax
282
mov \$0x0ffffffc0ffffffc,%rcx
283
and 0($inp),%rax
284
and 8($inp),%rcx
285
mov %rax,24($ctx)
286
mov %rcx,32($ctx)
287
___
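# Reference for the clamping above (a sketch, not used by the
# generator): the two masks implement the standard Poly1305 key
# clamp, clearing the top four bits of key bytes 3, 7, 11 and 15 and
# the bottom two bits of key bytes 4, 8 and 12.
sub poly1305_clamp_ref {
	my ($k0, $k1) = @_;			# first 16 key bytes as two LE 64-bit words
	return ($k0 & 0x0ffffffc0fffffff,	# r[0]
		$k1 & 0x0ffffffc0ffffffc);	# r[1]
}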
288
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
289
mov %r10,0(%rdx)
290
mov %r11,8(%rdx)
291
___
292
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
293
mov %r10d,0(%rdx)
294
mov %r11d,4(%rdx)
295
___
296
$code.=<<___;
297
mov \$1,%eax
298
.Lno_key:
299
RET
300
___
301
&end_function("poly1305_init_x86_64");
302
303
&declare_function("poly1305_blocks_x86_64", 32, 4);
304
$code.=<<___;
305
.cfi_startproc
306
.Lblocks:
307
shr \$4,$len
308
jz .Lno_data # too short
309
310
push %rbx
311
.cfi_push %rbx
312
push %r12
313
.cfi_push %r12
314
push %r13
315
.cfi_push %r13
316
push %r14
317
.cfi_push %r14
318
push %r15
319
.cfi_push %r15
320
push $ctx
321
.cfi_push $ctx
322
.Lblocks_body:
323
324
mov $len,%r15 # reassign $len
325
326
mov 24($ctx),$r0 # load r
327
mov 32($ctx),$s1
328
329
mov 0($ctx),$h0 # load hash value
330
mov 8($ctx),$h1
331
mov 16($ctx),$h2
332
333
mov $s1,$r1
334
shr \$2,$s1
335
mov $r1,%rax
336
add $r1,$s1 # s1 = r1 + (r1 >> 2)
337
jmp .Loop
338
339
.align 32
340
.Loop:
341
add 0($inp),$h0 # accumulate input
342
adc 8($inp),$h1
343
lea 16($inp),$inp
344
adc $padbit,$h2
345
___
346
347
&poly1305_iteration();
348
349
$code.=<<___;
350
mov $r1,%rax
351
dec %r15 # len-=16
352
jnz .Loop
353
354
mov 0(%rsp),$ctx
355
.cfi_restore $ctx
356
357
mov $h0,0($ctx) # store hash value
358
mov $h1,8($ctx)
359
mov $h2,16($ctx)
360
361
mov 8(%rsp),%r15
362
.cfi_restore %r15
363
mov 16(%rsp),%r14
364
.cfi_restore %r14
365
mov 24(%rsp),%r13
366
.cfi_restore %r13
367
mov 32(%rsp),%r12
368
.cfi_restore %r12
369
mov 40(%rsp),%rbx
370
.cfi_restore %rbx
371
lea 48(%rsp),%rsp
372
.cfi_adjust_cfa_offset -48
373
.Lno_data:
374
.Lblocks_epilogue:
375
RET
376
.cfi_endproc
377
___
378
&end_function("poly1305_blocks_x86_64");
379
380
&declare_function("poly1305_emit_x86_64", 32, 3);
381
$code.=<<___;
382
.Lemit:
383
mov 0($ctx),%r8 # load hash value
384
mov 8($ctx),%r9
385
mov 16($ctx),%r10
386
387
mov %r8,%rax
388
add \$5,%r8 # compare to modulus
389
mov %r9,%rcx
390
adc \$0,%r9
391
adc \$0,%r10
392
shr \$2,%r10 # did 130-bit value overflow?
393
cmovnz %r8,%rax
394
cmovnz %r9,%rcx
395
396
add 0($nonce),%rax # accumulate nonce
397
adc 8($nonce),%rcx
398
mov %rax,0($mac) # write result
399
mov %rcx,8($mac)
400
401
RET
402
___
403
&end_function("poly1305_emit_x86_64");
404
if ($avx) {
405
406
########################################################################
407
# Layout of opaque area is following.
408
#
409
# unsigned __int32 h[5]; # current hash value base 2^26
410
# unsigned __int32 is_base2_26;
411
# unsigned __int64 r[2]; # key value base 2^64
412
# unsigned __int64 pad;
413
# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
414
#
415
# where r^n are base 2^26 digits of degrees of multiplier key. There are
416
# 5 digits, but last four are interleaved with multiples of 5, totalling
417
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
418
419
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
420
map("%xmm$_",(0..15));
421
422
$code.=<<___;
423
.type __poly1305_block,\@abi-omnipotent
424
.align 32
425
__poly1305_block:
426
push $ctx
427
___
428
&poly1305_iteration();
429
$code.=<<___;
430
pop $ctx
431
RET
432
.size __poly1305_block,.-__poly1305_block
433
434
.type __poly1305_init_avx,\@abi-omnipotent
435
.align 32
436
__poly1305_init_avx:
437
push %rbp
438
mov %rsp,%rbp
439
mov $r0,$h0
440
mov $r1,$h1
441
xor $h2,$h2
442
443
lea 48+64($ctx),$ctx # size optimization
444
445
mov $r1,%rax
446
call __poly1305_block # r^2
447
448
mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
449
mov \$0x3ffffff,%edx
450
mov $h0,$d1
451
and $h0#d,%eax
452
mov $r0,$d2
453
and $r0#d,%edx
454
mov %eax,`16*0+0-64`($ctx)
455
shr \$26,$d1
456
mov %edx,`16*0+4-64`($ctx)
457
shr \$26,$d2
458
459
mov \$0x3ffffff,%eax
460
mov \$0x3ffffff,%edx
461
and $d1#d,%eax
462
and $d2#d,%edx
463
mov %eax,`16*1+0-64`($ctx)
464
lea (%rax,%rax,4),%eax # *5
465
mov %edx,`16*1+4-64`($ctx)
466
lea (%rdx,%rdx,4),%edx # *5
467
mov %eax,`16*2+0-64`($ctx)
468
shr \$26,$d1
469
mov %edx,`16*2+4-64`($ctx)
470
shr \$26,$d2
471
472
mov $h1,%rax
473
mov $r1,%rdx
474
shl \$12,%rax
475
shl \$12,%rdx
476
or $d1,%rax
477
or $d2,%rdx
478
and \$0x3ffffff,%eax
479
and \$0x3ffffff,%edx
480
mov %eax,`16*3+0-64`($ctx)
481
lea (%rax,%rax,4),%eax # *5
482
mov %edx,`16*3+4-64`($ctx)
483
lea (%rdx,%rdx,4),%edx # *5
484
mov %eax,`16*4+0-64`($ctx)
485
mov $h1,$d1
486
mov %edx,`16*4+4-64`($ctx)
487
mov $r1,$d2
488
489
mov \$0x3ffffff,%eax
490
mov \$0x3ffffff,%edx
491
shr \$14,$d1
492
shr \$14,$d2
493
and $d1#d,%eax
494
and $d2#d,%edx
495
mov %eax,`16*5+0-64`($ctx)
496
lea (%rax,%rax,4),%eax # *5
497
mov %edx,`16*5+4-64`($ctx)
498
lea (%rdx,%rdx,4),%edx # *5
499
mov %eax,`16*6+0-64`($ctx)
500
shr \$26,$d1
501
mov %edx,`16*6+4-64`($ctx)
502
shr \$26,$d2
503
504
mov $h2,%rax
505
shl \$24,%rax
506
or %rax,$d1
507
mov $d1#d,`16*7+0-64`($ctx)
508
lea ($d1,$d1,4),$d1 # *5
509
mov $d2#d,`16*7+4-64`($ctx)
510
lea ($d2,$d2,4),$d2 # *5
511
mov $d1#d,`16*8+0-64`($ctx)
512
mov $d2#d,`16*8+4-64`($ctx)
513
514
mov $r1,%rax
515
call __poly1305_block # r^3
516
517
mov \$0x3ffffff,%eax # save r^3 base 2^26
518
mov $h0,$d1
519
and $h0#d,%eax
520
shr \$26,$d1
521
mov %eax,`16*0+12-64`($ctx)
522
523
mov \$0x3ffffff,%edx
524
and $d1#d,%edx
525
mov %edx,`16*1+12-64`($ctx)
526
lea (%rdx,%rdx,4),%edx # *5
527
shr \$26,$d1
528
mov %edx,`16*2+12-64`($ctx)
529
530
mov $h1,%rax
531
shl \$12,%rax
532
or $d1,%rax
533
and \$0x3ffffff,%eax
534
mov %eax,`16*3+12-64`($ctx)
535
lea (%rax,%rax,4),%eax # *5
536
mov $h1,$d1
537
mov %eax,`16*4+12-64`($ctx)
538
539
mov \$0x3ffffff,%edx
540
shr \$14,$d1
541
and $d1#d,%edx
542
mov %edx,`16*5+12-64`($ctx)
543
lea (%rdx,%rdx,4),%edx # *5
544
shr \$26,$d1
545
mov %edx,`16*6+12-64`($ctx)
546
547
mov $h2,%rax
548
shl \$24,%rax
549
or %rax,$d1
550
mov $d1#d,`16*7+12-64`($ctx)
551
lea ($d1,$d1,4),$d1 # *5
552
mov $d1#d,`16*8+12-64`($ctx)
553
554
mov $r1,%rax
555
call __poly1305_block # r^4
556
557
mov \$0x3ffffff,%eax # save r^4 base 2^26
558
mov $h0,$d1
559
and $h0#d,%eax
560
shr \$26,$d1
561
mov %eax,`16*0+8-64`($ctx)
562
563
mov \$0x3ffffff,%edx
564
and $d1#d,%edx
565
mov %edx,`16*1+8-64`($ctx)
566
lea (%rdx,%rdx,4),%edx # *5
567
shr \$26,$d1
568
mov %edx,`16*2+8-64`($ctx)
569
570
mov $h1,%rax
571
shl \$12,%rax
572
or $d1,%rax
573
and \$0x3ffffff,%eax
574
mov %eax,`16*3+8-64`($ctx)
575
lea (%rax,%rax,4),%eax # *5
576
mov $h1,$d1
577
mov %eax,`16*4+8-64`($ctx)
578
579
mov \$0x3ffffff,%edx
580
shr \$14,$d1
581
and $d1#d,%edx
582
mov %edx,`16*5+8-64`($ctx)
583
lea (%rdx,%rdx,4),%edx # *5
584
shr \$26,$d1
585
mov %edx,`16*6+8-64`($ctx)
586
587
mov $h2,%rax
588
shl \$24,%rax
589
or %rax,$d1
590
mov $d1#d,`16*7+8-64`($ctx)
591
lea ($d1,$d1,4),$d1 # *5
592
mov $d1#d,`16*8+8-64`($ctx)
593
594
lea -48-64($ctx),$ctx # size [de-]optimization
595
pop %rbp
596
RET
597
.size __poly1305_init_avx,.-__poly1305_init_avx
598
___
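
# The table written by __poly1305_init_avx above holds r^2, r^1, r^4
# and r^3, each split into five 26-bit limbs, with limbs 1..4 also
# stored pre-multiplied by 5 for the reduction (see the layout comment
# further down).  A sketch of the limb split in plain Perl
# (illustrative only; the argument is a Math::BigInt below 2^130):
sub base2_26_split_ref {
	my $t = shift->copy;
	my @limb;
	for (0 .. 4) {
		push @limb, $t->copy->bmod(1 << 26)->numify;	# low 26 bits
		$t->brsft(26);
	}
	return @limb;				# limb[0..4], each < 2^26
}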
599
600
&declare_function("poly1305_blocks_avx", 32, 4);
601
$code.=<<___;
602
.cfi_startproc
603
mov 20($ctx),%r8d # is_base2_26
604
cmp \$128,$len
605
jae .Lblocks_avx
606
test %r8d,%r8d
607
jz .Lblocks
608
609
.Lblocks_avx:
610
and \$-16,$len
611
jz .Lno_data_avx
612
613
vzeroupper
614
615
test %r8d,%r8d
616
jz .Lbase2_64_avx
617
618
test \$31,$len
619
jz .Leven_avx
620
621
push %rbp
622
.cfi_push %rbp
623
mov %rsp,%rbp
624
push %rbx
625
.cfi_push %rbx
626
push %r12
627
.cfi_push %r12
628
push %r13
629
.cfi_push %r13
630
push %r14
631
.cfi_push %r14
632
push %r15
633
.cfi_push %r15
634
.Lblocks_avx_body:
635
636
mov $len,%r15 # reassign $len
637
638
mov 0($ctx),$d1 # load hash value
639
mov 8($ctx),$d2
640
mov 16($ctx),$h2#d
641
642
mov 24($ctx),$r0 # load r
643
mov 32($ctx),$s1
644
645
################################# base 2^26 -> base 2^64
646
mov $d1#d,$h0#d
647
and \$`-1*(1<<31)`,$d1
648
mov $d2,$r1 # borrow $r1
649
mov $d2#d,$h1#d
650
and \$`-1*(1<<31)`,$d2
651
652
shr \$6,$d1
653
shl \$52,$r1
654
add $d1,$h0
655
shr \$12,$h1
656
shr \$18,$d2
657
add $r1,$h0
658
adc $d2,$h1
659
660
mov $h2,$d1
661
shl \$40,$d1
662
shr \$24,$h2
663
add $d1,$h1
664
adc \$0,$h2 # can be partially reduced...
665
666
mov \$-4,$d2 # ... so reduce
667
mov $h2,$d1
668
and $h2,$d2
669
shr \$2,$d1
670
and \$3,$h2
671
add $d2,$d1 # =*5
672
add $d1,$h0
673
adc \$0,$h1
674
adc \$0,$h2
675
676
mov $s1,$r1
677
mov $s1,%rax
678
shr \$2,$s1
679
add $r1,$s1 # s1 = r1 + (r1 >> 2)
680
681
add 0($inp),$h0 # accumulate input
682
adc 8($inp),$h1
683
lea 16($inp),$inp
684
adc $padbit,$h2
685
686
call __poly1305_block
687
688
test $padbit,$padbit # if $padbit is zero,
689
jz .Lstore_base2_64_avx # store hash in base 2^64 format
690
691
################################# base 2^64 -> base 2^26
692
mov $h0,%rax
693
mov $h0,%rdx
694
shr \$52,$h0
695
mov $h1,$r0
696
mov $h1,$r1
697
shr \$26,%rdx
698
and \$0x3ffffff,%rax # h[0]
699
shl \$12,$r0
700
and \$0x3ffffff,%rdx # h[1]
701
shr \$14,$h1
702
or $r0,$h0
703
shl \$24,$h2
704
and \$0x3ffffff,$h0 # h[2]
705
shr \$40,$r1
706
and \$0x3ffffff,$h1 # h[3]
707
or $r1,$h2 # h[4]
708
709
sub \$16,%r15
710
jz .Lstore_base2_26_avx
711
712
vmovd %rax#d,$H0
713
vmovd %rdx#d,$H1
714
vmovd $h0#d,$H2
715
vmovd $h1#d,$H3
716
vmovd $h2#d,$H4
717
jmp .Lproceed_avx
718
719
.align 32
720
.Lstore_base2_64_avx:
721
mov $h0,0($ctx)
722
mov $h1,8($ctx)
723
mov $h2,16($ctx) # note that is_base2_26 is zeroed
724
jmp .Ldone_avx
725
726
.align 16
727
.Lstore_base2_26_avx:
728
mov %rax#d,0($ctx) # store hash value base 2^26
729
mov %rdx#d,4($ctx)
730
mov $h0#d,8($ctx)
731
mov $h1#d,12($ctx)
732
mov $h2#d,16($ctx)
733
.align 16
734
.Ldone_avx:
735
pop %r15
736
.cfi_restore %r15
737
pop %r14
738
.cfi_restore %r14
739
pop %r13
740
.cfi_restore %r13
741
pop %r12
742
.cfi_restore %r12
743
pop %rbx
744
.cfi_restore %rbx
745
pop %rbp
746
.cfi_restore %rbp
747
.Lno_data_avx:
748
.Lblocks_avx_epilogue:
749
RET
750
.cfi_endproc
751
752
.align 32
753
.Lbase2_64_avx:
754
.cfi_startproc
755
push %rbp
756
.cfi_push %rbp
757
mov %rsp,%rbp
758
push %rbx
759
.cfi_push %rbx
760
push %r12
761
.cfi_push %r12
762
push %r13
763
.cfi_push %r13
764
push %r14
765
.cfi_push %r14
766
push %r15
767
.cfi_push %r15
768
.Lbase2_64_avx_body:
769
770
mov $len,%r15 # reassign $len
771
772
mov 24($ctx),$r0 # load r
773
mov 32($ctx),$s1
774
775
mov 0($ctx),$h0 # load hash value
776
mov 8($ctx),$h1
777
mov 16($ctx),$h2#d
778
779
mov $s1,$r1
780
mov $s1,%rax
781
shr \$2,$s1
782
add $r1,$s1 # s1 = r1 + (r1 >> 2)
783
784
test \$31,$len
785
jz .Linit_avx
786
787
add 0($inp),$h0 # accumulate input
788
adc 8($inp),$h1
789
lea 16($inp),$inp
790
adc $padbit,$h2
791
sub \$16,%r15
792
793
call __poly1305_block
794
795
.Linit_avx:
796
################################# base 2^64 -> base 2^26
797
mov $h0,%rax
798
mov $h0,%rdx
799
shr \$52,$h0
800
mov $h1,$d1
801
mov $h1,$d2
802
shr \$26,%rdx
803
and \$0x3ffffff,%rax # h[0]
804
shl \$12,$d1
805
and \$0x3ffffff,%rdx # h[1]
806
shr \$14,$h1
807
or $d1,$h0
808
shl \$24,$h2
809
and \$0x3ffffff,$h0 # h[2]
810
shr \$40,$d2
811
and \$0x3ffffff,$h1 # h[3]
812
or $d2,$h2 # h[4]
813
814
vmovd %rax#d,$H0
815
vmovd %rdx#d,$H1
816
vmovd $h0#d,$H2
817
vmovd $h1#d,$H3
818
vmovd $h2#d,$H4
819
movl \$1,20($ctx) # set is_base2_26
820
821
call __poly1305_init_avx
822
823
.Lproceed_avx:
824
mov %r15,$len
825
pop %r15
826
.cfi_restore %r15
827
pop %r14
828
.cfi_restore %r14
829
pop %r13
830
.cfi_restore %r13
831
pop %r12
832
.cfi_restore %r12
833
pop %rbx
834
.cfi_restore %rbx
835
pop %rbp
836
.cfi_restore %rbp
837
.Lbase2_64_avx_epilogue:
838
jmp .Ldo_avx
839
.cfi_endproc
840
841
.align 32
842
.Leven_avx:
843
.cfi_startproc
844
vmovd 4*0($ctx),$H0 # load hash value
845
vmovd 4*1($ctx),$H1
846
vmovd 4*2($ctx),$H2
847
vmovd 4*3($ctx),$H3
848
vmovd 4*4($ctx),$H4
849
850
.Ldo_avx:
851
___
852
$code.=<<___ if (!$win64);
853
lea 8(%rsp),%r10
854
.cfi_def_cfa_register %r10
855
and \$-32,%rsp
856
sub \$-8,%rsp
857
lea -0x58(%rsp),%r11
858
sub \$0x178,%rsp
859
___
860
$code.=<<___ if ($win64);
861
lea -0xf8(%rsp),%r11
862
sub \$0x218,%rsp
863
vmovdqa %xmm6,0x50(%r11)
864
vmovdqa %xmm7,0x60(%r11)
865
vmovdqa %xmm8,0x70(%r11)
866
vmovdqa %xmm9,0x80(%r11)
867
vmovdqa %xmm10,0x90(%r11)
868
vmovdqa %xmm11,0xa0(%r11)
869
vmovdqa %xmm12,0xb0(%r11)
870
vmovdqa %xmm13,0xc0(%r11)
871
vmovdqa %xmm14,0xd0(%r11)
872
vmovdqa %xmm15,0xe0(%r11)
873
.Ldo_avx_body:
874
___
875
$code.=<<___;
876
sub \$64,$len
877
lea -32($inp),%rax
878
cmovc %rax,$inp
879
880
vmovdqu `16*3`($ctx),$D4 # preload r0^2
881
lea `16*3+64`($ctx),$ctx # size optimization
882
lea .Lconst(%rip),%rcx
883
884
################################################################
885
# load input
886
vmovdqu 16*2($inp),$T0
887
vmovdqu 16*3($inp),$T1
888
vmovdqa 64(%rcx),$MASK # .Lmask26
889
890
vpsrldq \$6,$T0,$T2 # splat input
891
vpsrldq \$6,$T1,$T3
892
vpunpckhqdq $T1,$T0,$T4 # 4
893
vpunpcklqdq $T1,$T0,$T0 # 0:1
894
vpunpcklqdq $T3,$T2,$T3 # 2:3
895
896
vpsrlq \$40,$T4,$T4 # 4
897
vpsrlq \$26,$T0,$T1
898
vpand $MASK,$T0,$T0 # 0
899
vpsrlq \$4,$T3,$T2
900
vpand $MASK,$T1,$T1 # 1
901
vpsrlq \$30,$T3,$T3
902
vpand $MASK,$T2,$T2 # 2
903
vpand $MASK,$T3,$T3 # 3
904
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
905
906
jbe .Lskip_loop_avx
907
908
# expand and copy pre-calculated table to stack
909
vmovdqu `16*1-64`($ctx),$D1
910
vmovdqu `16*2-64`($ctx),$D2
911
vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
912
vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
913
vmovdqa $D3,-0x90(%r11)
914
vmovdqa $D0,0x00(%rsp)
915
vpshufd \$0xEE,$D1,$D4
916
vmovdqu `16*3-64`($ctx),$D0
917
vpshufd \$0x44,$D1,$D1
918
vmovdqa $D4,-0x80(%r11)
919
vmovdqa $D1,0x10(%rsp)
920
vpshufd \$0xEE,$D2,$D3
921
vmovdqu `16*4-64`($ctx),$D1
922
vpshufd \$0x44,$D2,$D2
923
vmovdqa $D3,-0x70(%r11)
924
vmovdqa $D2,0x20(%rsp)
925
vpshufd \$0xEE,$D0,$D4
926
vmovdqu `16*5-64`($ctx),$D2
927
vpshufd \$0x44,$D0,$D0
928
vmovdqa $D4,-0x60(%r11)
929
vmovdqa $D0,0x30(%rsp)
930
vpshufd \$0xEE,$D1,$D3
931
vmovdqu `16*6-64`($ctx),$D0
932
vpshufd \$0x44,$D1,$D1
933
vmovdqa $D3,-0x50(%r11)
934
vmovdqa $D1,0x40(%rsp)
935
vpshufd \$0xEE,$D2,$D4
936
vmovdqu `16*7-64`($ctx),$D1
937
vpshufd \$0x44,$D2,$D2
938
vmovdqa $D4,-0x40(%r11)
939
vmovdqa $D2,0x50(%rsp)
940
vpshufd \$0xEE,$D0,$D3
941
vmovdqu `16*8-64`($ctx),$D2
942
vpshufd \$0x44,$D0,$D0
943
vmovdqa $D3,-0x30(%r11)
944
vmovdqa $D0,0x60(%rsp)
945
vpshufd \$0xEE,$D1,$D4
946
vpshufd \$0x44,$D1,$D1
947
vmovdqa $D4,-0x20(%r11)
948
vmovdqa $D1,0x70(%rsp)
949
vpshufd \$0xEE,$D2,$D3
950
vmovdqa 0x00(%rsp),$D4 # preload r0^2
951
vpshufd \$0x44,$D2,$D2
952
vmovdqa $D3,-0x10(%r11)
953
vmovdqa $D2,0x80(%rsp)
954
955
jmp .Loop_avx
956
957
.align 32
958
.Loop_avx:
959
################################################################
960
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
961
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
962
# \___________________/
963
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
964
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
965
# \___________________/ \____________________/
966
#
967
# Note that we start with inp[2:3]*r^2. This is because it
968
# doesn't depend on reduction in previous iteration.
969
################################################################
970
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
971
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
972
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
973
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
974
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
975
#
976
# though note that $Tx and $Hx are "reversed" in this section,
977
# and $D4 is preloaded with r0^2...
978
979
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
980
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
981
vmovdqa $H2,0x20(%r11) # offload hash
982
vpmuludq $T2,$D4,$D2 # d3 = h2*r0
983
vmovdqa 0x10(%rsp),$H2 # r1^2
984
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
985
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
986
987
vmovdqa $H0,0x00(%r11) #
988
vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
989
vmovdqa $H1,0x10(%r11) #
990
vpmuludq $T3,$H2,$H1 # h3*r1
991
vpaddq $H0,$D0,$D0 # d0 += h4*s1
992
vpaddq $H1,$D4,$D4 # d4 += h3*r1
993
vmovdqa $H3,0x30(%r11) #
994
vpmuludq $T2,$H2,$H0 # h2*r1
995
vpmuludq $T1,$H2,$H1 # h1*r1
996
vpaddq $H0,$D3,$D3 # d3 += h2*r1
997
vmovdqa 0x30(%rsp),$H3 # r2^2
998
vpaddq $H1,$D2,$D2 # d2 += h1*r1
999
vmovdqa $H4,0x40(%r11) #
1000
vpmuludq $T0,$H2,$H2 # h0*r1
1001
vpmuludq $T2,$H3,$H0 # h2*r2
1002
vpaddq $H2,$D1,$D1 # d1 += h0*r1
1003
1004
vmovdqa 0x40(%rsp),$H4 # s2^2
1005
vpaddq $H0,$D4,$D4 # d4 += h2*r2
1006
vpmuludq $T1,$H3,$H1 # h1*r2
1007
vpmuludq $T0,$H3,$H3 # h0*r2
1008
vpaddq $H1,$D3,$D3 # d3 += h1*r2
1009
vmovdqa 0x50(%rsp),$H2 # r3^2
1010
vpaddq $H3,$D2,$D2 # d2 += h0*r2
1011
vpmuludq $T4,$H4,$H0 # h4*s2
1012
vpmuludq $T3,$H4,$H4 # h3*s2
1013
vpaddq $H0,$D1,$D1 # d1 += h4*s2
1014
vmovdqa 0x60(%rsp),$H3 # s3^2
1015
vpaddq $H4,$D0,$D0 # d0 += h3*s2
1016
1017
vmovdqa 0x80(%rsp),$H4 # s4^2
1018
vpmuludq $T1,$H2,$H1 # h1*r3
1019
vpmuludq $T0,$H2,$H2 # h0*r3
1020
vpaddq $H1,$D4,$D4 # d4 += h1*r3
1021
vpaddq $H2,$D3,$D3 # d3 += h0*r3
1022
vpmuludq $T4,$H3,$H0 # h4*s3
1023
vpmuludq $T3,$H3,$H1 # h3*s3
1024
vpaddq $H0,$D2,$D2 # d2 += h4*s3
1025
vmovdqu 16*0($inp),$H0 # load input
1026
vpaddq $H1,$D1,$D1 # d1 += h3*s3
1027
vpmuludq $T2,$H3,$H3 # h2*s3
1028
vpmuludq $T2,$H4,$T2 # h2*s4
1029
vpaddq $H3,$D0,$D0 # d0 += h2*s3
1030
1031
vmovdqu 16*1($inp),$H1 #
1032
vpaddq $T2,$D1,$D1 # d1 += h2*s4
1033
vpmuludq $T3,$H4,$T3 # h3*s4
1034
vpmuludq $T4,$H4,$T4 # h4*s4
1035
vpsrldq \$6,$H0,$H2 # splat input
1036
vpaddq $T3,$D2,$D2 # d2 += h3*s4
1037
vpaddq $T4,$D3,$D3 # d3 += h4*s4
1038
vpsrldq \$6,$H1,$H3 #
1039
vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
1040
vpmuludq $T1,$H4,$T0 # h1*s4
1041
vpunpckhqdq $H1,$H0,$H4 # 4
1042
vpaddq $T4,$D4,$D4 # d4 += h0*r4
1043
vmovdqa -0x90(%r11),$T4 # r0^4
1044
vpaddq $T0,$D0,$D0 # d0 += h1*s4
1045
1046
vpunpcklqdq $H1,$H0,$H0 # 0:1
1047
vpunpcklqdq $H3,$H2,$H3 # 2:3
1048
1049
#vpsrlq \$40,$H4,$H4 # 4
1050
vpsrldq \$`40/8`,$H4,$H4 # 4
1051
vpsrlq \$26,$H0,$H1
1052
vpand $MASK,$H0,$H0 # 0
1053
vpsrlq \$4,$H3,$H2
1054
vpand $MASK,$H1,$H1 # 1
1055
vpand 0(%rcx),$H4,$H4 # .Lmask24
1056
vpsrlq \$30,$H3,$H3
1057
vpand $MASK,$H2,$H2 # 2
1058
vpand $MASK,$H3,$H3 # 3
1059
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1060
1061
vpaddq 0x00(%r11),$H0,$H0 # add hash value
1062
vpaddq 0x10(%r11),$H1,$H1
1063
vpaddq 0x20(%r11),$H2,$H2
1064
vpaddq 0x30(%r11),$H3,$H3
1065
vpaddq 0x40(%r11),$H4,$H4
1066
1067
lea 16*2($inp),%rax
1068
lea 16*4($inp),$inp
1069
sub \$64,$len
1070
cmovc %rax,$inp
1071
1072
################################################################
1073
# Now we accumulate (inp[0:1]+hash)*r^4
1074
################################################################
1075
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1076
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1077
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1078
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1079
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1080
1081
vpmuludq $H0,$T4,$T0 # h0*r0
1082
vpmuludq $H1,$T4,$T1 # h1*r0
1083
vpaddq $T0,$D0,$D0
1084
vpaddq $T1,$D1,$D1
1085
vmovdqa -0x80(%r11),$T2 # r1^4
1086
vpmuludq $H2,$T4,$T0 # h2*r0
1087
vpmuludq $H3,$T4,$T1 # h3*r0
1088
vpaddq $T0,$D2,$D2
1089
vpaddq $T1,$D3,$D3
1090
vpmuludq $H4,$T4,$T4 # h4*r0
1091
vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1092
vpaddq $T4,$D4,$D4
1093
1094
vpaddq $T0,$D0,$D0 # d0 += h4*s1
1095
vpmuludq $H2,$T2,$T1 # h2*r1
1096
vpmuludq $H3,$T2,$T0 # h3*r1
1097
vpaddq $T1,$D3,$D3 # d3 += h2*r1
1098
vmovdqa -0x60(%r11),$T3 # r2^4
1099
vpaddq $T0,$D4,$D4 # d4 += h3*r1
1100
vpmuludq $H1,$T2,$T1 # h1*r1
1101
vpmuludq $H0,$T2,$T2 # h0*r1
1102
vpaddq $T1,$D2,$D2 # d2 += h1*r1
1103
vpaddq $T2,$D1,$D1 # d1 += h0*r1
1104
1105
vmovdqa -0x50(%r11),$T4 # s2^4
1106
vpmuludq $H2,$T3,$T0 # h2*r2
1107
vpmuludq $H1,$T3,$T1 # h1*r2
1108
vpaddq $T0,$D4,$D4 # d4 += h2*r2
1109
vpaddq $T1,$D3,$D3 # d3 += h1*r2
1110
vmovdqa -0x40(%r11),$T2 # r3^4
1111
vpmuludq $H0,$T3,$T3 # h0*r2
1112
vpmuludq $H4,$T4,$T0 # h4*s2
1113
vpaddq $T3,$D2,$D2 # d2 += h0*r2
1114
vpaddq $T0,$D1,$D1 # d1 += h4*s2
1115
vmovdqa -0x30(%r11),$T3 # s3^4
1116
vpmuludq $H3,$T4,$T4 # h3*s2
1117
vpmuludq $H1,$T2,$T1 # h1*r3
1118
vpaddq $T4,$D0,$D0 # d0 += h3*s2
1119
1120
vmovdqa -0x10(%r11),$T4 # s4^4
1121
vpaddq $T1,$D4,$D4 # d4 += h1*r3
1122
vpmuludq $H0,$T2,$T2 # h0*r3
1123
vpmuludq $H4,$T3,$T0 # h4*s3
1124
vpaddq $T2,$D3,$D3 # d3 += h0*r3
1125
vpaddq $T0,$D2,$D2 # d2 += h4*s3
1126
vmovdqu 16*2($inp),$T0 # load input
1127
vpmuludq $H3,$T3,$T2 # h3*s3
1128
vpmuludq $H2,$T3,$T3 # h2*s3
1129
vpaddq $T2,$D1,$D1 # d1 += h3*s3
1130
vmovdqu 16*3($inp),$T1 #
1131
vpaddq $T3,$D0,$D0 # d0 += h2*s3
1132
1133
vpmuludq $H2,$T4,$H2 # h2*s4
1134
vpmuludq $H3,$T4,$H3 # h3*s4
1135
vpsrldq \$6,$T0,$T2 # splat input
1136
vpaddq $H2,$D1,$D1 # d1 += h2*s4
1137
vpmuludq $H4,$T4,$H4 # h4*s4
1138
vpsrldq \$6,$T1,$T3 #
1139
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1140
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1141
vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1142
vpmuludq $H1,$T4,$H0
1143
vpunpckhqdq $T1,$T0,$T4 # 4
1144
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1145
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1146
1147
vpunpcklqdq $T1,$T0,$T0 # 0:1
1148
vpunpcklqdq $T3,$T2,$T3 # 2:3
1149
1150
#vpsrlq \$40,$T4,$T4 # 4
1151
vpsrldq \$`40/8`,$T4,$T4 # 4
1152
vpsrlq \$26,$T0,$T1
1153
vmovdqa 0x00(%rsp),$D4 # preload r0^2
1154
vpand $MASK,$T0,$T0 # 0
1155
vpsrlq \$4,$T3,$T2
1156
vpand $MASK,$T1,$T1 # 1
1157
vpand 0(%rcx),$T4,$T4 # .Lmask24
1158
vpsrlq \$30,$T3,$T3
1159
vpand $MASK,$T2,$T2 # 2
1160
vpand $MASK,$T3,$T3 # 3
1161
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1162
1163
################################################################
1164
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1165
# and P. Schwabe
1166
1167
vpsrlq \$26,$H3,$D3
1168
vpand $MASK,$H3,$H3
1169
vpaddq $D3,$H4,$H4 # h3 -> h4
1170
1171
vpsrlq \$26,$H0,$D0
1172
vpand $MASK,$H0,$H0
1173
vpaddq $D0,$D1,$H1 # h0 -> h1
1174
1175
vpsrlq \$26,$H4,$D0
1176
vpand $MASK,$H4,$H4
1177
1178
vpsrlq \$26,$H1,$D1
1179
vpand $MASK,$H1,$H1
1180
vpaddq $D1,$H2,$H2 # h1 -> h2
1181
1182
vpaddq $D0,$H0,$H0
1183
vpsllq \$2,$D0,$D0
1184
vpaddq $D0,$H0,$H0 # h4 -> h0
1185
1186
vpsrlq \$26,$H2,$D2
1187
vpand $MASK,$H2,$H2
1188
vpaddq $D2,$H3,$H3 # h2 -> h3
1189
1190
vpsrlq \$26,$H0,$D0
1191
vpand $MASK,$H0,$H0
1192
vpaddq $D0,$H1,$H1 # h0 -> h1
1193
1194
vpsrlq \$26,$H3,$D3
1195
vpand $MASK,$H3,$H3
1196
vpaddq $D3,$H4,$H4 # h3 -> h4
1197
1198
ja .Loop_avx
1199
1200
.Lskip_loop_avx:
1201
################################################################
1202
# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1203
1204
vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1205
add \$32,$len
1206
jnz .Long_tail_avx
1207
1208
vpaddq $H2,$T2,$T2
1209
vpaddq $H0,$T0,$T0
1210
vpaddq $H1,$T1,$T1
1211
vpaddq $H3,$T3,$T3
1212
vpaddq $H4,$T4,$T4
1213
1214
.Long_tail_avx:
1215
vmovdqa $H2,0x20(%r11)
1216
vmovdqa $H0,0x00(%r11)
1217
vmovdqa $H1,0x10(%r11)
1218
vmovdqa $H3,0x30(%r11)
1219
vmovdqa $H4,0x40(%r11)
1220
1221
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1222
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1223
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1224
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1225
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1226
1227
vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1228
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1229
vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1230
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1231
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1232
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1233
1234
vpmuludq $T3,$H2,$H0 # h3*r1
1235
vpaddq $H0,$D4,$D4 # d4 += h3*r1
1236
vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1237
vpmuludq $T2,$H2,$H1 # h2*r1
1238
vpaddq $H1,$D3,$D3 # d3 += h2*r1
1239
vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1240
vpmuludq $T1,$H2,$H0 # h1*r1
1241
vpaddq $H0,$D2,$D2 # d2 += h1*r1
1242
vpmuludq $T0,$H2,$H2 # h0*r1
1243
vpaddq $H2,$D1,$D1 # d1 += h0*r1
1244
vpmuludq $T4,$H3,$H3 # h4*s1
1245
vpaddq $H3,$D0,$D0 # d0 += h4*s1
1246
1247
vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1248
vpmuludq $T2,$H4,$H1 # h2*r2
1249
vpaddq $H1,$D4,$D4 # d4 += h2*r2
1250
vpmuludq $T1,$H4,$H0 # h1*r2
1251
vpaddq $H0,$D3,$D3 # d3 += h1*r2
1252
vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1253
vpmuludq $T0,$H4,$H4 # h0*r2
1254
vpaddq $H4,$D2,$D2 # d2 += h0*r2
1255
vpmuludq $T4,$H2,$H1 # h4*s2
1256
vpaddq $H1,$D1,$D1 # d1 += h4*s2
1257
vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1258
vpmuludq $T3,$H2,$H2 # h3*s2
1259
vpaddq $H2,$D0,$D0 # d0 += h3*s2
1260
1261
vpmuludq $T1,$H3,$H0 # h1*r3
1262
vpaddq $H0,$D4,$D4 # d4 += h1*r3
1263
vpmuludq $T0,$H3,$H3 # h0*r3
1264
vpaddq $H3,$D3,$D3 # d3 += h0*r3
1265
vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1266
vpmuludq $T4,$H4,$H1 # h4*s3
1267
vpaddq $H1,$D2,$D2 # d2 += h4*s3
1268
vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1269
vpmuludq $T3,$H4,$H0 # h3*s3
1270
vpaddq $H0,$D1,$D1 # d1 += h3*s3
1271
vpmuludq $T2,$H4,$H4 # h2*s3
1272
vpaddq $H4,$D0,$D0 # d0 += h2*s3
1273
1274
vpmuludq $T0,$H2,$H2 # h0*r4
1275
vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1276
vpmuludq $T4,$H3,$H1 # h4*s4
1277
vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1278
vpmuludq $T3,$H3,$H0 # h3*s4
1279
vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1280
vpmuludq $T2,$H3,$H1 # h2*s4
1281
vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1282
vpmuludq $T1,$H3,$H3 # h1*s4
1283
vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1284
1285
jz .Lshort_tail_avx
1286
1287
vmovdqu 16*0($inp),$H0 # load input
1288
vmovdqu 16*1($inp),$H1
1289
1290
vpsrldq \$6,$H0,$H2 # splat input
1291
vpsrldq \$6,$H1,$H3
1292
vpunpckhqdq $H1,$H0,$H4 # 4
1293
vpunpcklqdq $H1,$H0,$H0 # 0:1
1294
vpunpcklqdq $H3,$H2,$H3 # 2:3
1295
1296
vpsrlq \$40,$H4,$H4 # 4
1297
vpsrlq \$26,$H0,$H1
1298
vpand $MASK,$H0,$H0 # 0
1299
vpsrlq \$4,$H3,$H2
1300
vpand $MASK,$H1,$H1 # 1
1301
vpsrlq \$30,$H3,$H3
1302
vpand $MASK,$H2,$H2 # 2
1303
vpand $MASK,$H3,$H3 # 3
1304
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1305
1306
vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1307
vpaddq 0x00(%r11),$H0,$H0
1308
vpaddq 0x10(%r11),$H1,$H1
1309
vpaddq 0x20(%r11),$H2,$H2
1310
vpaddq 0x30(%r11),$H3,$H3
1311
vpaddq 0x40(%r11),$H4,$H4
1312
1313
################################################################
1314
# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1315
1316
vpmuludq $H0,$T4,$T0 # h0*r0
1317
vpaddq $T0,$D0,$D0 # d0 += h0*r0
1318
vpmuludq $H1,$T4,$T1 # h1*r0
1319
vpaddq $T1,$D1,$D1 # d1 += h1*r0
1320
vpmuludq $H2,$T4,$T0 # h2*r0
1321
vpaddq $T0,$D2,$D2 # d2 += h2*r0
1322
vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1323
vpmuludq $H3,$T4,$T1 # h3*r0
1324
vpaddq $T1,$D3,$D3 # d3 += h3*r0
1325
vpmuludq $H4,$T4,$T4 # h4*r0
1326
vpaddq $T4,$D4,$D4 # d4 += h4*r0
1327
1328
vpmuludq $H3,$T2,$T0 # h3*r1
1329
vpaddq $T0,$D4,$D4 # d4 += h3*r1
1330
vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1331
vpmuludq $H2,$T2,$T1 # h2*r1
1332
vpaddq $T1,$D3,$D3 # d3 += h2*r1
1333
vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1334
vpmuludq $H1,$T2,$T0 # h1*r1
1335
vpaddq $T0,$D2,$D2 # d2 += h1*r1
1336
vpmuludq $H0,$T2,$T2 # h0*r1
1337
vpaddq $T2,$D1,$D1 # d1 += h0*r1
1338
vpmuludq $H4,$T3,$T3 # h4*s1
1339
vpaddq $T3,$D0,$D0 # d0 += h4*s1
1340
1341
vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1342
vpmuludq $H2,$T4,$T1 # h2*r2
1343
vpaddq $T1,$D4,$D4 # d4 += h2*r2
1344
vpmuludq $H1,$T4,$T0 # h1*r2
1345
vpaddq $T0,$D3,$D3 # d3 += h1*r2
1346
vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1347
vpmuludq $H0,$T4,$T4 # h0*r2
1348
vpaddq $T4,$D2,$D2 # d2 += h0*r2
1349
vpmuludq $H4,$T2,$T1 # h4*s2
1350
vpaddq $T1,$D1,$D1 # d1 += h4*s2
1351
vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1352
vpmuludq $H3,$T2,$T2 # h3*s2
1353
vpaddq $T2,$D0,$D0 # d0 += h3*s2
1354
1355
vpmuludq $H1,$T3,$T0 # h1*r3
1356
vpaddq $T0,$D4,$D4 # d4 += h1*r3
1357
vpmuludq $H0,$T3,$T3 # h0*r3
1358
vpaddq $T3,$D3,$D3 # d3 += h0*r3
1359
vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1360
vpmuludq $H4,$T4,$T1 # h4*s3
1361
vpaddq $T1,$D2,$D2 # d2 += h4*s3
1362
vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1363
vpmuludq $H3,$T4,$T0 # h3*s3
1364
vpaddq $T0,$D1,$D1 # d1 += h3*s3
1365
vpmuludq $H2,$T4,$T4 # h2*s3
1366
vpaddq $T4,$D0,$D0 # d0 += h2*s3
1367
1368
vpmuludq $H0,$T2,$T2 # h0*r4
1369
vpaddq $T2,$D4,$D4 # d4 += h0*r4
1370
vpmuludq $H4,$T3,$T1 # h4*s4
1371
vpaddq $T1,$D3,$D3 # d3 += h4*s4
1372
vpmuludq $H3,$T3,$T0 # h3*s4
1373
vpaddq $T0,$D2,$D2 # d2 += h3*s4
1374
vpmuludq $H2,$T3,$T1 # h2*s4
1375
vpaddq $T1,$D1,$D1 # d1 += h2*s4
1376
vpmuludq $H1,$T3,$T3 # h1*s4
1377
vpaddq $T3,$D0,$D0 # d0 += h1*s4
1378
1379
.Lshort_tail_avx:
1380
################################################################
1381
# horizontal addition
1382
1383
vpsrldq \$8,$D4,$T4
1384
vpsrldq \$8,$D3,$T3
1385
vpsrldq \$8,$D1,$T1
1386
vpsrldq \$8,$D0,$T0
1387
vpsrldq \$8,$D2,$T2
1388
vpaddq $T3,$D3,$D3
1389
vpaddq $T4,$D4,$D4
1390
vpaddq $T0,$D0,$D0
1391
vpaddq $T1,$D1,$D1
1392
vpaddq $T2,$D2,$D2
1393
1394
################################################################
1395
# lazy reduction
1396
1397
vpsrlq \$26,$D3,$H3
1398
vpand $MASK,$D3,$D3
1399
vpaddq $H3,$D4,$D4 # h3 -> h4
1400
1401
vpsrlq \$26,$D0,$H0
1402
vpand $MASK,$D0,$D0
1403
vpaddq $H0,$D1,$D1 # h0 -> h1
1404
1405
vpsrlq \$26,$D4,$H4
1406
vpand $MASK,$D4,$D4
1407
1408
vpsrlq \$26,$D1,$H1
1409
vpand $MASK,$D1,$D1
1410
vpaddq $H1,$D2,$D2 # h1 -> h2
1411
1412
vpaddq $H4,$D0,$D0
1413
vpsllq \$2,$H4,$H4
1414
vpaddq $H4,$D0,$D0 # h4 -> h0
1415
1416
vpsrlq \$26,$D2,$H2
1417
vpand $MASK,$D2,$D2
1418
vpaddq $H2,$D3,$D3 # h2 -> h3
1419
1420
vpsrlq \$26,$D0,$H0
1421
vpand $MASK,$D0,$D0
1422
vpaddq $H0,$D1,$D1 # h0 -> h1
1423
1424
vpsrlq \$26,$D3,$H3
1425
vpand $MASK,$D3,$D3
1426
vpaddq $H3,$D4,$D4 # h3 -> h4
1427
1428
vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1429
vmovd $D1,`4*1-48-64`($ctx)
1430
vmovd $D2,`4*2-48-64`($ctx)
1431
vmovd $D3,`4*3-48-64`($ctx)
1432
vmovd $D4,`4*4-48-64`($ctx)
1433
___
1434
$code.=<<___ if ($win64);
1435
vmovdqa 0x50(%r11),%xmm6
1436
vmovdqa 0x60(%r11),%xmm7
1437
vmovdqa 0x70(%r11),%xmm8
1438
vmovdqa 0x80(%r11),%xmm9
1439
vmovdqa 0x90(%r11),%xmm10
1440
vmovdqa 0xa0(%r11),%xmm11
1441
vmovdqa 0xb0(%r11),%xmm12
1442
vmovdqa 0xc0(%r11),%xmm13
1443
vmovdqa 0xd0(%r11),%xmm14
1444
vmovdqa 0xe0(%r11),%xmm15
1445
lea 0xf8(%r11),%rsp
1446
.Ldo_avx_epilogue:
1447
___
1448
$code.=<<___ if (!$win64);
1449
lea -8(%r10),%rsp
1450
.cfi_def_cfa_register %rsp
1451
___
1452
$code.=<<___;
1453
vzeroupper
1454
RET
1455
.cfi_endproc
1456
___
1457
&end_function("poly1305_blocks_avx");
1458
1459
&declare_function("poly1305_emit_avx", 32, 3);
1460
$code.=<<___;
1461
cmpl \$0,20($ctx) # is_base2_26?
1462
je .Lemit
1463
1464
mov 0($ctx),%eax # load hash value base 2^26
1465
mov 4($ctx),%ecx
1466
mov 8($ctx),%r8d
1467
mov 12($ctx),%r11d
1468
mov 16($ctx),%r10d
1469
1470
shl \$26,%rcx # base 2^26 -> base 2^64
1471
mov %r8,%r9
1472
shl \$52,%r8
1473
add %rcx,%rax
1474
shr \$12,%r9
1475
add %rax,%r8 # h0
1476
adc \$0,%r9
1477
1478
shl \$14,%r11
1479
mov %r10,%rax
1480
shr \$24,%r10
1481
add %r11,%r9
1482
shl \$40,%rax
1483
add %rax,%r9 # h1
1484
adc \$0,%r10 # h2
1485
1486
mov %r10,%rax # could be partially reduced, so reduce
1487
mov %r10,%rcx
1488
and \$3,%r10
1489
shr \$2,%rax
1490
and \$-4,%rcx
1491
add %rcx,%rax
1492
add %rax,%r8
1493
adc \$0,%r9
1494
adc \$0,%r10
1495
1496
mov %r8,%rax
1497
add \$5,%r8 # compare to modulus
1498
mov %r9,%rcx
1499
adc \$0,%r9
1500
adc \$0,%r10
1501
shr \$2,%r10 # did 130-bit value overflow?
1502
cmovnz %r8,%rax
1503
cmovnz %r9,%rcx
1504
1505
add 0($nonce),%rax # accumulate nonce
1506
adc 8($nonce),%rcx
1507
mov %rax,0($mac) # write result
1508
mov %rcx,8($mac)
1509
1510
RET
1511
___
1512
&end_function("poly1305_emit_avx");
1513
1514
if ($avx>1) {
1515
1516
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1517
map("%ymm$_",(0..15));
1518
my $S4=$MASK;
1519
1520
sub poly1305_blocks_avxN {
1521
my ($avx512) = @_;
1522
my $suffix = $avx512 ? "_avx512" : "";
1523
$code.=<<___;
1524
.cfi_startproc
1525
mov 20($ctx),%r8d # is_base2_26
1526
cmp \$128,$len
1527
jae .Lblocks_avx2$suffix
1528
test %r8d,%r8d
1529
jz .Lblocks
1530
1531
.Lblocks_avx2$suffix:
1532
and \$-16,$len
1533
jz .Lno_data_avx2$suffix
1534
1535
vzeroupper
1536
1537
test %r8d,%r8d
1538
jz .Lbase2_64_avx2$suffix
1539
1540
test \$63,$len
1541
jz .Leven_avx2$suffix
1542
1543
push %rbp
1544
.cfi_push %rbp
1545
mov %rsp,%rbp
1546
push %rbx
1547
.cfi_push %rbx
1548
push %r12
1549
.cfi_push %r12
1550
push %r13
1551
.cfi_push %r13
1552
push %r14
1553
.cfi_push %r14
1554
push %r15
1555
.cfi_push %r15
1556
.Lblocks_avx2_body$suffix:
1557
1558
mov $len,%r15 # reassign $len
1559
1560
mov 0($ctx),$d1 # load hash value
1561
mov 8($ctx),$d2
1562
mov 16($ctx),$h2#d
1563
1564
mov 24($ctx),$r0 # load r
1565
mov 32($ctx),$s1
1566
1567
################################# base 2^26 -> base 2^64
1568
mov $d1#d,$h0#d
1569
and \$`-1*(1<<31)`,$d1
1570
mov $d2,$r1 # borrow $r1
1571
mov $d2#d,$h1#d
1572
and \$`-1*(1<<31)`,$d2
1573
1574
shr \$6,$d1
1575
shl \$52,$r1
1576
add $d1,$h0
1577
shr \$12,$h1
1578
shr \$18,$d2
1579
add $r1,$h0
1580
adc $d2,$h1
1581
1582
mov $h2,$d1
1583
shl \$40,$d1
1584
shr \$24,$h2
1585
add $d1,$h1
1586
adc \$0,$h2 # can be partially reduced...
1587
1588
mov \$-4,$d2 # ... so reduce
1589
mov $h2,$d1
1590
and $h2,$d2
1591
shr \$2,$d1
1592
and \$3,$h2
1593
add $d2,$d1 # =*5
1594
add $d1,$h0
1595
adc \$0,$h1
1596
adc \$0,$h2
1597
1598
mov $s1,$r1
1599
mov $s1,%rax
1600
shr \$2,$s1
1601
add $r1,$s1 # s1 = r1 + (r1 >> 2)
1602
1603
.Lbase2_26_pre_avx2$suffix:
1604
add 0($inp),$h0 # accumulate input
1605
adc 8($inp),$h1
1606
lea 16($inp),$inp
1607
adc $padbit,$h2
1608
sub \$16,%r15
1609
1610
call __poly1305_block
1611
mov $r1,%rax
1612
1613
test \$63,%r15
1614
jnz .Lbase2_26_pre_avx2$suffix
1615
1616
test $padbit,$padbit # if $padbit is zero,
1617
jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
1618
1619
################################# base 2^64 -> base 2^26
1620
mov $h0,%rax
1621
mov $h0,%rdx
1622
shr \$52,$h0
1623
mov $h1,$r0
1624
mov $h1,$r1
1625
shr \$26,%rdx
1626
and \$0x3ffffff,%rax # h[0]
1627
shl \$12,$r0
1628
and \$0x3ffffff,%rdx # h[1]
1629
shr \$14,$h1
1630
or $r0,$h0
1631
shl \$24,$h2
1632
and \$0x3ffffff,$h0 # h[2]
1633
shr \$40,$r1
1634
and \$0x3ffffff,$h1 # h[3]
1635
or $r1,$h2 # h[4]
1636
1637
test %r15,%r15
1638
jz .Lstore_base2_26_avx2$suffix
1639
1640
vmovd %rax#d,%x#$H0
1641
vmovd %rdx#d,%x#$H1
1642
vmovd $h0#d,%x#$H2
1643
vmovd $h1#d,%x#$H3
1644
vmovd $h2#d,%x#$H4
1645
jmp .Lproceed_avx2$suffix
1646
1647
.align 32
1648
.Lstore_base2_64_avx2$suffix:
1649
mov $h0,0($ctx)
1650
mov $h1,8($ctx)
1651
mov $h2,16($ctx) # note that is_base2_26 is zeroed
1652
jmp .Ldone_avx2$suffix
1653
1654
.align 16
1655
.Lstore_base2_26_avx2$suffix:
1656
mov %rax#d,0($ctx) # store hash value base 2^26
1657
mov %rdx#d,4($ctx)
1658
mov $h0#d,8($ctx)
1659
mov $h1#d,12($ctx)
1660
mov $h2#d,16($ctx)
1661
.align 16
1662
.Ldone_avx2$suffix:
1663
pop %r15
1664
.cfi_restore %r15
1665
pop %r14
1666
.cfi_restore %r14
1667
pop %r13
1668
.cfi_restore %r13
1669
pop %r12
1670
.cfi_restore %r12
1671
pop %rbx
1672
.cfi_restore %rbx
1673
pop %rbp
1674
.cfi_restore %rbp
1675
.Lno_data_avx2$suffix:
1676
.Lblocks_avx2_epilogue$suffix:
1677
RET
1678
.cfi_endproc
1679
1680
.align 32
1681
.Lbase2_64_avx2$suffix:
1682
.cfi_startproc
1683
push %rbp
1684
.cfi_push %rbp
1685
mov %rsp,%rbp
1686
push %rbx
1687
.cfi_push %rbx
1688
push %r12
1689
.cfi_push %r12
1690
push %r13
1691
.cfi_push %r13
1692
push %r14
1693
.cfi_push %r14
1694
push %r15
1695
.cfi_push %r15
1696
.Lbase2_64_avx2_body$suffix:
1697
1698
mov $len,%r15 # reassign $len
1699
1700
mov 24($ctx),$r0 # load r
1701
mov 32($ctx),$s1
1702
1703
mov 0($ctx),$h0 # load hash value
1704
mov 8($ctx),$h1
1705
mov 16($ctx),$h2#d
1706
1707
mov $s1,$r1
1708
mov $s1,%rax
1709
shr \$2,$s1
1710
add $r1,$s1 # s1 = r1 + (r1 >> 2)
1711
1712
test \$63,$len
1713
jz .Linit_avx2$suffix
1714
1715
.Lbase2_64_pre_avx2$suffix:
1716
add 0($inp),$h0 # accumulate input
1717
adc 8($inp),$h1
1718
lea 16($inp),$inp
1719
adc $padbit,$h2
1720
sub \$16,%r15
1721
1722
call __poly1305_block
1723
mov $r1,%rax
1724
1725
test \$63,%r15
1726
jnz .Lbase2_64_pre_avx2$suffix
1727
1728
.Linit_avx2$suffix:
1729
################################# base 2^64 -> base 2^26
1730
mov $h0,%rax
1731
mov $h0,%rdx
1732
shr \$52,$h0
1733
mov $h1,$d1
1734
mov $h1,$d2
1735
shr \$26,%rdx
1736
and \$0x3ffffff,%rax # h[0]
1737
shl \$12,$d1
1738
and \$0x3ffffff,%rdx # h[1]
1739
shr \$14,$h1
1740
or $d1,$h0
1741
shl \$24,$h2
1742
and \$0x3ffffff,$h0 # h[2]
1743
shr \$40,$d2
1744
and \$0x3ffffff,$h1 # h[3]
1745
or $d2,$h2 # h[4]
1746
1747
vmovd %rax#d,%x#$H0
1748
vmovd %rdx#d,%x#$H1
1749
vmovd $h0#d,%x#$H2
1750
vmovd $h1#d,%x#$H3
1751
vmovd $h2#d,%x#$H4
1752
movl \$1,20($ctx) # set is_base2_26
1753
1754
call __poly1305_init_avx
1755
1756
.Lproceed_avx2$suffix:
1757
mov %r15,$len # restore $len
1758
___
1759
$code.=<<___ if (!$kernel);
1760
mov OPENSSL_ia32cap_P+8(%rip),%r9d
1761
mov \$`(1<<31|1<<30|1<<16)`,%r11d
1762
___
1763
$code.=<<___;
1764
pop %r15
1765
.cfi_restore %r15
1766
pop %r14
1767
.cfi_restore %r14
1768
pop %r13
1769
.cfi_restore %r13
1770
pop %r12
1771
.cfi_restore %r12
1772
pop %rbx
1773
.cfi_restore %rbx
1774
pop %rbp
1775
.cfi_restore %rbp
1776
.Lbase2_64_avx2_epilogue$suffix:
1777
jmp .Ldo_avx2$suffix
1778
.cfi_endproc
1779
1780
.align 32
1781
.Leven_avx2$suffix:
1782
.cfi_startproc
1783
___
1784
$code.=<<___ if (!$kernel);
1785
mov OPENSSL_ia32cap_P+8(%rip),%r9d
1786
___
1787
$code.=<<___;
1788
vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1789
vmovd 4*1($ctx),%x#$H1
1790
vmovd 4*2($ctx),%x#$H2
1791
vmovd 4*3($ctx),%x#$H3
1792
vmovd 4*4($ctx),%x#$H4
1793
1794
.Ldo_avx2$suffix:
1795
___
1796
$code.=<<___ if (!$kernel && $avx>2);
1797
cmp \$512,$len
1798
jb .Lskip_avx512
1799
and %r11d,%r9d
1800
test \$`1<<16`,%r9d # check for AVX512F
1801
jnz .Lblocks_avx512
1802
.Lskip_avx512$suffix:
1803
___
1804
$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1805
cmp \$512,$len
1806
jae .Lblocks_avx512
1807
___
1808
$code.=<<___ if (!$win64);
1809
lea 8(%rsp),%r10
1810
.cfi_def_cfa_register %r10
1811
sub \$0x128,%rsp
1812
___
1813
$code.=<<___ if ($win64);
1814
lea 8(%rsp),%r10
1815
sub \$0x1c8,%rsp
1816
vmovdqa %xmm6,-0xb0(%r10)
1817
vmovdqa %xmm7,-0xa0(%r10)
1818
vmovdqa %xmm8,-0x90(%r10)
1819
vmovdqa %xmm9,-0x80(%r10)
1820
vmovdqa %xmm10,-0x70(%r10)
1821
vmovdqa %xmm11,-0x60(%r10)
1822
vmovdqa %xmm12,-0x50(%r10)
1823
vmovdqa %xmm13,-0x40(%r10)
1824
vmovdqa %xmm14,-0x30(%r10)
1825
vmovdqa %xmm15,-0x20(%r10)
1826
.Ldo_avx2_body$suffix:
1827
___
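# In the 4-way loop generated below, four 16-byte blocks are processed
# per iteration: the running hash is added into the first lane, every
# lane is multiplied by r^4, and the final iteration multiplies the
# lanes by r^4, r^3, r^2, r^1 and sums them (see the diagram in
# .Loop_avx2 below).  An equivalent scalar sketch over p = 2^130-5
# (illustrative only; @m are Math::BigInt blocks with the pad bit
# already included, and their count is a multiple of 4):
use Math::BigInt;
sub poly1305_4way_ref {
	my ($h, $r, @m) = @_;
	my $p  = Math::BigInt->new(1)->blsft(130)->bsub(5);
	my $r4 = $r->copy->bpow(4)->bmod($p);
	my @acc = ($h + $m[0], $m[1], $m[2], $m[3]);
	for (my $i = 4; $i < @m; $i += 4) {
		$acc[$_] = ($acc[$_] * $r4 + $m[$i+$_]) % $p for 0 .. 3;
	}
	my $sum = Math::BigInt->bzero;
	$sum->badd($acc[$_] * $r->copy->bpow(4 - $_)->bmod($p)) for 0 .. 3;
	return $sum % $p;
}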
1828
$code.=<<___;
1829
lea .Lconst(%rip),%rcx
1830
lea 48+64($ctx),$ctx # size optimization
1831
vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1832
1833
# expand and copy pre-calculated table to stack
1834
vmovdqu `16*0-64`($ctx),%x#$T2
1835
and \$-512,%rsp
1836
vmovdqu `16*1-64`($ctx),%x#$T3
1837
vmovdqu `16*2-64`($ctx),%x#$T4
1838
vmovdqu `16*3-64`($ctx),%x#$D0
1839
vmovdqu `16*4-64`($ctx),%x#$D1
1840
vmovdqu `16*5-64`($ctx),%x#$D2
1841
lea 0x90(%rsp),%rax # size optimization
1842
vmovdqu `16*6-64`($ctx),%x#$D3
1843
vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1844
vmovdqu `16*7-64`($ctx),%x#$D4
1845
vpermd $T3,$T0,$T3
1846
vmovdqu `16*8-64`($ctx),%x#$MASK
1847
vpermd $T4,$T0,$T4
1848
vmovdqa $T2,0x00(%rsp)
1849
vpermd $D0,$T0,$D0
1850
vmovdqa $T3,0x20-0x90(%rax)
1851
vpermd $D1,$T0,$D1
1852
vmovdqa $T4,0x40-0x90(%rax)
1853
vpermd $D2,$T0,$D2
1854
vmovdqa $D0,0x60-0x90(%rax)
1855
vpermd $D3,$T0,$D3
1856
vmovdqa $D1,0x80-0x90(%rax)
1857
vpermd $D4,$T0,$D4
1858
vmovdqa $D2,0xa0-0x90(%rax)
1859
vpermd $MASK,$T0,$MASK
1860
vmovdqa $D3,0xc0-0x90(%rax)
1861
vmovdqa $D4,0xe0-0x90(%rax)
1862
vmovdqa $MASK,0x100-0x90(%rax)
1863
vmovdqa 64(%rcx),$MASK # .Lmask26
1864
1865
################################################################
1866
# load input
1867
vmovdqu 16*0($inp),%x#$T0
1868
vmovdqu 16*1($inp),%x#$T1
1869
vinserti128 \$1,16*2($inp),$T0,$T0
1870
vinserti128 \$1,16*3($inp),$T1,$T1
1871
lea 16*4($inp),$inp
1872
1873
vpsrldq \$6,$T0,$T2 # splat input
1874
vpsrldq \$6,$T1,$T3
1875
vpunpckhqdq $T1,$T0,$T4 # 4
1876
vpunpcklqdq $T3,$T2,$T2 # 2:3
1877
vpunpcklqdq $T1,$T0,$T0 # 0:1
1878
1879
vpsrlq \$30,$T2,$T3
1880
vpsrlq \$4,$T2,$T2
1881
vpsrlq \$26,$T0,$T1
1882
vpsrlq \$40,$T4,$T4 # 4
1883
vpand $MASK,$T2,$T2 # 2
1884
vpand $MASK,$T0,$T0 # 0
1885
vpand $MASK,$T1,$T1 # 1
1886
vpand $MASK,$T3,$T3 # 3
1887
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1888
1889
vpaddq $H2,$T2,$H2 # accumulate input
1890
sub \$64,$len
1891
jz .Ltail_avx2$suffix
1892
jmp .Loop_avx2$suffix
1893
1894
.align 32
1895
.Loop_avx2$suffix:
1896
################################################################
1897
# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1898
# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1899
# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1900
# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1901
# \________/\__________/
1902
################################################################
1903
#vpaddq $H2,$T2,$H2 # accumulate input
1904
vpaddq $H0,$T0,$H0
1905
vmovdqa `32*0`(%rsp),$T0 # r0^4
1906
vpaddq $H1,$T1,$H1
1907
vmovdqa `32*1`(%rsp),$T1 # r1^4
1908
vpaddq $H3,$T3,$H3
1909
vmovdqa `32*3`(%rsp),$T2 # r2^4
1910
vpaddq $H4,$T4,$H4
1911
vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1912
vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1913
1914
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1915
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1916
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1917
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1918
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1919
#
1920
# however, as h2 is "chronologically" first one available pull
1921
# corresponding operations up, so it's
1922
#
1923
# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1924
# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1925
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1926
# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1927
# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1928
1929
vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1930
vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1931
vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1932
vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1933
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1934
1935
vpmuludq $H0,$T1,$T4 # h0*r1
1936
vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1937
vpaddq $T4,$D1,$D1 # d1 += h0*r1
1938
vpaddq $H2,$D2,$D2 # d2 += h1*r1
1939
vpmuludq $H3,$T1,$T4 # h3*r1
1940
vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1941
vpaddq $T4,$D4,$D4 # d4 += h3*r1
1942
vpaddq $H2,$D0,$D0 # d0 += h4*s1
1943
vmovdqa `32*4-0x90`(%rax),$T1 # s2
1944
1945
vpmuludq $H0,$T0,$T4 # h0*r0
1946
vpmuludq $H1,$T0,$H2 # h1*r0
1947
vpaddq $T4,$D0,$D0 # d0 += h0*r0
1948
vpaddq $H2,$D1,$D1 # d1 += h1*r0
1949
vpmuludq $H3,$T0,$T4 # h3*r0
1950
vpmuludq $H4,$T0,$H2 # h4*r0
1951
vmovdqu 16*0($inp),%x#$T0 # load input
1952
vpaddq $T4,$D3,$D3 # d3 += h3*r0
1953
vpaddq $H2,$D4,$D4 # d4 += h4*r0
1954
vinserti128 \$1,16*2($inp),$T0,$T0
1955
1956
vpmuludq $H3,$T1,$T4 # h3*s2
1957
vpmuludq $H4,$T1,$H2 # h4*s2
1958
vmovdqu 16*1($inp),%x#$T1
1959
vpaddq $T4,$D0,$D0 # d0 += h3*s2
1960
vpaddq $H2,$D1,$D1 # d1 += h4*s2
1961
vmovdqa `32*5-0x90`(%rax),$H2 # r3
1962
vpmuludq $H1,$T2,$T4 # h1*r2
1963
vpmuludq $H0,$T2,$T2 # h0*r2
1964
vpaddq $T4,$D3,$D3 # d3 += h1*r2
1965
vpaddq $T2,$D2,$D2 # d2 += h0*r2
1966
vinserti128 \$1,16*3($inp),$T1,$T1
1967
lea 16*4($inp),$inp
1968
1969
vpmuludq $H1,$H2,$T4 # h1*r3
1970
vpmuludq $H0,$H2,$H2 # h0*r3
1971
vpsrldq \$6,$T0,$T2 # splat input
1972
vpaddq $T4,$D4,$D4 # d4 += h1*r3
1973
vpaddq $H2,$D3,$D3 # d3 += h0*r3
1974
vpmuludq $H3,$T3,$T4 # h3*s3
1975
vpmuludq $H4,$T3,$H2 # h4*s3
1976
vpsrldq \$6,$T1,$T3
1977
vpaddq $T4,$D1,$D1 # d1 += h3*s3
1978
vpaddq $H2,$D2,$D2 # d2 += h4*s3
1979
vpunpckhqdq $T1,$T0,$T4 # 4
1980
1981
vpmuludq $H3,$S4,$H3 # h3*s4
1982
vpmuludq $H4,$S4,$H4 # h4*s4
1983
vpunpcklqdq $T1,$T0,$T0 # 0:1
1984
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
1985
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
1986
vpunpcklqdq $T3,$T2,$T3 # 2:3
1987
vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1988
vpmuludq $H1,$S4,$H0 # h1*s4
1989
vmovdqa 64(%rcx),$MASK # .Lmask26
1990
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1991
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1992
1993
################################################################
1994
# lazy reduction (interleaved with tail of input splat)
1995
1996
vpsrlq \$26,$H3,$D3
1997
vpand $MASK,$H3,$H3
1998
vpaddq $D3,$H4,$H4 # h3 -> h4
1999
2000
vpsrlq \$26,$H0,$D0
2001
vpand $MASK,$H0,$H0
2002
vpaddq $D0,$D1,$H1 # h0 -> h1
2003
2004
vpsrlq \$26,$H4,$D4
2005
vpand $MASK,$H4,$H4
2006
2007
vpsrlq \$4,$T3,$T2
2008
2009
vpsrlq \$26,$H1,$D1
2010
vpand $MASK,$H1,$H1
2011
vpaddq $D1,$H2,$H2 # h1 -> h2
2012
2013
vpaddq $D4,$H0,$H0
2014
vpsllq \$2,$D4,$D4
2015
vpaddq $D4,$H0,$H0 # h4 -> h0
2016
2017
vpand $MASK,$T2,$T2 # 2
2018
vpsrlq \$26,$T0,$T1
2019
2020
vpsrlq \$26,$H2,$D2
2021
vpand $MASK,$H2,$H2
2022
vpaddq $D2,$H3,$H3 # h2 -> h3
2023
2024
vpaddq $T2,$H2,$H2 # modulo-scheduled
2025
vpsrlq \$30,$T3,$T3
2026
2027
vpsrlq \$26,$H0,$D0
2028
vpand $MASK,$H0,$H0
2029
vpaddq $D0,$H1,$H1 # h0 -> h1
2030
2031
vpsrlq \$40,$T4,$T4 # 4
2032
2033
vpsrlq \$26,$H3,$D3
2034
vpand $MASK,$H3,$H3
2035
vpaddq $D3,$H4,$H4 # h3 -> h4
2036
2037
vpand $MASK,$T0,$T0 # 0
2038
vpand $MASK,$T1,$T1 # 1
2039
vpand $MASK,$T3,$T3 # 3
2040
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2041
2042
sub \$64,$len
2043
jnz .Loop_avx2$suffix
2044
2045
.byte 0x66,0x90
2046
.Ltail_avx2$suffix:
2047
################################################################
2048
# while above multiplications were by r^4 in all lanes, in last
2049
# iteration we multiply least significant lane by r^4 and most
2050
# significant one by r, so copy of above except that references
2051
# to the precomputed table are displaced by 4...
2052
2053
#vpaddq $H2,$T2,$H2 # accumulate input
2054
vpaddq $H0,$T0,$H0
2055
vmovdqu `32*0+4`(%rsp),$T0 # r0^4
2056
vpaddq $H1,$T1,$H1
2057
vmovdqu `32*1+4`(%rsp),$T1 # r1^4
2058
vpaddq $H3,$T3,$H3
2059
vmovdqu `32*3+4`(%rsp),$T2 # r2^4
2060
vpaddq $H4,$T4,$H4
2061
vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
2062
vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
2063
2064
vpmuludq $H2,$T0,$D2 # d2 = h2*r0
2065
vpmuludq $H2,$T1,$D3 # d3 = h2*r1
2066
vpmuludq $H2,$T2,$D4 # d4 = h2*r2
2067
vpmuludq $H2,$T3,$D0 # d0 = h2*s3
2068
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2069
2070
vpmuludq $H0,$T1,$T4 # h0*r1
2071
vpmuludq $H1,$T1,$H2 # h1*r1
2072
vpaddq $T4,$D1,$D1 # d1 += h0*r1
2073
vpaddq $H2,$D2,$D2 # d2 += h1*r1
2074
vpmuludq $H3,$T1,$T4 # h3*r1
2075
vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
2076
vpaddq $T4,$D4,$D4 # d4 += h3*r1
2077
vpaddq $H2,$D0,$D0 # d0 += h4*s1
2078
2079
vpmuludq $H0,$T0,$T4 # h0*r0
2080
vpmuludq $H1,$T0,$H2 # h1*r0
2081
vpaddq $T4,$D0,$D0 # d0 += h0*r0
2082
vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2083
vpaddq $H2,$D1,$D1 # d1 += h1*r0
2084
vpmuludq $H3,$T0,$T4 # h3*r0
2085
vpmuludq $H4,$T0,$H2 # h4*r0
2086
vpaddq $T4,$D3,$D3 # d3 += h3*r0
2087
vpaddq $H2,$D4,$D4 # d4 += h4*r0
2088
2089
vpmuludq $H3,$T1,$T4 # h3*s2
2090
vpmuludq $H4,$T1,$H2 # h4*s2
2091
vpaddq $T4,$D0,$D0 # d0 += h3*s2
2092
vpaddq $H2,$D1,$D1 # d1 += h4*s2
2093
vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2094
vpmuludq $H1,$T2,$T4 # h1*r2
2095
vpmuludq $H0,$T2,$T2 # h0*r2
2096
vpaddq $T4,$D3,$D3 # d3 += h1*r2
2097
vpaddq $T2,$D2,$D2 # d2 += h0*r2
2098
2099
vpmuludq $H1,$H2,$T4 # h1*r3
2100
vpmuludq $H0,$H2,$H2 # h0*r3
2101
vpaddq $T4,$D4,$D4 # d4 += h1*r3
2102
vpaddq $H2,$D3,$D3 # d3 += h0*r3
2103
vpmuludq $H3,$T3,$T4 # h3*s3
2104
vpmuludq $H4,$T3,$H2 # h4*s3
2105
vpaddq $T4,$D1,$D1 # d1 += h3*s3
2106
vpaddq $H2,$D2,$D2 # d2 += h4*s3
2107
2108
vpmuludq $H3,$S4,$H3 # h3*s4
2109
vpmuludq $H4,$S4,$H4 # h4*s4
2110
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
2111
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
2112
vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2113
vpmuludq $H1,$S4,$H0 # h1*s4
2114
vmovdqa 64(%rcx),$MASK # .Lmask26
2115
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2116
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2117
2118
################################################################
2119
# horizontal addition
2120
2121
vpsrldq \$8,$D1,$T1
2122
vpsrldq \$8,$H2,$T2
2123
vpsrldq \$8,$H3,$T3
2124
vpsrldq \$8,$H4,$T4
2125
vpsrldq \$8,$H0,$T0
2126
vpaddq $T1,$D1,$D1
2127
vpaddq $T2,$H2,$H2
2128
vpaddq $T3,$H3,$H3
2129
vpaddq $T4,$H4,$H4
2130
vpaddq $T0,$H0,$H0
2131
2132
vpermq \$0x2,$H3,$T3
2133
vpermq \$0x2,$H4,$T4
2134
vpermq \$0x2,$H0,$T0
2135
vpermq \$0x2,$D1,$T1
2136
vpermq \$0x2,$H2,$T2
2137
vpaddq $T3,$H3,$H3
2138
vpaddq $T4,$H4,$H4
2139
vpaddq $T0,$H0,$H0
2140
vpaddq $T1,$D1,$D1
2141
vpaddq $T2,$H2,$H2
2142
2143
################################################################
2144
# lazy reduction
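# Carries are propagated between the 26-bit limbs; the carry out of
# h4 wraps around into h0 multiplied by 5 (added once, then once more
# shifted left by 2), since 2^130 == 5 (mod 2^130-5).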
2145
2146
vpsrlq \$26,$H3,$D3
2147
vpand $MASK,$H3,$H3
2148
vpaddq $D3,$H4,$H4 # h3 -> h4
2149
2150
vpsrlq \$26,$H0,$D0
2151
vpand $MASK,$H0,$H0
2152
vpaddq $D0,$D1,$H1 # h0 -> h1
2153
2154
vpsrlq \$26,$H4,$D4
2155
vpand $MASK,$H4,$H4
2156
2157
vpsrlq \$26,$H1,$D1
2158
vpand $MASK,$H1,$H1
2159
vpaddq $D1,$H2,$H2 # h1 -> h2
2160
2161
vpaddq $D4,$H0,$H0
2162
vpsllq \$2,$D4,$D4
2163
vpaddq $D4,$H0,$H0 # h4 -> h0
2164
2165
vpsrlq \$26,$H2,$D2
2166
vpand $MASK,$H2,$H2
2167
vpaddq $D2,$H3,$H3 # h2 -> h3
2168
2169
vpsrlq \$26,$H0,$D0
2170
vpand $MASK,$H0,$H0
2171
vpaddq $D0,$H1,$H1 # h0 -> h1
2172
2173
vpsrlq \$26,$H3,$D3
2174
vpand $MASK,$H3,$H3
2175
vpaddq $D3,$H4,$H4 # h3 -> h4
2176
2177
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2178
vmovd %x#$H1,`4*1-48-64`($ctx)
2179
vmovd %x#$H2,`4*2-48-64`($ctx)
2180
vmovd %x#$H3,`4*3-48-64`($ctx)
2181
vmovd %x#$H4,`4*4-48-64`($ctx)
2182
___
2183
$code.=<<___ if ($win64);
2184
vmovdqa -0xb0(%r10),%xmm6
2185
vmovdqa -0xa0(%r10),%xmm7
2186
vmovdqa -0x90(%r10),%xmm8
2187
vmovdqa -0x80(%r10),%xmm9
2188
vmovdqa -0x70(%r10),%xmm10
2189
vmovdqa -0x60(%r10),%xmm11
2190
vmovdqa -0x50(%r10),%xmm12
2191
vmovdqa -0x40(%r10),%xmm13
2192
vmovdqa -0x30(%r10),%xmm14
2193
vmovdqa -0x20(%r10),%xmm15
2194
lea -8(%r10),%rsp
2195
.Ldo_avx2_epilogue$suffix:
2196
___
2197
$code.=<<___ if (!$win64);
2198
lea -8(%r10),%rsp
2199
.cfi_def_cfa_register %rsp
2200
___
2201
$code.=<<___;
2202
vzeroupper
2203
RET
2204
.cfi_endproc
2205
___
2206
if($avx > 2 && $avx512) {
2207
my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2208
my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2209
my $PADBIT="%zmm30";
2210
2211
map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2212
map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2213
map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2214
map(s/%y/%z/,($MASK));
2215
2216
$code.=<<___;
2217
.cfi_startproc
2218
.Lblocks_avx512:
2219
mov \$15,%eax
2220
kmovw %eax,%k2
2221
___
2222
$code.=<<___ if (!$win64);
2223
lea 8(%rsp),%r10
2224
.cfi_def_cfa_register %r10
2225
sub \$0x128,%rsp
2226
___
2227
$code.=<<___ if ($win64);
2228
lea 8(%rsp),%r10
2229
sub \$0x1c8,%rsp
2230
vmovdqa %xmm6,-0xb0(%r10)
2231
vmovdqa %xmm7,-0xa0(%r10)
2232
vmovdqa %xmm8,-0x90(%r10)
2233
vmovdqa %xmm9,-0x80(%r10)
2234
vmovdqa %xmm10,-0x70(%r10)
2235
vmovdqa %xmm11,-0x60(%r10)
2236
vmovdqa %xmm12,-0x50(%r10)
2237
vmovdqa %xmm13,-0x40(%r10)
2238
vmovdqa %xmm14,-0x30(%r10)
2239
vmovdqa %xmm15,-0x20(%r10)
2240
.Ldo_avx512_body:
2241
___
2242
$code.=<<___;
2243
lea .Lconst(%rip),%rcx
2244
lea 48+64($ctx),$ctx # size optimization
2245
vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2246
2247
# expand pre-calculated table
2248
vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2249
and \$-512,%rsp
2250
vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2251
mov \$0x20,%rax
2252
vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2253
vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2254
vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2255
vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2256
vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2257
vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2258
vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2259
vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2260
vpbroadcastq 64(%rcx),$MASK # .Lmask26
2261
vpermd $D1,$T2,$R1
2262
vpermd $T0,$T2,$S1
2263
vpermd $D2,$T2,$R2
2264
vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2265
vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2266
vpermd $T1,$T2,$S2
2267
vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2268
vpsrlq \$32,$R1,$T1
2269
vpermd $D3,$T2,$R3
2270
vmovdqa64 $S1,0x40(%rsp){%k2}
2271
vpermd $T3,$T2,$S3
2272
vpermd $D4,$T2,$R4
2273
vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2274
vpermd $T4,$T2,$S4
2275
vmovdqa64 $S2,0x80(%rsp){%k2}
2276
vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2277
vmovdqa64 $S3,0xc0(%rsp){%k2}
2278
vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2279
vmovdqa64 $S4,0x100(%rsp){%k2}
2280
2281
################################################################
2282
# calculate 5th through 8th powers of the key
2283
#
2284
# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2285
# d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2286
# d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2287
# d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2288
# d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
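# The *5 factors above come from the reduction: any product term of
# weight 2^130 or higher folds back multiplied by 5, so the s[] values
# (5*r[]) are precomputed once per key power.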
2289
2290
vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2291
vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2292
vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2293
vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2294
vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2295
vpsrlq \$32,$R2,$T2
2296
2297
vpmuludq $T1,$S4,$M0
2298
vpmuludq $T1,$R0,$M1
2299
vpmuludq $T1,$R1,$M2
2300
vpmuludq $T1,$R2,$M3
2301
vpmuludq $T1,$R3,$M4
2302
vpsrlq \$32,$R3,$T3
2303
vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2304
vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2305
vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2306
vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2307
vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2308
2309
vpmuludq $T2,$S3,$M0
2310
vpmuludq $T2,$S4,$M1
2311
vpmuludq $T2,$R1,$M3
2312
vpmuludq $T2,$R2,$M4
2313
vpmuludq $T2,$R0,$M2
2314
vpsrlq \$32,$R4,$T4
2315
vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2316
vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2317
vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2318
vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2319
vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2320
2321
vpmuludq $T3,$S2,$M0
2322
vpmuludq $T3,$R0,$M3
2323
vpmuludq $T3,$R1,$M4
2324
vpmuludq $T3,$S3,$M1
2325
vpmuludq $T3,$S4,$M2
2326
vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2327
vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2328
vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2329
vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2330
vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2331
2332
vpmuludq $T4,$S4,$M3
2333
vpmuludq $T4,$R0,$M4
2334
vpmuludq $T4,$S1,$M0
2335
vpmuludq $T4,$S2,$M1
2336
vpmuludq $T4,$S3,$M2
2337
vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2338
vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2339
vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2340
vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2341
vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
2342
2343
################################################################
2344
# load input
2345
vmovdqu64 16*0($inp),%z#$T3
2346
vmovdqu64 16*4($inp),%z#$T4
2347
lea 16*8($inp),$inp
2348
2349
################################################################
2350
# lazy reduction
2351
2352
vpsrlq \$26,$D3,$M3
2353
vpandq $MASK,$D3,$D3
2354
vpaddq $M3,$D4,$D4 # d3 -> d4
2355
2356
vpsrlq \$26,$D0,$M0
2357
vpandq $MASK,$D0,$D0
2358
vpaddq $M0,$D1,$D1 # d0 -> d1
2359
2360
vpsrlq \$26,$D4,$M4
2361
vpandq $MASK,$D4,$D4
2362
2363
vpsrlq \$26,$D1,$M1
2364
vpandq $MASK,$D1,$D1
2365
vpaddq $M1,$D2,$D2 # d1 -> d2
2366
2367
vpaddq $M4,$D0,$D0
2368
vpsllq \$2,$M4,$M4
2369
vpaddq $M4,$D0,$D0 # d4 -> d0
2370
2371
vpsrlq \$26,$D2,$M2
2372
vpandq $MASK,$D2,$D2
2373
vpaddq $M2,$D3,$D3 # d2 -> d3
2374
2375
vpsrlq \$26,$D0,$M0
2376
vpandq $MASK,$D0,$D0
2377
vpaddq $M0,$D1,$D1 # d0 -> d1
2378
2379
vpsrlq \$26,$D3,$M3
2380
vpandq $MASK,$D3,$D3
2381
vpaddq $M3,$D4,$D4 # d3 -> d4
2382
2383
################################################################
2384
# at this point we have 14243444 in $R0-$S4 and 05060708 in
2385
# $D0-$D4, ...
2386
2387
vpunpcklqdq $T4,$T3,$T0 # transpose input
2388
vpunpckhqdq $T4,$T3,$T4
2389
2390
# ... since input 64-bit lanes are ordered as 73625140, we could
2391
# "vperm" it to 76543210 (here and in each loop iteration), *or*
2392
# we could just flow along, hence the goal for $R0-$S4 is
2393
# 1858286838784888 ...
2394
2395
vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2396
mov \$0x7777,%eax
2397
kmovw %eax,%k1
2398
2399
vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2400
vpermd $R1,$M0,$R1
2401
vpermd $R2,$M0,$R2
2402
vpermd $R3,$M0,$R3
2403
vpermd $R4,$M0,$R4
2404
2405
vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2406
vpermd $D1,$M0,${R1}{%k1}
2407
vpermd $D2,$M0,${R2}{%k1}
2408
vpermd $D3,$M0,${R3}{%k1}
2409
vpermd $D4,$M0,${R4}{%k1}
2410
2411
vpslld \$2,$R1,$S1 # *5
2412
vpslld \$2,$R2,$S2
2413
vpslld \$2,$R3,$S3
2414
vpslld \$2,$R4,$S4
2415
vpaddd $R1,$S1,$S1
2416
vpaddd $R2,$S2,$S2
2417
vpaddd $R3,$S3,$S3
2418
vpaddd $R4,$S4,$S4
2419
2420
vpbroadcastq 32(%rcx),$PADBIT # .L129
2421
2422
vpsrlq \$52,$T0,$T2 # splat input
2423
vpsllq \$12,$T4,$T3
2424
vporq $T3,$T2,$T2
2425
vpsrlq \$26,$T0,$T1
2426
vpsrlq \$14,$T4,$T3
2427
vpsrlq \$40,$T4,$T4 # 4
2428
vpandq $MASK,$T2,$T2 # 2
2429
vpandq $MASK,$T0,$T0 # 0
2430
#vpandq $MASK,$T1,$T1 # 1
2431
#vpandq $MASK,$T3,$T3 # 3
2432
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
2433
2434
vpaddq $H2,$T2,$H2 # accumulate input
2435
sub \$192,$len
2436
jbe .Ltail_avx512
2437
jmp .Loop_avx512
2438
2439
.align 32
2440
.Loop_avx512:
2441
################################################################
2442
# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2443
# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2444
# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2445
# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2446
# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2447
# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2448
# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2449
# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2450
# \________/\___________/
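# In effect this is Horner's rule evaluated in 8 independent lanes;
# the lanes are recombined in .Ltail_avx512 by multiplying lane i by
# r^(8-i) before the horizontal addition.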
2451
################################################################
2452
#vpaddq $H2,$T2,$H2 # accumulate input
2453
2454
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2455
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2456
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2457
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2458
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2459
#
2460
# however, as h2 is "chronologically" the first one available, pull the
2461
# corresponding operations up, so it's
2462
#
2463
# d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2464
# d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2465
# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2466
# d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2467
# d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2468
2469
vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2470
vpaddq $H0,$T0,$H0
2471
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2472
vpandq $MASK,$T1,$T1 # 1
2473
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2474
vpandq $MASK,$T3,$T3 # 3
2475
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2476
vporq $PADBIT,$T4,$T4 # padbit, yes, always
2477
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2478
vpaddq $H1,$T1,$H1 # accumulate input
2479
vpaddq $H3,$T3,$H3
2480
vpaddq $H4,$T4,$H4
2481
2482
vmovdqu64 16*0($inp),$T3 # load input
2483
vmovdqu64 16*4($inp),$T4
2484
lea 16*8($inp),$inp
2485
vpmuludq $H0,$R3,$M3
2486
vpmuludq $H0,$R4,$M4
2487
vpmuludq $H0,$R0,$M0
2488
vpmuludq $H0,$R1,$M1
2489
vpaddq $M3,$D3,$D3 # d3 += h0*r3
2490
vpaddq $M4,$D4,$D4 # d4 += h0*r4
2491
vpaddq $M0,$D0,$D0 # d0 += h0*r0
2492
vpaddq $M1,$D1,$D1 # d1 += h0*r1
2493
2494
vpmuludq $H1,$R2,$M3
2495
vpmuludq $H1,$R3,$M4
2496
vpmuludq $H1,$S4,$M0
2497
vpmuludq $H0,$R2,$M2
2498
vpaddq $M3,$D3,$D3 # d3 += h1*r2
2499
vpaddq $M4,$D4,$D4 # d4 += h1*r3
2500
vpaddq $M0,$D0,$D0 # d0 += h1*s4
2501
vpaddq $M2,$D2,$D2 # d2 += h0*r2
2502
2503
vpunpcklqdq $T4,$T3,$T0 # transpose input
2504
vpunpckhqdq $T4,$T3,$T4
2505
2506
vpmuludq $H3,$R0,$M3
2507
vpmuludq $H3,$R1,$M4
2508
vpmuludq $H1,$R0,$M1
2509
vpmuludq $H1,$R1,$M2
2510
vpaddq $M3,$D3,$D3 # d3 += h3*r0
2511
vpaddq $M4,$D4,$D4 # d4 += h3*r1
2512
vpaddq $M1,$D1,$D1 # d1 += h1*r0
2513
vpaddq $M2,$D2,$D2 # d2 += h1*r1
2514
2515
vpmuludq $H4,$S4,$M3
2516
vpmuludq $H4,$R0,$M4
2517
vpmuludq $H3,$S2,$M0
2518
vpmuludq $H3,$S3,$M1
2519
vpaddq $M3,$D3,$D3 # d3 += h4*s4
2520
vpmuludq $H3,$S4,$M2
2521
vpaddq $M4,$D4,$D4 # d4 += h4*r0
2522
vpaddq $M0,$D0,$D0 # d0 += h3*s2
2523
vpaddq $M1,$D1,$D1 # d1 += h3*s3
2524
vpaddq $M2,$D2,$D2 # d2 += h3*s4
2525
2526
vpmuludq $H4,$S1,$M0
2527
vpmuludq $H4,$S2,$M1
2528
vpmuludq $H4,$S3,$M2
2529
vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2530
vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2531
vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2532
2533
################################################################
2534
# lazy reduction (interleaved with input splat)
2535
2536
vpsrlq \$52,$T0,$T2 # splat input
2537
vpsllq \$12,$T4,$T3
2538
2539
vpsrlq \$26,$D3,$H3
2540
vpandq $MASK,$D3,$D3
2541
vpaddq $H3,$D4,$H4 # h3 -> h4
2542
2543
vporq $T3,$T2,$T2
2544
2545
vpsrlq \$26,$H0,$D0
2546
vpandq $MASK,$H0,$H0
2547
vpaddq $D0,$H1,$H1 # h0 -> h1
2548
2549
vpandq $MASK,$T2,$T2 # 2
2550
2551
vpsrlq \$26,$H4,$D4
2552
vpandq $MASK,$H4,$H4
2553
2554
vpsrlq \$26,$H1,$D1
2555
vpandq $MASK,$H1,$H1
2556
vpaddq $D1,$H2,$H2 # h1 -> h2
2557
2558
vpaddq $D4,$H0,$H0
2559
vpsllq \$2,$D4,$D4
2560
vpaddq $D4,$H0,$H0 # h4 -> h0
2561
2562
vpaddq $T2,$H2,$H2 # modulo-scheduled
2563
vpsrlq \$26,$T0,$T1
2564
2565
vpsrlq \$26,$H2,$D2
2566
vpandq $MASK,$H2,$H2
2567
vpaddq $D2,$D3,$H3 # h2 -> h3
2568
2569
vpsrlq \$14,$T4,$T3
2570
2571
vpsrlq \$26,$H0,$D0
2572
vpandq $MASK,$H0,$H0
2573
vpaddq $D0,$H1,$H1 # h0 -> h1
2574
2575
vpsrlq \$40,$T4,$T4 # 4
2576
2577
vpsrlq \$26,$H3,$D3
2578
vpandq $MASK,$H3,$H3
2579
vpaddq $D3,$H4,$H4 # h3 -> h4
2580
2581
vpandq $MASK,$T0,$T0 # 0
2582
#vpandq $MASK,$T1,$T1 # 1
2583
#vpandq $MASK,$T3,$T3 # 3
2584
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
2585
2586
sub \$128,$len
2587
ja .Loop_avx512
2588
2589
.Ltail_avx512:
2590
################################################################
2591
# while above multiplications were by r^8 in all lanes, in last
2592
# iteration we multiply least significant lane by r^8 and most
2593
# significant one by r, that's why table gets shifted...
2594
2595
vpsrlq \$32,$R0,$R0 # 0105020603070408
2596
vpsrlq \$32,$R1,$R1
2597
vpsrlq \$32,$R2,$R2
2598
vpsrlq \$32,$S3,$S3
2599
vpsrlq \$32,$S4,$S4
2600
vpsrlq \$32,$R3,$R3
2601
vpsrlq \$32,$R4,$R4
2602
vpsrlq \$32,$S1,$S1
2603
vpsrlq \$32,$S2,$S2
2604
2605
################################################################
2606
# load either the next or the last 64 bytes of input
2607
lea ($inp,$len),$inp
2608
2609
#vpaddq $H2,$T2,$H2 # accumulate input
2610
vpaddq $H0,$T0,$H0
2611
2612
vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2613
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2614
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2615
vpandq $MASK,$T1,$T1 # 1
2616
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2617
vpandq $MASK,$T3,$T3 # 3
2618
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2619
vporq $PADBIT,$T4,$T4 # padbit, yes, always
2620
vpaddq $H1,$T1,$H1 # accumulate input
2621
vpaddq $H3,$T3,$H3
2622
vpaddq $H4,$T4,$H4
2623
2624
vmovdqu 16*0($inp),%x#$T0
2625
vpmuludq $H0,$R3,$M3
2626
vpmuludq $H0,$R4,$M4
2627
vpmuludq $H0,$R0,$M0
2628
vpmuludq $H0,$R1,$M1
2629
vpaddq $M3,$D3,$D3 # d3 += h0*r3
2630
vpaddq $M4,$D4,$D4 # d4 += h0*r4
2631
vpaddq $M0,$D0,$D0 # d0 += h0*r0
2632
vpaddq $M1,$D1,$D1 # d1 += h0*r1
2633
2634
vmovdqu 16*1($inp),%x#$T1
2635
vpmuludq $H1,$R2,$M3
2636
vpmuludq $H1,$R3,$M4
2637
vpmuludq $H1,$S4,$M0
2638
vpmuludq $H0,$R2,$M2
2639
vpaddq $M3,$D3,$D3 # d3 += h1*r2
2640
vpaddq $M4,$D4,$D4 # d4 += h1*r3
2641
vpaddq $M0,$D0,$D0 # d0 += h1*s4
2642
vpaddq $M2,$D2,$D2 # d2 += h0*r2
2643
2644
vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2645
vpmuludq $H3,$R0,$M3
2646
vpmuludq $H3,$R1,$M4
2647
vpmuludq $H1,$R0,$M1
2648
vpmuludq $H1,$R1,$M2
2649
vpaddq $M3,$D3,$D3 # d3 += h3*r0
2650
vpaddq $M4,$D4,$D4 # d4 += h3*r1
2651
vpaddq $M1,$D1,$D1 # d1 += h1*r0
2652
vpaddq $M2,$D2,$D2 # d2 += h1*r1
2653
2654
vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2655
vpmuludq $H4,$S4,$M3
2656
vpmuludq $H4,$R0,$M4
2657
vpmuludq $H3,$S2,$M0
2658
vpmuludq $H3,$S3,$M1
2659
vpmuludq $H3,$S4,$M2
2660
vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2661
vpaddq $M4,$D4,$D4 # d4 += h4*r0
2662
vpaddq $M0,$D0,$D0 # d0 += h3*s2
2663
vpaddq $M1,$D1,$D1 # d1 += h3*s3
2664
vpaddq $M2,$D2,$D2 # d2 += h3*s4
2665
2666
vpmuludq $H4,$S1,$M0
2667
vpmuludq $H4,$S2,$M1
2668
vpmuludq $H4,$S3,$M2
2669
vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2670
vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2671
vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2672
2673
################################################################
2674
# horizontal addition
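# Fold the 8 qword lanes of each limb down to lane 0: add neighbouring
# qwords (vpermq with 0xb1), then the two 128-bit halves of each
# 256-bit lane (vpermq with 0x2), then the upper 256-bit half
# (vextracti64x4), keeping only qword 0 via the %k3 mask.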
2675
2676
mov \$1,%eax
2677
vpermq \$0xb1,$H3,$D3
2678
vpermq \$0xb1,$D4,$H4
2679
vpermq \$0xb1,$H0,$D0
2680
vpermq \$0xb1,$H1,$D1
2681
vpermq \$0xb1,$H2,$D2
2682
vpaddq $D3,$H3,$H3
2683
vpaddq $D4,$H4,$H4
2684
vpaddq $D0,$H0,$H0
2685
vpaddq $D1,$H1,$H1
2686
vpaddq $D2,$H2,$H2
2687
2688
kmovw %eax,%k3
2689
vpermq \$0x2,$H3,$D3
2690
vpermq \$0x2,$H4,$D4
2691
vpermq \$0x2,$H0,$D0
2692
vpermq \$0x2,$H1,$D1
2693
vpermq \$0x2,$H2,$D2
2694
vpaddq $D3,$H3,$H3
2695
vpaddq $D4,$H4,$H4
2696
vpaddq $D0,$H0,$H0
2697
vpaddq $D1,$H1,$H1
2698
vpaddq $D2,$H2,$H2
2699
2700
vextracti64x4 \$0x1,$H3,%y#$D3
2701
vextracti64x4 \$0x1,$H4,%y#$D4
2702
vextracti64x4 \$0x1,$H0,%y#$D0
2703
vextracti64x4 \$0x1,$H1,%y#$D1
2704
vextracti64x4 \$0x1,$H2,%y#$D2
2705
vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2706
vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2707
vpaddq $D0,$H0,${H0}{%k3}{z}
2708
vpaddq $D1,$H1,${H1}{%k3}{z}
2709
vpaddq $D2,$H2,${H2}{%k3}{z}
2710
___
2711
map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2712
map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2713
$code.=<<___;
2714
################################################################
2715
# lazy reduction (interleaved with input splat)
2716
2717
vpsrlq \$26,$H3,$D3
2718
vpand $MASK,$H3,$H3
2719
vpsrldq \$6,$T0,$T2 # splat input
2720
vpsrldq \$6,$T1,$T3
2721
vpunpckhqdq $T1,$T0,$T4 # 4
2722
vpaddq $D3,$H4,$H4 # h3 -> h4
2723
2724
vpsrlq \$26,$H0,$D0
2725
vpand $MASK,$H0,$H0
2726
vpunpcklqdq $T3,$T2,$T2 # 2:3
2727
vpunpcklqdq $T1,$T0,$T0 # 0:1
2728
vpaddq $D0,$H1,$H1 # h0 -> h1
2729
2730
vpsrlq \$26,$H4,$D4
2731
vpand $MASK,$H4,$H4
2732
2733
vpsrlq \$26,$H1,$D1
2734
vpand $MASK,$H1,$H1
2735
vpsrlq \$30,$T2,$T3
2736
vpsrlq \$4,$T2,$T2
2737
vpaddq $D1,$H2,$H2 # h1 -> h2
2738
2739
vpaddq $D4,$H0,$H0
2740
vpsllq \$2,$D4,$D4
2741
vpsrlq \$26,$T0,$T1
2742
vpsrlq \$40,$T4,$T4 # 4
2743
vpaddq $D4,$H0,$H0 # h4 -> h0
2744
2745
vpsrlq \$26,$H2,$D2
2746
vpand $MASK,$H2,$H2
2747
vpand $MASK,$T2,$T2 # 2
2748
vpand $MASK,$T0,$T0 # 0
2749
vpaddq $D2,$H3,$H3 # h2 -> h3
2750
2751
vpsrlq \$26,$H0,$D0
2752
vpand $MASK,$H0,$H0
2753
vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2754
vpand $MASK,$T1,$T1 # 1
2755
vpaddq $D0,$H1,$H1 # h0 -> h1
2756
2757
vpsrlq \$26,$H3,$D3
2758
vpand $MASK,$H3,$H3
2759
vpand $MASK,$T3,$T3 # 3
2760
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2761
vpaddq $D3,$H4,$H4 # h3 -> h4
2762
2763
lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2764
add \$64,$len
2765
jnz .Ltail_avx2$suffix
2766
2767
vpsubq $T2,$H2,$H2 # undo input accumulation
2768
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2769
vmovd %x#$H1,`4*1-48-64`($ctx)
2770
vmovd %x#$H2,`4*2-48-64`($ctx)
2771
vmovd %x#$H3,`4*3-48-64`($ctx)
2772
vmovd %x#$H4,`4*4-48-64`($ctx)
2773
vzeroall
2774
___
2775
$code.=<<___ if ($win64);
2776
movdqa -0xb0(%r10),%xmm6
2777
movdqa -0xa0(%r10),%xmm7
2778
movdqa -0x90(%r10),%xmm8
2779
movdqa -0x80(%r10),%xmm9
2780
movdqa -0x70(%r10),%xmm10
2781
movdqa -0x60(%r10),%xmm11
2782
movdqa -0x50(%r10),%xmm12
2783
movdqa -0x40(%r10),%xmm13
2784
movdqa -0x30(%r10),%xmm14
2785
movdqa -0x20(%r10),%xmm15
2786
lea -8(%r10),%rsp
2787
.Ldo_avx512_epilogue:
2788
___
2789
$code.=<<___ if (!$win64);
2790
lea -8(%r10),%rsp
2791
.cfi_def_cfa_register %rsp
2792
___
2793
$code.=<<___;
2794
RET
2795
.cfi_endproc
2796
___
2797
2798
}
2799
2800
}
2801
2802
&declare_function("poly1305_blocks_avx2", 32, 4);
2803
poly1305_blocks_avxN(0);
2804
&end_function("poly1305_blocks_avx2");
2805
2806
#######################################################################
2807
if ($avx>2) {
2808
# On entry we have input length divisible by 64. But since the inner loop
2809
# processes 128 bytes per iteration, cases when the length is not divisible
2810
# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2811
# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
2812
# for this tail, we wouldn't even have to allocate a stack frame...
2813
2814
&declare_function("poly1305_blocks_avx512", 32, 4);
2815
poly1305_blocks_avxN(1);
2816
&end_function("poly1305_blocks_avx512");
2817
2818
if (!$kernel && $avx>3) {
2819
########################################################################
2820
# VPMADD52 version using 2^44 radix.
2821
#
2822
# One can argue that base 2^52 would be more natural. Well, even though
2823
# some operations would be more natural, one has to recognize a couple of
2824
# things. Base 2^52 doesn't provide an advantage over base 2^44 if you look
2825
# at the amount of multiply-and-accumulate operations. Secondly, it makes it
2826
# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2827
# reference implementations], which means that more such operations
2828
# would have to be performed in the inner loop, which in turn makes the
2829
# critical path longer. In other words, even though base 2^44 reduction might
2830
# look less elegant, the overall critical path is actually shorter...
2831
2832
########################################################################
2833
# Layout of the opaque area is as follows.
2834
#
2835
# unsigned __int64 h[3]; # current hash value base 2^44
2836
# unsigned __int64 s[2]; # key value*20 base 2^44
2837
# unsigned __int64 r[3]; # key value base 2^44
2838
# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2839
# # r^n positions reflect
2840
# # placement in register, not
2841
# # memory, R[3] is R[1]*20
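# To illustrate the radix (informal note): a 130-bit value h is kept as
# h0 + h1*2^44 + h2*2^88 with h0,h1 of 44 bits and h2 of 42 bits.  A
# product term landing at weight 2^132 folds back as
# 2^132 mod (2^130-5) = 20, which is why the precomputed s[] values are
# the key limbs multiplied by 20 (the "R2*5*4" steps below), while a
# plain carry out of h2 at weight 2^130 folds back multiplied by 5.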
2842
2843
$code.=<<___;
2844
.type poly1305_init_base2_44,\@function,3
2845
.align 32
2846
poly1305_init_base2_44:
2847
xor %eax,%eax
2848
mov %rax,0($ctx) # initialize hash value
2849
mov %rax,8($ctx)
2850
mov %rax,16($ctx)
2851
2852
.Linit_base2_44:
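# Clamp r as per the Poly1305 spec (the two and-masks below), split it
# into 44+44+42-bit limbs r0, r1, r2, and precompute s1 = r1*20 and
# s2 = r2*20 for the reduction step.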
2853
lea poly1305_blocks_vpmadd52(%rip),%r10
2854
lea poly1305_emit_base2_44(%rip),%r11
2855
2856
mov \$0x0ffffffc0fffffff,%rax
2857
mov \$0x0ffffffc0ffffffc,%rcx
2858
and 0($inp),%rax
2859
mov \$0x00000fffffffffff,%r8
2860
and 8($inp),%rcx
2861
mov \$0x00000fffffffffff,%r9
2862
and %rax,%r8
2863
shrd \$44,%rcx,%rax
2864
mov %r8,40($ctx) # r0
2865
and %r9,%rax
2866
shr \$24,%rcx
2867
mov %rax,48($ctx) # r1
2868
lea (%rax,%rax,4),%rax # *5
2869
mov %rcx,56($ctx) # r2
2870
shl \$2,%rax # magic <<2
2871
lea (%rcx,%rcx,4),%rcx # *5
2872
shl \$2,%rcx # magic <<2
2873
mov %rax,24($ctx) # s1
2874
mov %rcx,32($ctx) # s2
2875
movq \$-1,64($ctx) # write impossible value
2876
___
2877
$code.=<<___ if ($flavour !~ /elf32/);
2878
mov %r10,0(%rdx)
2879
mov %r11,8(%rdx)
2880
___
2881
$code.=<<___ if ($flavour =~ /elf32/);
2882
mov %r10d,0(%rdx)
2883
mov %r11d,4(%rdx)
2884
___
2885
$code.=<<___;
2886
mov \$1,%eax
2887
RET
2888
.size poly1305_init_base2_44,.-poly1305_init_base2_44
2889
___
2890
{
2891
my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2892
my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2893
my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2894
2895
$code.=<<___;
2896
.type poly1305_blocks_vpmadd52,\@function,4
2897
.align 32
2898
poly1305_blocks_vpmadd52:
2899
shr \$4,$len
2900
jz .Lno_data_vpmadd52 # too short
2901
2902
shl \$40,$padbit
2903
mov 64($ctx),%r8 # peek on power of the key
2904
2905
# if powers of the key are not calculated yet, process up to 3
2906
# blocks with this single-block subroutine, otherwise ensure that
2907
# length is divisible by 2 blocks and pass the rest down to next
2908
# subroutine...
2909
2910
mov \$3,%rax
2911
mov \$1,%r10
2912
cmp \$4,$len # is input long
2913
cmovae %r10,%rax
2914
test %r8,%r8 # is power value impossible?
2915
cmovns %r10,%rax
2916
2917
and $len,%rax # is input of favourable length?
2918
jz .Lblocks_vpmadd52_4x
2919
2920
sub %rax,$len
2921
mov \$7,%r10d
2922
mov \$1,%r11d
2923
kmovw %r10d,%k7
2924
lea .L2_44_inp_permd(%rip),%r10
2925
kmovw %r11d,%k1
2926
2927
vmovq $padbit,%x#$PAD
2928
vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2929
vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2930
vpermq \$0xcf,$PAD,$PAD
2931
vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2932
2933
vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2934
vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2935
vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2936
vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2937
2938
vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2939
vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2940
2941
jmp .Loop_vpmadd52
2942
2943
.align 32
2944
.Loop_vpmadd52:
2945
vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2946
lea 16($inp),$inp
2947
2948
vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2949
vpsrlvq $inp_shift,$T0,$T0
2950
vpandq $reduc_mask,$T0,$T0
2951
vporq $PAD,$T0,$T0
2952
2953
vpaddq $T0,$Dlo,$Dlo # accumulate input
2954
2955
vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2956
vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2957
vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2958
2959
vpxord $Dlo,$Dlo,$Dlo
2960
vpxord $Dhi,$Dhi,$Dhi
2961
2962
vpmadd52luq $r2r1r0,$H0,$Dlo
2963
vpmadd52huq $r2r1r0,$H0,$Dhi
2964
2965
vpmadd52luq $r1r0s2,$H1,$Dlo
2966
vpmadd52huq $r1r0s2,$H1,$Dhi
2967
2968
vpmadd52luq $r0s2s1,$H2,$Dlo
2969
vpmadd52huq $r0s2s1,$H2,$Dhi
2970
2971
vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2972
vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2973
vpandq $reduc_mask,$Dlo,$Dlo
2974
2975
vpaddq $T0,$Dhi,$Dhi
2976
2977
vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2978
2979
vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2980
2981
vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
2982
vpandq $reduc_mask,$Dlo,$Dlo
2983
2984
vpermq \$0b10010011,$T0,$T0
2985
2986
vpaddq $T0,$Dlo,$Dlo
2987
2988
vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2989
2990
vpaddq $T0,$Dlo,$Dlo
2991
vpsllq \$2,$T0,$T0
2992
2993
vpaddq $T0,$Dlo,$Dlo
2994
2995
dec %rax # len-=16
2996
jnz .Loop_vpmadd52
2997
2998
vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
2999
3000
test $len,$len
3001
jnz .Lblocks_vpmadd52_4x
3002
3003
.Lno_data_vpmadd52:
3004
RET
3005
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
3006
___
3007
}
3008
{
3009
########################################################################
3010
# As implied by its name, the 4x subroutine processes 4 blocks in parallel
3011
# (but also handles lengths of 4*n+2 blocks). It takes up to the 4th key
3012
# power and works in 256-bit %ymm registers.
3013
3014
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3015
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3016
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3017
3018
$code.=<<___;
3019
.type poly1305_blocks_vpmadd52_4x,\@function,4
3020
.align 32
3021
poly1305_blocks_vpmadd52_4x:
3022
shr \$4,$len
3023
jz .Lno_data_vpmadd52_4x # too short
3024
3025
shl \$40,$padbit
3026
mov 64($ctx),%r8 # peek on power of the key
3027
3028
.Lblocks_vpmadd52_4x:
3029
vpbroadcastq $padbit,$PAD
3030
3031
vmovdqa64 .Lx_mask44(%rip),$mask44
3032
mov \$5,%eax
3033
vmovdqa64 .Lx_mask42(%rip),$mask42
3034
kmovw %eax,%k1 # used in 2x path
3035
3036
test %r8,%r8 # is power value impossible?
3037
js .Linit_vpmadd52 # if it is, then init R[4]
3038
3039
vmovq 0($ctx),%x#$H0 # load current hash value
3040
vmovq 8($ctx),%x#$H1
3041
vmovq 16($ctx),%x#$H2
3042
3043
test \$3,$len # is length 4*n+2?
3044
jnz .Lblocks_vpmadd52_2x_do
3045
3046
.Lblocks_vpmadd52_4x_do:
3047
vpbroadcastq 64($ctx),$R0 # load 4th power of the key
3048
vpbroadcastq 96($ctx),$R1
3049
vpbroadcastq 128($ctx),$R2
3050
vpbroadcastq 160($ctx),$S1
3051
3052
.Lblocks_vpmadd52_4x_key_loaded:
3053
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3054
vpaddq $R2,$S2,$S2
3055
vpsllq \$2,$S2,$S2
3056
3057
test \$7,$len # is len 8*n?
3058
jz .Lblocks_vpmadd52_8x
3059
3060
vmovdqu64 16*0($inp),$T2 # load data
3061
vmovdqu64 16*2($inp),$T3
3062
lea 16*4($inp),$inp
3063
3064
vpunpcklqdq $T3,$T2,$T1 # transpose data
3065
vpunpckhqdq $T3,$T2,$T3
3066
3067
# at this point 64-bit lanes are ordered as 3-1-2-0
3068
3069
vpsrlq \$24,$T3,$T2 # splat the data
3070
vporq $PAD,$T2,$T2
3071
vpaddq $T2,$H2,$H2 # accumulate input
3072
vpandq $mask44,$T1,$T0
3073
vpsrlq \$44,$T1,$T1
3074
vpsllq \$20,$T3,$T3
3075
vporq $T3,$T1,$T1
3076
vpandq $mask44,$T1,$T1
3077
3078
sub \$4,$len
3079
jz .Ltail_vpmadd52_4x
3080
jmp .Loop_vpmadd52_4x
3081
ud2
3082
3083
.align 32
3084
.Linit_vpmadd52:
3085
vmovq 24($ctx),%x#$S1 # load key
3086
vmovq 56($ctx),%x#$H2
3087
vmovq 32($ctx),%x#$S2
3088
vmovq 40($ctx),%x#$R0
3089
vmovq 48($ctx),%x#$R1
3090
3091
vmovdqa $R0,$H0
3092
vmovdqa $R1,$H1
3093
vmovdqa $H2,$R2
3094
3095
mov \$2,%eax
3096
3097
.Lmul_init_vpmadd52:
3098
vpxorq $D0lo,$D0lo,$D0lo
3099
vpmadd52luq $H2,$S1,$D0lo
3100
vpxorq $D0hi,$D0hi,$D0hi
3101
vpmadd52huq $H2,$S1,$D0hi
3102
vpxorq $D1lo,$D1lo,$D1lo
3103
vpmadd52luq $H2,$S2,$D1lo
3104
vpxorq $D1hi,$D1hi,$D1hi
3105
vpmadd52huq $H2,$S2,$D1hi
3106
vpxorq $D2lo,$D2lo,$D2lo
3107
vpmadd52luq $H2,$R0,$D2lo
3108
vpxorq $D2hi,$D2hi,$D2hi
3109
vpmadd52huq $H2,$R0,$D2hi
3110
3111
vpmadd52luq $H0,$R0,$D0lo
3112
vpmadd52huq $H0,$R0,$D0hi
3113
vpmadd52luq $H0,$R1,$D1lo
3114
vpmadd52huq $H0,$R1,$D1hi
3115
vpmadd52luq $H0,$R2,$D2lo
3116
vpmadd52huq $H0,$R2,$D2hi
3117
3118
vpmadd52luq $H1,$S2,$D0lo
3119
vpmadd52huq $H1,$S2,$D0hi
3120
vpmadd52luq $H1,$R0,$D1lo
3121
vpmadd52huq $H1,$R0,$D1hi
3122
vpmadd52luq $H1,$R1,$D2lo
3123
vpmadd52huq $H1,$R1,$D2hi
3124
3125
################################################################
3126
# partial reduction
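# Merge the low and high product halves (the high half is shifted left
# by 8 because vpmadd52 splits products at bit 52 while the limb
# boundary is at bit 44), propagate carries d0->d1->d2, and fold the
# carry out of the 42-bit top limb back into h0 multiplied by 5.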
3127
vpsrlq \$44,$D0lo,$tmp
3128
vpsllq \$8,$D0hi,$D0hi
3129
vpandq $mask44,$D0lo,$H0
3130
vpaddq $tmp,$D0hi,$D0hi
3131
3132
vpaddq $D0hi,$D1lo,$D1lo
3133
3134
vpsrlq \$44,$D1lo,$tmp
3135
vpsllq \$8,$D1hi,$D1hi
3136
vpandq $mask44,$D1lo,$H1
3137
vpaddq $tmp,$D1hi,$D1hi
3138
3139
vpaddq $D1hi,$D2lo,$D2lo
3140
3141
vpsrlq \$42,$D2lo,$tmp
3142
vpsllq \$10,$D2hi,$D2hi
3143
vpandq $mask42,$D2lo,$H2
3144
vpaddq $tmp,$D2hi,$D2hi
3145
3146
vpaddq $D2hi,$H0,$H0
3147
vpsllq \$2,$D2hi,$D2hi
3148
3149
vpaddq $D2hi,$H0,$H0
3150
3151
vpsrlq \$44,$H0,$tmp # additional step
3152
vpandq $mask44,$H0,$H0
3153
3154
vpaddq $tmp,$H1,$H1
3155
3156
dec %eax
3157
jz .Ldone_init_vpmadd52
3158
3159
vpunpcklqdq $R1,$H1,$R1 # 1,2
3160
vpbroadcastq %x#$H1,%x#$H1 # 2,2
3161
vpunpcklqdq $R2,$H2,$R2
3162
vpbroadcastq %x#$H2,%x#$H2
3163
vpunpcklqdq $R0,$H0,$R0
3164
vpbroadcastq %x#$H0,%x#$H0
3165
3166
vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3167
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3168
vpaddq $R1,$S1,$S1
3169
vpaddq $R2,$S2,$S2
3170
vpsllq \$2,$S1,$S1
3171
vpsllq \$2,$S2,$S2
3172
3173
jmp .Lmul_init_vpmadd52
3174
ud2
3175
3176
.align 32
3177
.Ldone_init_vpmadd52:
3178
vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3179
vinserti128 \$1,%x#$R2,$H2,$R2
3180
vinserti128 \$1,%x#$R0,$H0,$R0
3181
3182
vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3183
vpermq \$0b11011000,$R2,$R2
3184
vpermq \$0b11011000,$R0,$R0
3185
3186
vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3187
vpaddq $R1,$S1,$S1
3188
vpsllq \$2,$S1,$S1
3189
3190
vmovq 0($ctx),%x#$H0 # load current hash value
3191
vmovq 8($ctx),%x#$H1
3192
vmovq 16($ctx),%x#$H2
3193
3194
test \$3,$len # is length 4*n+2?
3195
jnz .Ldone_init_vpmadd52_2x
3196
3197
vmovdqu64 $R0,64($ctx) # save key powers
3198
vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3199
vmovdqu64 $R1,96($ctx)
3200
vpbroadcastq %x#$R1,$R1
3201
vmovdqu64 $R2,128($ctx)
3202
vpbroadcastq %x#$R2,$R2
3203
vmovdqu64 $S1,160($ctx)
3204
vpbroadcastq %x#$S1,$S1
3205
3206
jmp .Lblocks_vpmadd52_4x_key_loaded
3207
ud2
3208
3209
.align 32
3210
.Ldone_init_vpmadd52_2x:
3211
vmovdqu64 $R0,64($ctx) # save key powers
3212
vpsrldq \$8,$R0,$R0 # 0-1-0-2
3213
vmovdqu64 $R1,96($ctx)
3214
vpsrldq \$8,$R1,$R1
3215
vmovdqu64 $R2,128($ctx)
3216
vpsrldq \$8,$R2,$R2
3217
vmovdqu64 $S1,160($ctx)
3218
vpsrldq \$8,$S1,$S1
3219
jmp .Lblocks_vpmadd52_2x_key_loaded
3220
ud2
3221
3222
.align 32
3223
.Lblocks_vpmadd52_2x_do:
3224
vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3225
vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3226
vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3227
vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3228
3229
.Lblocks_vpmadd52_2x_key_loaded:
3230
vmovdqu64 16*0($inp),$T2 # load data
3231
vpxorq $T3,$T3,$T3
3232
lea 16*2($inp),$inp
3233
3234
vpunpcklqdq $T3,$T2,$T1 # transpose data
3235
vpunpckhqdq $T3,$T2,$T3
3236
3237
# at this point 64-bit lanes are ordered as x-1-x-0
3238
3239
vpsrlq \$24,$T3,$T2 # splat the data
3240
vporq $PAD,$T2,$T2
3241
vpaddq $T2,$H2,$H2 # accumulate input
3242
vpandq $mask44,$T1,$T0
3243
vpsrlq \$44,$T1,$T1
3244
vpsllq \$20,$T3,$T3
3245
vporq $T3,$T1,$T1
3246
vpandq $mask44,$T1,$T1
3247
3248
jmp .Ltail_vpmadd52_2x
3249
ud2
3250
3251
.align 32
3252
.Loop_vpmadd52_4x:
3253
#vpaddq $T2,$H2,$H2 # accumulate input
3254
vpaddq $T0,$H0,$H0
3255
vpaddq $T1,$H1,$H1
3256
3257
vpxorq $D0lo,$D0lo,$D0lo
3258
vpmadd52luq $H2,$S1,$D0lo
3259
vpxorq $D0hi,$D0hi,$D0hi
3260
vpmadd52huq $H2,$S1,$D0hi
3261
vpxorq $D1lo,$D1lo,$D1lo
3262
vpmadd52luq $H2,$S2,$D1lo
3263
vpxorq $D1hi,$D1hi,$D1hi
3264
vpmadd52huq $H2,$S2,$D1hi
3265
vpxorq $D2lo,$D2lo,$D2lo
3266
vpmadd52luq $H2,$R0,$D2lo
3267
vpxorq $D2hi,$D2hi,$D2hi
3268
vpmadd52huq $H2,$R0,$D2hi
3269
3270
vmovdqu64 16*0($inp),$T2 # load data
3271
vmovdqu64 16*2($inp),$T3
3272
lea 16*4($inp),$inp
3273
vpmadd52luq $H0,$R0,$D0lo
3274
vpmadd52huq $H0,$R0,$D0hi
3275
vpmadd52luq $H0,$R1,$D1lo
3276
vpmadd52huq $H0,$R1,$D1hi
3277
vpmadd52luq $H0,$R2,$D2lo
3278
vpmadd52huq $H0,$R2,$D2hi
3279
3280
vpunpcklqdq $T3,$T2,$T1 # transpose data
3281
vpunpckhqdq $T3,$T2,$T3
3282
vpmadd52luq $H1,$S2,$D0lo
3283
vpmadd52huq $H1,$S2,$D0hi
3284
vpmadd52luq $H1,$R0,$D1lo
3285
vpmadd52huq $H1,$R0,$D1hi
3286
vpmadd52luq $H1,$R1,$D2lo
3287
vpmadd52huq $H1,$R1,$D2hi
3288
3289
################################################################
3290
# partial reduction (interleaved with data splat)
3291
vpsrlq \$44,$D0lo,$tmp
3292
vpsllq \$8,$D0hi,$D0hi
3293
vpandq $mask44,$D0lo,$H0
3294
vpaddq $tmp,$D0hi,$D0hi
3295
3296
vpsrlq \$24,$T3,$T2
3297
vporq $PAD,$T2,$T2
3298
vpaddq $D0hi,$D1lo,$D1lo
3299
3300
vpsrlq \$44,$D1lo,$tmp
3301
vpsllq \$8,$D1hi,$D1hi
3302
vpandq $mask44,$D1lo,$H1
3303
vpaddq $tmp,$D1hi,$D1hi
3304
3305
vpandq $mask44,$T1,$T0
3306
vpsrlq \$44,$T1,$T1
3307
vpsllq \$20,$T3,$T3
3308
vpaddq $D1hi,$D2lo,$D2lo
3309
3310
vpsrlq \$42,$D2lo,$tmp
3311
vpsllq \$10,$D2hi,$D2hi
3312
vpandq $mask42,$D2lo,$H2
3313
vpaddq $tmp,$D2hi,$D2hi
3314
3315
vpaddq $T2,$H2,$H2 # accumulate input
3316
vpaddq $D2hi,$H0,$H0
3317
vpsllq \$2,$D2hi,$D2hi
3318
3319
vpaddq $D2hi,$H0,$H0
3320
vporq $T3,$T1,$T1
3321
vpandq $mask44,$T1,$T1
3322
3323
vpsrlq \$44,$H0,$tmp # additional step
3324
vpandq $mask44,$H0,$H0
3325
3326
vpaddq $tmp,$H1,$H1
3327
3328
sub \$4,$len # len-=64
3329
jnz .Loop_vpmadd52_4x
3330
3331
.Ltail_vpmadd52_4x:
3332
vmovdqu64 128($ctx),$R2 # load all key powers
3333
vmovdqu64 160($ctx),$S1
3334
vmovdqu64 64($ctx),$R0
3335
vmovdqu64 96($ctx),$R1
3336
3337
.Ltail_vpmadd52_2x:
3338
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3339
vpaddq $R2,$S2,$S2
3340
vpsllq \$2,$S2,$S2
3341
3342
#vpaddq $T2,$H2,$H2 # accumulate input
3343
vpaddq $T0,$H0,$H0
3344
vpaddq $T1,$H1,$H1
3345
3346
vpxorq $D0lo,$D0lo,$D0lo
3347
vpmadd52luq $H2,$S1,$D0lo
3348
vpxorq $D0hi,$D0hi,$D0hi
3349
vpmadd52huq $H2,$S1,$D0hi
3350
vpxorq $D1lo,$D1lo,$D1lo
3351
vpmadd52luq $H2,$S2,$D1lo
3352
vpxorq $D1hi,$D1hi,$D1hi
3353
vpmadd52huq $H2,$S2,$D1hi
3354
vpxorq $D2lo,$D2lo,$D2lo
3355
vpmadd52luq $H2,$R0,$D2lo
3356
vpxorq $D2hi,$D2hi,$D2hi
3357
vpmadd52huq $H2,$R0,$D2hi
3358
3359
vpmadd52luq $H0,$R0,$D0lo
3360
vpmadd52huq $H0,$R0,$D0hi
3361
vpmadd52luq $H0,$R1,$D1lo
3362
vpmadd52huq $H0,$R1,$D1hi
3363
vpmadd52luq $H0,$R2,$D2lo
3364
vpmadd52huq $H0,$R2,$D2hi
3365
3366
vpmadd52luq $H1,$S2,$D0lo
3367
vpmadd52huq $H1,$S2,$D0hi
3368
vpmadd52luq $H1,$R0,$D1lo
3369
vpmadd52huq $H1,$R0,$D1hi
3370
vpmadd52luq $H1,$R1,$D2lo
3371
vpmadd52huq $H1,$R1,$D2hi
3372
3373
################################################################
3374
# horizontal addition
3375
3376
mov \$1,%eax
3377
kmovw %eax,%k1
3378
vpsrldq \$8,$D0lo,$T0
3379
vpsrldq \$8,$D0hi,$H0
3380
vpsrldq \$8,$D1lo,$T1
3381
vpsrldq \$8,$D1hi,$H1
3382
vpaddq $T0,$D0lo,$D0lo
3383
vpaddq $H0,$D0hi,$D0hi
3384
vpsrldq \$8,$D2lo,$T2
3385
vpsrldq \$8,$D2hi,$H2
3386
vpaddq $T1,$D1lo,$D1lo
3387
vpaddq $H1,$D1hi,$D1hi
3388
vpermq \$0x2,$D0lo,$T0
3389
vpermq \$0x2,$D0hi,$H0
3390
vpaddq $T2,$D2lo,$D2lo
3391
vpaddq $H2,$D2hi,$D2hi
3392
3393
vpermq \$0x2,$D1lo,$T1
3394
vpermq \$0x2,$D1hi,$H1
3395
vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3396
vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3397
vpermq \$0x2,$D2lo,$T2
3398
vpermq \$0x2,$D2hi,$H2
3399
vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3400
vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3401
vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3402
vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3403
3404
################################################################
3405
# partial reduction
3406
vpsrlq \$44,$D0lo,$tmp
3407
vpsllq \$8,$D0hi,$D0hi
3408
vpandq $mask44,$D0lo,$H0
3409
vpaddq $tmp,$D0hi,$D0hi
3410
3411
vpaddq $D0hi,$D1lo,$D1lo
3412
3413
vpsrlq \$44,$D1lo,$tmp
3414
vpsllq \$8,$D1hi,$D1hi
3415
vpandq $mask44,$D1lo,$H1
3416
vpaddq $tmp,$D1hi,$D1hi
3417
3418
vpaddq $D1hi,$D2lo,$D2lo
3419
3420
vpsrlq \$42,$D2lo,$tmp
3421
vpsllq \$10,$D2hi,$D2hi
3422
vpandq $mask42,$D2lo,$H2
3423
vpaddq $tmp,$D2hi,$D2hi
3424
3425
vpaddq $D2hi,$H0,$H0
3426
vpsllq \$2,$D2hi,$D2hi
3427
3428
vpaddq $D2hi,$H0,$H0
3429
3430
vpsrlq \$44,$H0,$tmp # additional step
3431
vpandq $mask44,$H0,$H0
3432
3433
vpaddq $tmp,$H1,$H1
3434
# at this point $len is
3435
# either 4*n+2 or 0...
3436
sub \$2,$len # len-=32
3437
ja .Lblocks_vpmadd52_4x_do
3438
3439
vmovq %x#$H0,0($ctx)
3440
vmovq %x#$H1,8($ctx)
3441
vmovq %x#$H2,16($ctx)
3442
vzeroall
3443
3444
.Lno_data_vpmadd52_4x:
3445
RET
3446
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3447
___
3448
}
3449
{
3450
########################################################################
3451
# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
3452
# This is an intermediate version, as it's used only in cases when the input
3453
# length is either 8*n, 8*n+1 or 8*n+2...
3454
3455
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3456
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3457
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3458
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3459
3460
$code.=<<___;
3461
.type poly1305_blocks_vpmadd52_8x,\@function,4
3462
.align 32
3463
poly1305_blocks_vpmadd52_8x:
3464
shr \$4,$len
3465
jz .Lno_data_vpmadd52_8x # too short
3466
3467
shl \$40,$padbit
3468
mov 64($ctx),%r8 # peek on power of the key
3469
3470
vmovdqa64 .Lx_mask44(%rip),$mask44
3471
vmovdqa64 .Lx_mask42(%rip),$mask42
3472
3473
test %r8,%r8 # is power value impossible?
3474
js .Linit_vpmadd52 # if it is, then init R[4]
3475
3476
vmovq 0($ctx),%x#$H0 # load current hash value
3477
vmovq 8($ctx),%x#$H1
3478
vmovq 16($ctx),%x#$H2
3479
3480
.Lblocks_vpmadd52_8x:
3481
################################################################
3482
# first we calculate more key powers
3483
3484
vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3485
vmovdqu64 160($ctx),$S1
3486
vmovdqu64 64($ctx),$R0
3487
vmovdqu64 96($ctx),$R1
3488
3489
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3490
vpaddq $R2,$S2,$S2
3491
vpsllq \$2,$S2,$S2
3492
3493
vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3494
vpbroadcastq %x#$R0,$RR0
3495
vpbroadcastq %x#$R1,$RR1
3496
3497
vpxorq $D0lo,$D0lo,$D0lo
3498
vpmadd52luq $RR2,$S1,$D0lo
3499
vpxorq $D0hi,$D0hi,$D0hi
3500
vpmadd52huq $RR2,$S1,$D0hi
3501
vpxorq $D1lo,$D1lo,$D1lo
3502
vpmadd52luq $RR2,$S2,$D1lo
3503
vpxorq $D1hi,$D1hi,$D1hi
3504
vpmadd52huq $RR2,$S2,$D1hi
3505
vpxorq $D2lo,$D2lo,$D2lo
3506
vpmadd52luq $RR2,$R0,$D2lo
3507
vpxorq $D2hi,$D2hi,$D2hi
3508
vpmadd52huq $RR2,$R0,$D2hi
3509
3510
vpmadd52luq $RR0,$R0,$D0lo
3511
vpmadd52huq $RR0,$R0,$D0hi
3512
vpmadd52luq $RR0,$R1,$D1lo
3513
vpmadd52huq $RR0,$R1,$D1hi
3514
vpmadd52luq $RR0,$R2,$D2lo
3515
vpmadd52huq $RR0,$R2,$D2hi
3516
3517
vpmadd52luq $RR1,$S2,$D0lo
3518
vpmadd52huq $RR1,$S2,$D0hi
3519
vpmadd52luq $RR1,$R0,$D1lo
3520
vpmadd52huq $RR1,$R0,$D1hi
3521
vpmadd52luq $RR1,$R1,$D2lo
3522
vpmadd52huq $RR1,$R1,$D2hi
3523
3524
################################################################
3525
# partial reduction
3526
vpsrlq \$44,$D0lo,$tmp
3527
vpsllq \$8,$D0hi,$D0hi
3528
vpandq $mask44,$D0lo,$RR0
3529
vpaddq $tmp,$D0hi,$D0hi
3530
3531
vpaddq $D0hi,$D1lo,$D1lo
3532
3533
vpsrlq \$44,$D1lo,$tmp
3534
vpsllq \$8,$D1hi,$D1hi
3535
vpandq $mask44,$D1lo,$RR1
3536
vpaddq $tmp,$D1hi,$D1hi
3537
3538
vpaddq $D1hi,$D2lo,$D2lo
3539
3540
vpsrlq \$42,$D2lo,$tmp
3541
vpsllq \$10,$D2hi,$D2hi
3542
vpandq $mask42,$D2lo,$RR2
3543
vpaddq $tmp,$D2hi,$D2hi
3544
3545
vpaddq $D2hi,$RR0,$RR0
3546
vpsllq \$2,$D2hi,$D2hi
3547
3548
vpaddq $D2hi,$RR0,$RR0
3549
3550
vpsrlq \$44,$RR0,$tmp # additional step
3551
vpandq $mask44,$RR0,$RR0
3552
3553
vpaddq $tmp,$RR1,$RR1
3554
3555
################################################################
3556
# At this point Rx holds 1324 powers, RRx - 5768, and the goal
3557
# is 15263748, which reflects how data is loaded...
3558
3559
vpunpcklqdq $R2,$RR2,$T2 # 3748
3560
vpunpckhqdq $R2,$RR2,$R2 # 1526
3561
vpunpcklqdq $R0,$RR0,$T0
3562
vpunpckhqdq $R0,$RR0,$R0
3563
vpunpcklqdq $R1,$RR1,$T1
3564
vpunpckhqdq $R1,$RR1,$R1
3565
___
3566
######## switch to %zmm
3567
map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3568
map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3569
map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3570
map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3571
3572
$code.=<<___;
3573
vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3574
vshufi64x2 \$0x44,$R0,$T0,$RR0
3575
vshufi64x2 \$0x44,$R1,$T1,$RR1
3576
3577
vmovdqu64 16*0($inp),$T2 # load data
3578
vmovdqu64 16*4($inp),$T3
3579
lea 16*8($inp),$inp
3580
3581
vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3582
vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3583
vpaddq $RR2,$SS2,$SS2
3584
vpaddq $RR1,$SS1,$SS1
3585
vpsllq \$2,$SS2,$SS2
3586
vpsllq \$2,$SS1,$SS1
3587
3588
vpbroadcastq $padbit,$PAD
3589
vpbroadcastq %x#$mask44,$mask44
3590
vpbroadcastq %x#$mask42,$mask42
3591
3592
vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3593
vpbroadcastq %x#$SS2,$S2
3594
vpbroadcastq %x#$RR0,$R0
3595
vpbroadcastq %x#$RR1,$R1
3596
vpbroadcastq %x#$RR2,$R2
3597
3598
vpunpcklqdq $T3,$T2,$T1 # transpose data
3599
vpunpckhqdq $T3,$T2,$T3
3600
3601
# at this point 64-bit lanes are ordered as 73625140
3602
3603
vpsrlq \$24,$T3,$T2 # splat the data
3604
vporq $PAD,$T2,$T2
3605
vpaddq $T2,$H2,$H2 # accumulate input
3606
vpandq $mask44,$T1,$T0
3607
vpsrlq \$44,$T1,$T1
3608
vpsllq \$20,$T3,$T3
3609
vporq $T3,$T1,$T1
3610
vpandq $mask44,$T1,$T1
3611
3612
sub \$8,$len
3613
jz .Ltail_vpmadd52_8x
3614
jmp .Loop_vpmadd52_8x
3615
3616
.align 32
3617
.Loop_vpmadd52_8x:
3618
#vpaddq $T2,$H2,$H2 # accumulate input
3619
vpaddq $T0,$H0,$H0
3620
vpaddq $T1,$H1,$H1
3621
3622
vpxorq $D0lo,$D0lo,$D0lo
3623
vpmadd52luq $H2,$S1,$D0lo
3624
vpxorq $D0hi,$D0hi,$D0hi
3625
vpmadd52huq $H2,$S1,$D0hi
3626
vpxorq $D1lo,$D1lo,$D1lo
3627
vpmadd52luq $H2,$S2,$D1lo
3628
vpxorq $D1hi,$D1hi,$D1hi
3629
vpmadd52huq $H2,$S2,$D1hi
3630
vpxorq $D2lo,$D2lo,$D2lo
3631
vpmadd52luq $H2,$R0,$D2lo
3632
vpxorq $D2hi,$D2hi,$D2hi
3633
vpmadd52huq $H2,$R0,$D2hi
3634
3635
vmovdqu64 16*0($inp),$T2 # load data
3636
vmovdqu64 16*4($inp),$T3
3637
lea 16*8($inp),$inp
3638
vpmadd52luq $H0,$R0,$D0lo
3639
vpmadd52huq $H0,$R0,$D0hi
3640
vpmadd52luq $H0,$R1,$D1lo
3641
vpmadd52huq $H0,$R1,$D1hi
3642
vpmadd52luq $H0,$R2,$D2lo
3643
vpmadd52huq $H0,$R2,$D2hi
3644
3645
vpunpcklqdq $T3,$T2,$T1 # transpose data
3646
vpunpckhqdq $T3,$T2,$T3
3647
vpmadd52luq $H1,$S2,$D0lo
3648
vpmadd52huq $H1,$S2,$D0hi
3649
vpmadd52luq $H1,$R0,$D1lo
3650
vpmadd52huq $H1,$R0,$D1hi
3651
vpmadd52luq $H1,$R1,$D2lo
3652
vpmadd52huq $H1,$R1,$D2hi
3653
3654
################################################################
3655
# partial reduction (interleaved with data splat)
3656
vpsrlq \$44,$D0lo,$tmp
3657
vpsllq \$8,$D0hi,$D0hi
3658
vpandq $mask44,$D0lo,$H0
3659
vpaddq $tmp,$D0hi,$D0hi
3660
3661
vpsrlq \$24,$T3,$T2
3662
vporq $PAD,$T2,$T2
3663
vpaddq $D0hi,$D1lo,$D1lo
3664
3665
vpsrlq \$44,$D1lo,$tmp
3666
vpsllq \$8,$D1hi,$D1hi
3667
vpandq $mask44,$D1lo,$H1
3668
vpaddq $tmp,$D1hi,$D1hi
3669
3670
vpandq $mask44,$T1,$T0
3671
vpsrlq \$44,$T1,$T1
3672
vpsllq \$20,$T3,$T3
3673
vpaddq $D1hi,$D2lo,$D2lo
3674
3675
vpsrlq \$42,$D2lo,$tmp
3676
vpsllq \$10,$D2hi,$D2hi
3677
vpandq $mask42,$D2lo,$H2
3678
vpaddq $tmp,$D2hi,$D2hi
3679
3680
vpaddq $T2,$H2,$H2 # accumulate input
3681
vpaddq $D2hi,$H0,$H0
3682
vpsllq \$2,$D2hi,$D2hi
3683
3684
vpaddq $D2hi,$H0,$H0
3685
vporq $T3,$T1,$T1
3686
vpandq $mask44,$T1,$T1
3687
3688
vpsrlq \$44,$H0,$tmp # additional step
3689
vpandq $mask44,$H0,$H0
3690
3691
vpaddq $tmp,$H1,$H1
3692
3693
sub \$8,$len # len-=128
3694
jnz .Loop_vpmadd52_8x
3695
3696
.Ltail_vpmadd52_8x:
3697
#vpaddq $T2,$H2,$H2 # accumulate input
3698
vpaddq $T0,$H0,$H0
3699
vpaddq $T1,$H1,$H1
3700
3701
vpxorq $D0lo,$D0lo,$D0lo
3702
vpmadd52luq $H2,$SS1,$D0lo
3703
vpxorq $D0hi,$D0hi,$D0hi
3704
vpmadd52huq $H2,$SS1,$D0hi
3705
vpxorq $D1lo,$D1lo,$D1lo
3706
vpmadd52luq $H2,$SS2,$D1lo
3707
vpxorq $D1hi,$D1hi,$D1hi
3708
vpmadd52huq $H2,$SS2,$D1hi
3709
vpxorq $D2lo,$D2lo,$D2lo
3710
vpmadd52luq $H2,$RR0,$D2lo
3711
vpxorq $D2hi,$D2hi,$D2hi
3712
vpmadd52huq $H2,$RR0,$D2hi
3713
3714
vpmadd52luq $H0,$RR0,$D0lo
3715
vpmadd52huq $H0,$RR0,$D0hi
3716
vpmadd52luq $H0,$RR1,$D1lo
3717
vpmadd52huq $H0,$RR1,$D1hi
3718
vpmadd52luq $H0,$RR2,$D2lo
3719
vpmadd52huq $H0,$RR2,$D2hi
3720
3721
vpmadd52luq $H1,$SS2,$D0lo
3722
vpmadd52huq $H1,$SS2,$D0hi
3723
vpmadd52luq $H1,$RR0,$D1lo
3724
vpmadd52huq $H1,$RR0,$D1hi
3725
vpmadd52luq $H1,$RR1,$D2lo
3726
vpmadd52huq $H1,$RR1,$D2hi
3727
3728
################################################################
3729
# horizontal addition
3730
3731
mov \$1,%eax
3732
kmovw %eax,%k1
3733
vpsrldq \$8,$D0lo,$T0
3734
vpsrldq \$8,$D0hi,$H0
3735
vpsrldq \$8,$D1lo,$T1
3736
vpsrldq \$8,$D1hi,$H1
3737
vpaddq $T0,$D0lo,$D0lo
3738
vpaddq $H0,$D0hi,$D0hi
3739
vpsrldq \$8,$D2lo,$T2
3740
vpsrldq \$8,$D2hi,$H2
3741
vpaddq $T1,$D1lo,$D1lo
3742
vpaddq $H1,$D1hi,$D1hi
3743
vpermq \$0x2,$D0lo,$T0
3744
vpermq \$0x2,$D0hi,$H0
3745
vpaddq $T2,$D2lo,$D2lo
3746
vpaddq $H2,$D2hi,$D2hi
3747
3748
vpermq \$0x2,$D1lo,$T1
3749
vpermq \$0x2,$D1hi,$H1
3750
vpaddq $T0,$D0lo,$D0lo
3751
vpaddq $H0,$D0hi,$D0hi
3752
vpermq \$0x2,$D2lo,$T2
3753
vpermq \$0x2,$D2hi,$H2
3754
vpaddq $T1,$D1lo,$D1lo
3755
vpaddq $H1,$D1hi,$D1hi
3756
vextracti64x4 \$1,$D0lo,%y#$T0
3757
vextracti64x4 \$1,$D0hi,%y#$H0
3758
vpaddq $T2,$D2lo,$D2lo
3759
vpaddq $H2,$D2hi,$D2hi
3760
3761
vextracti64x4 \$1,$D1lo,%y#$T1
3762
vextracti64x4 \$1,$D1hi,%y#$H1
3763
vextracti64x4 \$1,$D2lo,%y#$T2
3764
vextracti64x4 \$1,$D2hi,%y#$H2
3765
___
3766
######## switch back to %ymm
3767
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3768
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3769
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3770
3771
$code.=<<___;
3772
vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3773
vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3774
vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3775
vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3776
vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3777
vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3778
3779
################################################################
3780
# partial reduction
3781
vpsrlq \$44,$D0lo,$tmp
3782
vpsllq \$8,$D0hi,$D0hi
3783
vpandq $mask44,$D0lo,$H0
3784
vpaddq $tmp,$D0hi,$D0hi
3785
3786
vpaddq $D0hi,$D1lo,$D1lo
3787
3788
vpsrlq \$44,$D1lo,$tmp
3789
vpsllq \$8,$D1hi,$D1hi
3790
vpandq $mask44,$D1lo,$H1
3791
vpaddq $tmp,$D1hi,$D1hi
3792
3793
vpaddq $D1hi,$D2lo,$D2lo
3794
3795
vpsrlq \$42,$D2lo,$tmp
3796
vpsllq \$10,$D2hi,$D2hi
3797
vpandq $mask42,$D2lo,$H2
3798
vpaddq $tmp,$D2hi,$D2hi
3799
3800
vpaddq $D2hi,$H0,$H0
3801
vpsllq \$2,$D2hi,$D2hi
3802
3803
vpaddq $D2hi,$H0,$H0
3804
3805
vpsrlq \$44,$H0,$tmp # additional step
3806
vpandq $mask44,$H0,$H0
3807
3808
vpaddq $tmp,$H1,$H1
3809
3810
################################################################
3811
3812
vmovq %x#$H0,0($ctx)
3813
vmovq %x#$H1,8($ctx)
3814
vmovq %x#$H2,16($ctx)
3815
vzeroall
3816
3817
.Lno_data_vpmadd52_8x:
3818
RET
3819
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3820
___
3821
}
3822
$code.=<<___;
3823
.type poly1305_emit_base2_44,\@function,3
3824
.align 32
3825
poly1305_emit_base2_44:
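# Convert the 44/44/42-bit limbs back into two 64-bit words, perform
# the final reduction modulo 2^130-5 (add 5 and, if bit 130 became set,
# keep h+5 truncated to 128 bits), add the 128-bit nonce and store the
# 16-byte tag.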
3826
mov 0($ctx),%r8 # load hash value
3827
mov 8($ctx),%r9
3828
mov 16($ctx),%r10
3829
3830
mov %r9,%rax
3831
shr \$20,%r9
3832
shl \$44,%rax
3833
mov %r10,%rcx
3834
shr \$40,%r10
3835
shl \$24,%rcx
3836
3837
add %rax,%r8
3838
adc %rcx,%r9
3839
adc \$0,%r10
3840
3841
mov %r8,%rax
3842
add \$5,%r8 # compare to modulus
3843
mov %r9,%rcx
3844
adc \$0,%r9
3845
adc \$0,%r10
3846
shr \$2,%r10 # did 130-bit value overflow?
3847
cmovnz %r8,%rax
3848
cmovnz %r9,%rcx
3849
3850
add 0($nonce),%rax # accumulate nonce
3851
adc 8($nonce),%rcx
3852
mov %rax,0($mac) # write result
3853
mov %rcx,8($mac)
3854
3855
RET
3856
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3857
___
3858
} } }
3859
}
3860
3861
if (!$kernel)
3862
{ # chacha20-poly1305 helpers
3863
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3864
("%rdi","%rsi","%rdx","%rcx"); # Unix order
3865
$code.=<<___;
3866
.globl xor128_encrypt_n_pad
3867
.type xor128_encrypt_n_pad,\@abi-omnipotent
3868
.align 16
3869
xor128_encrypt_n_pad:
3870
sub $otp,$inp
3871
sub $otp,$out
3872
mov $len,%r10 # put len aside
3873
shr \$4,$len # len / 16
3874
jz .Ltail_enc
3875
nop
3876
.Loop_enc_xmm:
3877
movdqu ($inp,$otp),%xmm0
3878
pxor ($otp),%xmm0
3879
movdqu %xmm0,($out,$otp)
3880
movdqa %xmm0,($otp)
3881
lea 16($otp),$otp
3882
dec $len
3883
jnz .Loop_enc_xmm
3884
3885
and \$15,%r10 # len % 16
3886
jz .Ldone_enc
3887
3888
.Ltail_enc:
3889
mov \$16,$len
3890
sub %r10,$len
3891
xor %eax,%eax
3892
.Loop_enc_byte:
3893
mov ($inp,$otp),%al
3894
xor ($otp),%al
3895
mov %al,($out,$otp)
3896
mov %al,($otp)
3897
lea 1($otp),$otp
3898
dec %r10
3899
jnz .Loop_enc_byte
3900
3901
xor %eax,%eax
3902
.Loop_enc_pad:
3903
mov %al,($otp)
3904
lea 1($otp),$otp
3905
dec $len
3906
jnz .Loop_enc_pad
3907
3908
.Ldone_enc:
3909
mov $otp,%rax
3910
RET
3911
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3912
3913
.globl xor128_decrypt_n_pad
3914
.type xor128_decrypt_n_pad,\@abi-omnipotent
3915
.align 16
3916
xor128_decrypt_n_pad:
3917
sub $otp,$inp
3918
sub $otp,$out
3919
mov $len,%r10 # put len aside
3920
shr \$4,$len # len / 16
3921
jz .Ltail_dec
3922
nop
3923
.Loop_dec_xmm:
3924
movdqu ($inp,$otp),%xmm0
3925
movdqa ($otp),%xmm1
3926
pxor %xmm0,%xmm1
3927
movdqu %xmm1,($out,$otp)
3928
movdqa %xmm0,($otp)
3929
lea 16($otp),$otp
3930
dec $len
3931
jnz .Loop_dec_xmm
3932
3933
pxor %xmm1,%xmm1
3934
and \$15,%r10 # len % 16
3935
jz .Ldone_dec
3936
3937
.Ltail_dec:
3938
mov \$16,$len
3939
sub %r10,$len
3940
xor %eax,%eax
3941
xor %r11d,%r11d
3942
.Loop_dec_byte:
3943
mov ($inp,$otp),%r11b
3944
mov ($otp),%al
3945
xor %r11b,%al
3946
mov %al,($out,$otp)
3947
mov %r11b,($otp)
3948
lea 1($otp),$otp
3949
dec %r10
3950
jnz .Loop_dec_byte
3951
3952
xor %eax,%eax
3953
.Loop_dec_pad:
3954
mov %al,($otp)
3955
lea 1($otp),$otp
3956
dec $len
3957
jnz .Loop_dec_pad
3958
3959
.Ldone_dec:
3960
mov $otp,%rax
3961
RET
3962
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3963
___
3964
}
3965
3966
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3967
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
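# Both handlers locate the function's prologue/epilogue labels through
# the HandlerData RVAs and, if the fault lies between them, recover the
# caller's frame: se_handler reloads the saved GPRs, while avx_handler
# copies the stashed %xmm6-%xmm15 from the stack frame back into the
# CONTEXT record; both then hand off to RtlVirtualUnwind via the common
# tail.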
3968
if ($win64) {
3969
$rec="%rcx";
3970
$frame="%rdx";
3971
$context="%r8";
3972
$disp="%r9";
3973
3974
$code.=<<___;
3975
.extern __imp_RtlVirtualUnwind
3976
.type se_handler,\@abi-omnipotent
3977
.align 16
3978
se_handler:
3979
push %rsi
3980
push %rdi
3981
push %rbx
3982
push %rbp
3983
push %r12
3984
push %r13
3985
push %r14
3986
push %r15
3987
pushfq
3988
sub \$64,%rsp
3989
3990
mov 120($context),%rax # pull context->Rax
3991
mov 248($context),%rbx # pull context->Rip
3992
3993
mov 8($disp),%rsi # disp->ImageBase
3994
mov 56($disp),%r11 # disp->HandlerData
3995
3996
mov 0(%r11),%r10d # HandlerData[0]
3997
lea (%rsi,%r10),%r10 # prologue label
3998
cmp %r10,%rbx # context->Rip<.Lprologue
3999
jb .Lcommon_seh_tail
4000
4001
mov 152($context),%rax # pull context->Rsp
4002
4003
mov 4(%r11),%r10d # HandlerData[1]
4004
lea (%rsi,%r10),%r10 # epilogue label
4005
cmp %r10,%rbx # context->Rip>=.Lepilogue
4006
jae .Lcommon_seh_tail
4007
4008
lea 48(%rax),%rax
4009
4010
mov -8(%rax),%rbx
4011
mov -16(%rax),%rbp
4012
mov -24(%rax),%r12
4013
mov -32(%rax),%r13
4014
mov -40(%rax),%r14
4015
mov -48(%rax),%r15
4016
mov %rbx,144($context) # restore context->Rbx
4017
mov %rbp,160($context) # restore context->Rbp
4018
mov %r12,216($context) # restore context->R12
4019
mov %r13,224($context) # restore context->R13
4020
mov %r14,232($context) # restore context->R14
4021
mov %r15,240($context) # restore context->R15
4022
4023
jmp .Lcommon_seh_tail
4024
.size se_handler,.-se_handler
4025
4026
.type avx_handler,\@abi-omnipotent
4027
.align 16
4028
avx_handler:
4029
push %rsi
4030
push %rdi
4031
push %rbx
4032
push %rbp
4033
push %r12
4034
push %r13
4035
push %r14
4036
push %r15
4037
pushfq
4038
sub \$64,%rsp
4039
4040
mov 120($context),%rax # pull context->Rax
4041
mov 248($context),%rbx # pull context->Rip
4042
4043
mov 8($disp),%rsi # disp->ImageBase
4044
mov 56($disp),%r11 # disp->HandlerData
4045
4046
mov 0(%r11),%r10d # HandlerData[0]
4047
lea (%rsi,%r10),%r10 # prologue label
4048
cmp %r10,%rbx # context->Rip<prologue label
4049
jb .Lcommon_seh_tail
4050
4051
mov 152($context),%rax # pull context->Rsp
4052
4053
mov 4(%r11),%r10d # HandlerData[1]
4054
lea (%rsi,%r10),%r10 # epilogue label
4055
cmp %r10,%rbx # context->Rip>=epilogue label
4056
jae .Lcommon_seh_tail
4057
4058
mov 208($context),%rax # pull context->R11
4059
4060
lea 0x50(%rax),%rsi
4061
lea 0xf8(%rax),%rax
4062
lea 512($context),%rdi # &context.Xmm6
4063
mov \$20,%ecx
4064
.long 0xa548f3fc # cld; rep movsq
4065
4066
.Lcommon_seh_tail:
4067
mov 8(%rax),%rdi
4068
mov 16(%rax),%rsi
4069
mov %rax,152($context) # restore context->Rsp
4070
mov %rsi,168($context) # restore context->Rsi
4071
mov %rdi,176($context) # restore context->Rdi
4072
4073
mov 40($disp),%rdi # disp->ContextRecord
4074
mov $context,%rsi # context
4075
mov \$154,%ecx # sizeof(CONTEXT)
4076
.long 0xa548f3fc # cld; rep movsq
4077
4078
mov $disp,%rsi
4079
xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
4080
mov 8(%rsi),%rdx # arg2, disp->ImageBase
4081
mov 0(%rsi),%r8 # arg3, disp->ControlPc
4082
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4083
mov 40(%rsi),%r10 # disp->ContextRecord
4084
lea 56(%rsi),%r11 # &disp->HandlerData
4085
lea 24(%rsi),%r12 # &disp->EstablisherFrame
4086
mov %r10,32(%rsp) # arg5
4087
mov %r11,40(%rsp) # arg6
4088
mov %r12,48(%rsp) # arg7
4089
mov %rcx,56(%rsp) # arg8, (NULL)
4090
call *__imp_RtlVirtualUnwind(%rip)
4091
4092
mov \$1,%eax # ExceptionContinueSearch
4093
add \$64,%rsp
4094
popfq
4095
pop %r15
4096
pop %r14
4097
pop %r13
4098
pop %r12
4099
pop %rbp
4100
pop %rbx
4101
pop %rdi
4102
pop %rsi
4103
RET
4104
.size avx_handler,.-avx_handler
4105
4106
.section .pdata
4107
.align 4
4108
.rva .LSEH_begin_poly1305_init_x86_64
4109
.rva .LSEH_end_poly1305_init_x86_64
4110
.rva .LSEH_info_poly1305_init_x86_64
4111
4112
.rva .LSEH_begin_poly1305_blocks_x86_64
4113
.rva .LSEH_end_poly1305_blocks_x86_64
4114
.rva .LSEH_info_poly1305_blocks_x86_64
4115
4116
.rva .LSEH_begin_poly1305_emit_x86_64
4117
.rva .LSEH_end_poly1305_emit_x86_64
4118
.rva .LSEH_info_poly1305_emit_x86_64
4119
___
4120
$code.=<<___ if ($avx);
4121
.rva .LSEH_begin_poly1305_blocks_avx
4122
.rva .Lbase2_64_avx
4123
.rva .LSEH_info_poly1305_blocks_avx_1
4124
4125
.rva .Lbase2_64_avx
4126
.rva .Leven_avx
4127
.rva .LSEH_info_poly1305_blocks_avx_2
4128
4129
.rva .Leven_avx
4130
.rva .LSEH_end_poly1305_blocks_avx
4131
.rva .LSEH_info_poly1305_blocks_avx_3
4132
4133
.rva .LSEH_begin_poly1305_emit_avx
4134
.rva .LSEH_end_poly1305_emit_avx
4135
.rva .LSEH_info_poly1305_emit_avx
4136
___
4137
$code.=<<___ if ($avx>1);
4138
.rva .LSEH_begin_poly1305_blocks_avx2
4139
.rva .Lbase2_64_avx2
4140
.rva .LSEH_info_poly1305_blocks_avx2_1
4141
4142
.rva .Lbase2_64_avx2
4143
.rva .Leven_avx2
4144
.rva .LSEH_info_poly1305_blocks_avx2_2
4145
4146
.rva .Leven_avx2
4147
.rva .LSEH_end_poly1305_blocks_avx2
4148
.rva .LSEH_info_poly1305_blocks_avx2_3
4149
___
4150
$code.=<<___ if ($avx>2);
4151
.rva .LSEH_begin_poly1305_blocks_avx512
4152
.rva .LSEH_end_poly1305_blocks_avx512
4153
.rva .LSEH_info_poly1305_blocks_avx512
4154
___
4155
$code.=<<___;
4156
.section .xdata
4157
.align 8
4158
.LSEH_info_poly1305_init_x86_64:
4159
.byte 9,0,0,0
4160
.rva se_handler
4161
.rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
4162
4163
.LSEH_info_poly1305_blocks_x86_64:
4164
.byte 9,0,0,0
4165
.rva se_handler
4166
.rva .Lblocks_body,.Lblocks_epilogue
4167
4168
.LSEH_info_poly1305_emit_x86_64:
4169
.byte 9,0,0,0
4170
.rva se_handler
4171
.rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
4172
___
4173
$code.=<<___ if ($avx);
4174
.LSEH_info_poly1305_blocks_avx_1:
4175
.byte 9,0,0,0
4176
.rva se_handler
4177
.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
4178
4179
.LSEH_info_poly1305_blocks_avx_2:
4180
.byte 9,0,0,0
4181
.rva se_handler
4182
.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
4183
4184
.LSEH_info_poly1305_blocks_avx_3:
4185
.byte 9,0,0,0
4186
.rva avx_handler
4187
.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
4188
4189
.LSEH_info_poly1305_emit_avx:
4190
.byte 9,0,0,0
4191
.rva se_handler
4192
.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4193
___
4194
$code.=<<___ if ($avx>1);
4195
.LSEH_info_poly1305_blocks_avx2_1:
4196
.byte 9,0,0,0
4197
.rva se_handler
4198
.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
4199
4200
.LSEH_info_poly1305_blocks_avx2_2:
4201
.byte 9,0,0,0
4202
.rva se_handler
4203
.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
4204
4205
.LSEH_info_poly1305_blocks_avx2_3:
4206
.byte 9,0,0,0
4207
.rva avx_handler
4208
.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
4209
___
4210
$code.=<<___ if ($avx>2);
4211
.LSEH_info_poly1305_blocks_avx512:
4212
.byte 9,0,0,0
4213
.rva avx_handler
4214
.rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
4215
___
4216
}
4217
4218
open SELF,$0;
4219
while(<SELF>) {
4220
next if (/^#!/);
4221
last if (!s/^#/\/\// and !/^$/);
4222
print;
4223
}
4224
close SELF;
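# The loop above replays this file's own leading comment block (with
# "#" turned into "//") so that the generated output carries the
# licence header.  The loop below post-processes $code: back-quoted
# expressions are evaluated into constant offsets, register-width
# markers such as %x#%y and #d are resolved to concrete register
# names, and for the kernel build .type annotations are simplified and
# .cfi directives are dropped.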
4225
4226
foreach (split('\n',$code)) {
4227
s/\`([^\`]*)\`/eval($1)/ge;
4228
s/%r([a-z]+)#d/%e$1/g;
4229
s/%r([0-9]+)#d/%r$1d/g;
4230
s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
4231
4232
if ($kernel) {
4233
s/(^\.type.*),[0-9]+$/\1/;
4234
s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
4235
next if /^\.cfi.*/;
4236
}
4237
4238
print $_,"\n";
4239
}
4240
close STDOUT;
4241
4242