Path: blob/master/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
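Editor's note (not part of the file): every code path below computes the same Poly1305 MAC. The first 16-byte key half r is clamped, each 16-byte message block m (extended by a 2^128 "padbit") is folded into the accumulator as h = (h + m) * r mod 2^130 - 5, and the tag is the low 128 bits of h + s. A minimal reference sketch of that arithmetic in plain Perl with Math::BigInt follows; it assumes a 64-bit perl and whole 16-byte blocks, and the function names are illustrative only, they do not appear in this file.

use strict;
use warnings;
use Math::BigInt;

# Interpret a 16-byte little-endian string as an integer (needs 64-bit perl).
sub le128 {
	my ($lo, $hi) = unpack("Q<2", $_[0]);
	return Math::BigInt->new("$hi")->blsft(64)->badd("$lo");
}

# Reference Poly1305 over whole 16-byte blocks (short-block padding omitted).
sub poly1305_ref {
	my ($key, $msg) = @_;	# 32-byte key, length($msg) a multiple of 16
	my $p = Math::BigInt->new(1)->blsft(130)->bsub(5);	# 2^130 - 5
	my $r = le128(substr($key, 0, 16))	# clamp r, as poly1305_init does
		->band(Math::BigInt->from_hex("0ffffffc0ffffffc0ffffffc0fffffff"));
	my $s = le128(substr($key, 16, 16));
	my $h = Math::BigInt->bzero();
	for (my $i = 0; $i < length $msg; $i += 16) {
		my $m = le128(substr($msg, $i, 16))
			->badd(Math::BigInt->new(1)->blsft(128));	# the "padbit"
		$h->badd($m)->bmul($r)->bmod($p);	# h = (h + m)*r mod 2^130-5
	}
	# tag = low 128 bits of h + s (poly1305_emit writes it out little-endian)
	return $h->badd($s)->bmod(Math::BigInt->new(1)->blsft(128));
}

The assembly below implements exactly this arithmetic; the paths differ only in how h is represented while blocks are processed: three 64-bit limbs in the scalar code, five 26-bit limbs in the AVX/AVX2/AVX-512 code (see the layout comments further down).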
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <[email protected]>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <[email protected]>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#			IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4			4.46/+120%	-
# Core 2		2.41/+90%	-
# Westmere		1.88/+120%	-
# Sandy Bridge		1.39/+140%	1.10
# Haswell		1.14/+175%	1.11		0.65
# Skylake[-X]		1.13/+120%	0.96		0.51	[0.35]
# Silvermont		2.83/+95%	-
# Knights L		3.60/?		1.65		1.10	0.41(***)
# Goldmont		1.70/+180%	-
# VIA Nano		1.82/+150%	-
# Sledgehammer		1.38/+160%	-
# Bulldozer		2.30/+130%	0.97
# Ryzen			1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, by 30-50% (the gain shrinking on newer parts),
#	while slower on contemporary ones, for example almost 2x slower
#	on Atom; as the former are naturally disappearing, SSE2 is deemed
#	unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
	*STDOUT=*OUT;

	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

sub declare_function() {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}

sub end_function() {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by 
<appro\@openssl.org>"173.align 16174___175176my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");177my ($mac,$nonce)=($inp,$len); # *_emit arguments178my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");179my ($h0,$h1,$h2)=("%r14","%rbx","%r10");180181sub poly1305_iteration {182# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1183# output: $h0-$h2 *= $r0-$r1184$code.=<<___;185mulq $h0 # h0*r1186mov %rax,$d2187mov $r0,%rax188mov %rdx,$d3189190mulq $h0 # h0*r0191mov %rax,$h0 # future $h0192mov $r0,%rax193mov %rdx,$d1194195mulq $h1 # h1*r0196add %rax,$d2197mov $s1,%rax198adc %rdx,$d3199200mulq $h1 # h1*s1201mov $h2,$h1 # borrow $h1202add %rax,$h0203adc %rdx,$d1204205imulq $s1,$h1 # h2*s1206add $h1,$d2207mov $d1,$h1208adc \$0,$d3209210imulq $r0,$h2 # h2*r0211add $d2,$h1212mov \$-4,%rax # mask value213adc $h2,$d3214215and $d3,%rax # last reduction step216mov $d3,$h2217shr \$2,$d3218and \$3,$h2219add $d3,%rax220add %rax,$h0221adc \$0,$h1222adc \$0,$h2223___224}225226########################################################################227# Layout of opaque area is following.228#229# unsigned __int64 h[3]; # current hash value base 2^64230# unsigned __int64 r[2]; # key value base 2^64231232$code.=<<___;233.text234___235$code.=<<___ if (!$kernel);236.extern OPENSSL_ia32cap_P237238.globl poly1305_init_x86_64239.hidden poly1305_init_x86_64240.globl poly1305_blocks_x86_64241.hidden poly1305_blocks_x86_64242.globl poly1305_emit_x86_64243.hidden poly1305_emit_x86_64244___245&declare_function("poly1305_init_x86_64", 32, 3);246$code.=<<___;247xor %eax,%eax248mov %rax,0($ctx) # initialize hash value249mov %rax,8($ctx)250mov %rax,16($ctx)251252test $inp,$inp253je .Lno_key254___255$code.=<<___ if (!$kernel);256lea poly1305_blocks_x86_64(%rip),%r10257lea poly1305_emit_x86_64(%rip),%r11258___259$code.=<<___ if (!$kernel && $avx);260mov OPENSSL_ia32cap_P+4(%rip),%r9261lea poly1305_blocks_avx(%rip),%rax262lea poly1305_emit_avx(%rip),%rcx263bt \$`60-32`,%r9 # AVX?264cmovc %rax,%r10265cmovc %rcx,%r11266___267$code.=<<___ if (!$kernel && $avx>1);268lea poly1305_blocks_avx2(%rip),%rax269bt \$`5+32`,%r9 # AVX2?270cmovc %rax,%r10271___272$code.=<<___ if (!$kernel && $avx>3);273mov \$`(1<<31|1<<21|1<<16)`,%rax274shr \$32,%r9275and %rax,%r9276cmp %rax,%r9277je .Linit_base2_44278___279$code.=<<___;280mov \$0x0ffffffc0fffffff,%rax281mov \$0x0ffffffc0ffffffc,%rcx282and 0($inp),%rax283and 8($inp),%rcx284mov %rax,24($ctx)285mov %rcx,32($ctx)286___287$code.=<<___ if (!$kernel && $flavour !~ /elf32/);288mov %r10,0(%rdx)289mov %r11,8(%rdx)290___291$code.=<<___ if (!$kernel && $flavour =~ /elf32/);292mov %r10d,0(%rdx)293mov %r11d,4(%rdx)294___295$code.=<<___;296mov \$1,%eax297.Lno_key:298RET299___300&end_function("poly1305_init_x86_64");301302&declare_function("poly1305_blocks_x86_64", 32, 4);303$code.=<<___;304.cfi_startproc305.Lblocks:306shr \$4,$len307jz .Lno_data # too short308309push %rbx310.cfi_push %rbx311push %r12312.cfi_push %r12313push %r13314.cfi_push %r13315push %r14316.cfi_push %r14317push %r15318.cfi_push %r15319push $ctx320.cfi_push $ctx321.Lblocks_body:322323mov $len,%r15 # reassign $len324325mov 24($ctx),$r0 # load r326mov 32($ctx),$s1327328mov 0($ctx),$h0 # load hash value329mov 8($ctx),$h1330mov 16($ctx),$h2331332mov $s1,$r1333shr \$2,$s1334mov $r1,%rax335add $r1,$s1 # s1 = r1 + (r1 >> 2)336jmp .Loop337338.align 32339.Loop:340add 0($inp),$h0 # accumulate input341adc 8($inp),$h1342lea 16($inp),$inp343adc $padbit,$h2344___345346&poly1305_iteration();347348$code.=<<___;349mov 
$r1,%rax350dec %r15 # len-=16351jnz .Loop352353mov 0(%rsp),$ctx354.cfi_restore $ctx355356mov $h0,0($ctx) # store hash value357mov $h1,8($ctx)358mov $h2,16($ctx)359360mov 8(%rsp),%r15361.cfi_restore %r15362mov 16(%rsp),%r14363.cfi_restore %r14364mov 24(%rsp),%r13365.cfi_restore %r13366mov 32(%rsp),%r12367.cfi_restore %r12368mov 40(%rsp),%rbx369.cfi_restore %rbx370lea 48(%rsp),%rsp371.cfi_adjust_cfa_offset -48372.Lno_data:373.Lblocks_epilogue:374RET375.cfi_endproc376___377&end_function("poly1305_blocks_x86_64");378379&declare_function("poly1305_emit_x86_64", 32, 3);380$code.=<<___;381.Lemit:382mov 0($ctx),%r8 # load hash value383mov 8($ctx),%r9384mov 16($ctx),%r10385386mov %r8,%rax387add \$5,%r8 # compare to modulus388mov %r9,%rcx389adc \$0,%r9390adc \$0,%r10391shr \$2,%r10 # did 130-bit value overflow?392cmovnz %r8,%rax393cmovnz %r9,%rcx394395add 0($nonce),%rax # accumulate nonce396adc 8($nonce),%rcx397mov %rax,0($mac) # write result398mov %rcx,8($mac)399400RET401___402&end_function("poly1305_emit_x86_64");403if ($avx) {404405########################################################################406# Layout of opaque area is following.407#408# unsigned __int32 h[5]; # current hash value base 2^26409# unsigned __int32 is_base2_26;410# unsigned __int64 r[2]; # key value base 2^64411# unsigned __int64 pad;412# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];413#414# where r^n are base 2^26 digits of degrees of multiplier key. There are415# 5 digits, but last four are interleaved with multiples of 5, totalling416# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.417418my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =419map("%xmm$_",(0..15));420421$code.=<<___;422.type __poly1305_block,\@abi-omnipotent423.align 32424__poly1305_block:425push $ctx426___427&poly1305_iteration();428$code.=<<___;429pop $ctx430RET431.size __poly1305_block,.-__poly1305_block432433.type __poly1305_init_avx,\@abi-omnipotent434.align 32435__poly1305_init_avx:436push %rbp437mov %rsp,%rbp438mov $r0,$h0439mov $r1,$h1440xor $h2,$h2441442lea 48+64($ctx),$ctx # size optimization443444mov $r1,%rax445call __poly1305_block # r^2446447mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26448mov \$0x3ffffff,%edx449mov $h0,$d1450and $h0#d,%eax451mov $r0,$d2452and $r0#d,%edx453mov %eax,`16*0+0-64`($ctx)454shr \$26,$d1455mov %edx,`16*0+4-64`($ctx)456shr \$26,$d2457458mov \$0x3ffffff,%eax459mov \$0x3ffffff,%edx460and $d1#d,%eax461and $d2#d,%edx462mov %eax,`16*1+0-64`($ctx)463lea (%rax,%rax,4),%eax # *5464mov %edx,`16*1+4-64`($ctx)465lea (%rdx,%rdx,4),%edx # *5466mov %eax,`16*2+0-64`($ctx)467shr \$26,$d1468mov %edx,`16*2+4-64`($ctx)469shr \$26,$d2470471mov $h1,%rax472mov $r1,%rdx473shl \$12,%rax474shl \$12,%rdx475or $d1,%rax476or $d2,%rdx477and \$0x3ffffff,%eax478and \$0x3ffffff,%edx479mov %eax,`16*3+0-64`($ctx)480lea (%rax,%rax,4),%eax # *5481mov %edx,`16*3+4-64`($ctx)482lea (%rdx,%rdx,4),%edx # *5483mov %eax,`16*4+0-64`($ctx)484mov $h1,$d1485mov %edx,`16*4+4-64`($ctx)486mov $r1,$d2487488mov \$0x3ffffff,%eax489mov \$0x3ffffff,%edx490shr \$14,$d1491shr \$14,$d2492and $d1#d,%eax493and $d2#d,%edx494mov %eax,`16*5+0-64`($ctx)495lea (%rax,%rax,4),%eax # *5496mov %edx,`16*5+4-64`($ctx)497lea (%rdx,%rdx,4),%edx # *5498mov %eax,`16*6+0-64`($ctx)499shr \$26,$d1500mov %edx,`16*6+4-64`($ctx)501shr \$26,$d2502503mov $h2,%rax504shl \$24,%rax505or %rax,$d1506mov $d1#d,`16*7+0-64`($ctx)507lea ($d1,$d1,4),$d1 # *5508mov $d2#d,`16*7+4-64`($ctx)509lea ($d2,$d2,4),$d2 # *5510mov $d1#d,`16*8+0-64`($ctx)511mov 
$d2#d,`16*8+4-64`($ctx)512513mov $r1,%rax514call __poly1305_block # r^3515516mov \$0x3ffffff,%eax # save r^3 base 2^26517mov $h0,$d1518and $h0#d,%eax519shr \$26,$d1520mov %eax,`16*0+12-64`($ctx)521522mov \$0x3ffffff,%edx523and $d1#d,%edx524mov %edx,`16*1+12-64`($ctx)525lea (%rdx,%rdx,4),%edx # *5526shr \$26,$d1527mov %edx,`16*2+12-64`($ctx)528529mov $h1,%rax530shl \$12,%rax531or $d1,%rax532and \$0x3ffffff,%eax533mov %eax,`16*3+12-64`($ctx)534lea (%rax,%rax,4),%eax # *5535mov $h1,$d1536mov %eax,`16*4+12-64`($ctx)537538mov \$0x3ffffff,%edx539shr \$14,$d1540and $d1#d,%edx541mov %edx,`16*5+12-64`($ctx)542lea (%rdx,%rdx,4),%edx # *5543shr \$26,$d1544mov %edx,`16*6+12-64`($ctx)545546mov $h2,%rax547shl \$24,%rax548or %rax,$d1549mov $d1#d,`16*7+12-64`($ctx)550lea ($d1,$d1,4),$d1 # *5551mov $d1#d,`16*8+12-64`($ctx)552553mov $r1,%rax554call __poly1305_block # r^4555556mov \$0x3ffffff,%eax # save r^4 base 2^26557mov $h0,$d1558and $h0#d,%eax559shr \$26,$d1560mov %eax,`16*0+8-64`($ctx)561562mov \$0x3ffffff,%edx563and $d1#d,%edx564mov %edx,`16*1+8-64`($ctx)565lea (%rdx,%rdx,4),%edx # *5566shr \$26,$d1567mov %edx,`16*2+8-64`($ctx)568569mov $h1,%rax570shl \$12,%rax571or $d1,%rax572and \$0x3ffffff,%eax573mov %eax,`16*3+8-64`($ctx)574lea (%rax,%rax,4),%eax # *5575mov $h1,$d1576mov %eax,`16*4+8-64`($ctx)577578mov \$0x3ffffff,%edx579shr \$14,$d1580and $d1#d,%edx581mov %edx,`16*5+8-64`($ctx)582lea (%rdx,%rdx,4),%edx # *5583shr \$26,$d1584mov %edx,`16*6+8-64`($ctx)585586mov $h2,%rax587shl \$24,%rax588or %rax,$d1589mov $d1#d,`16*7+8-64`($ctx)590lea ($d1,$d1,4),$d1 # *5591mov $d1#d,`16*8+8-64`($ctx)592593lea -48-64($ctx),$ctx # size [de-]optimization594pop %rbp595RET596.size __poly1305_init_avx,.-__poly1305_init_avx597___598599&declare_function("poly1305_blocks_avx", 32, 4);600$code.=<<___;601.cfi_startproc602mov 20($ctx),%r8d # is_base2_26603cmp \$128,$len604jae .Lblocks_avx605test %r8d,%r8d606jz .Lblocks607608.Lblocks_avx:609and \$-16,$len610jz .Lno_data_avx611612vzeroupper613614test %r8d,%r8d615jz .Lbase2_64_avx616617test \$31,$len618jz .Leven_avx619620push %rbp621.cfi_push %rbp622mov %rsp,%rbp623push %rbx624.cfi_push %rbx625push %r12626.cfi_push %r12627push %r13628.cfi_push %r13629push %r14630.cfi_push %r14631push %r15632.cfi_push %r15633.Lblocks_avx_body:634635mov $len,%r15 # reassign $len636637mov 0($ctx),$d1 # load hash value638mov 8($ctx),$d2639mov 16($ctx),$h2#d640641mov 24($ctx),$r0 # load r642mov 32($ctx),$s1643644################################# base 2^26 -> base 2^64645mov $d1#d,$h0#d646and \$`-1*(1<<31)`,$d1647mov $d2,$r1 # borrow $r1648mov $d2#d,$h1#d649and \$`-1*(1<<31)`,$d2650651shr \$6,$d1652shl \$52,$r1653add $d1,$h0654shr \$12,$h1655shr \$18,$d2656add $r1,$h0657adc $d2,$h1658659mov $h2,$d1660shl \$40,$d1661shr \$24,$h2662add $d1,$h1663adc \$0,$h2 # can be partially reduced...664665mov \$-4,$d2 # ... 
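#
# (editor's note, not in the original source) The scalar code keeps the
# 130-bit accumulator as h0 + h1*2^64 + h2*2^128.  Because
# 2^130 == 5 (mod 2^130-5), anything carried out at bit 130 and above
# can be folded back in multiplied by 5: the "and with -4, shift right
# by 2, keep the low 2 bits, add" sequence does exactly that, since
# (x & -4) + (x >> 2) = 4*(x>>2) + (x>>2) = 5*(x>>2).  The same
# identity is why s1 = r1 + (r1 >> 2) = 5*r1/4 replaces r1 in the
# partial products that would otherwise land at 2^128 or above: key
# clamping forces the low two bits of r1 to zero, so r1/4 is exact.
#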
so reduce666mov $h2,$d1667and $h2,$d2668shr \$2,$d1669and \$3,$h2670add $d2,$d1 # =*5671add $d1,$h0672adc \$0,$h1673adc \$0,$h2674675mov $s1,$r1676mov $s1,%rax677shr \$2,$s1678add $r1,$s1 # s1 = r1 + (r1 >> 2)679680add 0($inp),$h0 # accumulate input681adc 8($inp),$h1682lea 16($inp),$inp683adc $padbit,$h2684685call __poly1305_block686687test $padbit,$padbit # if $padbit is zero,688jz .Lstore_base2_64_avx # store hash in base 2^64 format689690################################# base 2^64 -> base 2^26691mov $h0,%rax692mov $h0,%rdx693shr \$52,$h0694mov $h1,$r0695mov $h1,$r1696shr \$26,%rdx697and \$0x3ffffff,%rax # h[0]698shl \$12,$r0699and \$0x3ffffff,%rdx # h[1]700shr \$14,$h1701or $r0,$h0702shl \$24,$h2703and \$0x3ffffff,$h0 # h[2]704shr \$40,$r1705and \$0x3ffffff,$h1 # h[3]706or $r1,$h2 # h[4]707708sub \$16,%r15709jz .Lstore_base2_26_avx710711vmovd %rax#d,$H0712vmovd %rdx#d,$H1713vmovd $h0#d,$H2714vmovd $h1#d,$H3715vmovd $h2#d,$H4716jmp .Lproceed_avx717718.align 32719.Lstore_base2_64_avx:720mov $h0,0($ctx)721mov $h1,8($ctx)722mov $h2,16($ctx) # note that is_base2_26 is zeroed723jmp .Ldone_avx724725.align 16726.Lstore_base2_26_avx:727mov %rax#d,0($ctx) # store hash value base 2^26728mov %rdx#d,4($ctx)729mov $h0#d,8($ctx)730mov $h1#d,12($ctx)731mov $h2#d,16($ctx)732.align 16733.Ldone_avx:734pop %r15735.cfi_restore %r15736pop %r14737.cfi_restore %r14738pop %r13739.cfi_restore %r13740pop %r12741.cfi_restore %r12742pop %rbx743.cfi_restore %rbx744pop %rbp745.cfi_restore %rbp746.Lno_data_avx:747.Lblocks_avx_epilogue:748RET749.cfi_endproc750751.align 32752.Lbase2_64_avx:753.cfi_startproc754push %rbp755.cfi_push %rbp756mov %rsp,%rbp757push %rbx758.cfi_push %rbx759push %r12760.cfi_push %r12761push %r13762.cfi_push %r13763push %r14764.cfi_push %r14765push %r15766.cfi_push %r15767.Lbase2_64_avx_body:768769mov $len,%r15 # reassign $len770771mov 24($ctx),$r0 # load r772mov 32($ctx),$s1773774mov 0($ctx),$h0 # load hash value775mov 8($ctx),$h1776mov 16($ctx),$h2#d777778mov $s1,$r1779mov $s1,%rax780shr \$2,$s1781add $r1,$s1 # s1 = r1 + (r1 >> 2)782783test \$31,$len784jz .Linit_avx785786add 0($inp),$h0 # accumulate input787adc 8($inp),$h1788lea 16($inp),$inp789adc $padbit,$h2790sub \$16,%r15791792call __poly1305_block793794.Linit_avx:795################################# base 2^64 -> base 2^26796mov $h0,%rax797mov $h0,%rdx798shr \$52,$h0799mov $h1,$d1800mov $h1,$d2801shr \$26,%rdx802and \$0x3ffffff,%rax # h[0]803shl \$12,$d1804and \$0x3ffffff,%rdx # h[1]805shr \$14,$h1806or $d1,$h0807shl \$24,$h2808and \$0x3ffffff,$h0 # h[2]809shr \$40,$d2810and \$0x3ffffff,$h1 # h[3]811or $d2,$h2 # h[4]812813vmovd %rax#d,$H0814vmovd %rdx#d,$H1815vmovd $h0#d,$H2816vmovd $h1#d,$H3817vmovd $h2#d,$H4818movl \$1,20($ctx) # set is_base2_26819820call __poly1305_init_avx821822.Lproceed_avx:823mov %r15,$len824pop %r15825.cfi_restore %r15826pop %r14827.cfi_restore %r14828pop %r13829.cfi_restore %r13830pop %r12831.cfi_restore %r12832pop %rbx833.cfi_restore %rbx834pop %rbp835.cfi_restore %rbp836.Lbase2_64_avx_epilogue:837jmp .Ldo_avx838.cfi_endproc839840.align 32841.Leven_avx:842.cfi_startproc843vmovd 4*0($ctx),$H0 # load hash value844vmovd 4*1($ctx),$H1845vmovd 4*2($ctx),$H2846vmovd 4*3($ctx),$H3847vmovd 4*4($ctx),$H4848849.Ldo_avx:850___851$code.=<<___ if (!$win64);852lea 8(%rsp),%r10853.cfi_def_cfa_register %r10854and \$-32,%rsp855sub \$-8,%rsp856lea -0x58(%rsp),%r11857sub \$0x178,%rsp858___859$code.=<<___ if ($win64);860lea -0xf8(%rsp),%r11861sub \$0x218,%rsp862vmovdqa %xmm6,0x50(%r11)863vmovdqa %xmm7,0x60(%r11)864vmovdqa 
%xmm8,0x70(%r11)865vmovdqa %xmm9,0x80(%r11)866vmovdqa %xmm10,0x90(%r11)867vmovdqa %xmm11,0xa0(%r11)868vmovdqa %xmm12,0xb0(%r11)869vmovdqa %xmm13,0xc0(%r11)870vmovdqa %xmm14,0xd0(%r11)871vmovdqa %xmm15,0xe0(%r11)872.Ldo_avx_body:873___874$code.=<<___;875sub \$64,$len876lea -32($inp),%rax877cmovc %rax,$inp878879vmovdqu `16*3`($ctx),$D4 # preload r0^2880lea `16*3+64`($ctx),$ctx # size optimization881lea .Lconst(%rip),%rcx882883################################################################884# load input885vmovdqu 16*2($inp),$T0886vmovdqu 16*3($inp),$T1887vmovdqa 64(%rcx),$MASK # .Lmask26888889vpsrldq \$6,$T0,$T2 # splat input890vpsrldq \$6,$T1,$T3891vpunpckhqdq $T1,$T0,$T4 # 4892vpunpcklqdq $T1,$T0,$T0 # 0:1893vpunpcklqdq $T3,$T2,$T3 # 2:3894895vpsrlq \$40,$T4,$T4 # 4896vpsrlq \$26,$T0,$T1897vpand $MASK,$T0,$T0 # 0898vpsrlq \$4,$T3,$T2899vpand $MASK,$T1,$T1 # 1900vpsrlq \$30,$T3,$T3901vpand $MASK,$T2,$T2 # 2902vpand $MASK,$T3,$T3 # 3903vpor 32(%rcx),$T4,$T4 # padbit, yes, always904905jbe .Lskip_loop_avx906907# expand and copy pre-calculated table to stack908vmovdqu `16*1-64`($ctx),$D1909vmovdqu `16*2-64`($ctx),$D2910vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434911vpshufd \$0x44,$D4,$D0 # xx12 -> 1212912vmovdqa $D3,-0x90(%r11)913vmovdqa $D0,0x00(%rsp)914vpshufd \$0xEE,$D1,$D4915vmovdqu `16*3-64`($ctx),$D0916vpshufd \$0x44,$D1,$D1917vmovdqa $D4,-0x80(%r11)918vmovdqa $D1,0x10(%rsp)919vpshufd \$0xEE,$D2,$D3920vmovdqu `16*4-64`($ctx),$D1921vpshufd \$0x44,$D2,$D2922vmovdqa $D3,-0x70(%r11)923vmovdqa $D2,0x20(%rsp)924vpshufd \$0xEE,$D0,$D4925vmovdqu `16*5-64`($ctx),$D2926vpshufd \$0x44,$D0,$D0927vmovdqa $D4,-0x60(%r11)928vmovdqa $D0,0x30(%rsp)929vpshufd \$0xEE,$D1,$D3930vmovdqu `16*6-64`($ctx),$D0931vpshufd \$0x44,$D1,$D1932vmovdqa $D3,-0x50(%r11)933vmovdqa $D1,0x40(%rsp)934vpshufd \$0xEE,$D2,$D4935vmovdqu `16*7-64`($ctx),$D1936vpshufd \$0x44,$D2,$D2937vmovdqa $D4,-0x40(%r11)938vmovdqa $D2,0x50(%rsp)939vpshufd \$0xEE,$D0,$D3940vmovdqu `16*8-64`($ctx),$D2941vpshufd \$0x44,$D0,$D0942vmovdqa $D3,-0x30(%r11)943vmovdqa $D0,0x60(%rsp)944vpshufd \$0xEE,$D1,$D4945vpshufd \$0x44,$D1,$D1946vmovdqa $D4,-0x20(%r11)947vmovdqa $D1,0x70(%rsp)948vpshufd \$0xEE,$D2,$D3949vmovdqa 0x00(%rsp),$D4 # preload r0^2950vpshufd \$0x44,$D2,$D2951vmovdqa $D3,-0x10(%r11)952vmovdqa $D2,0x80(%rsp)953954jmp .Loop_avx955956.align 32957.Loop_avx:958################################################################959# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2960# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r961# \___________________/962# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2963# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r964# \___________________/ \____________________/965#966# Note that we start with inp[2:3]*r^2. 
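#
# (editor's note, not in the original source) All vector paths keep h
# and the powers of r in radix 2^26: five 26-bit digits, one digit per
# register with one copy per 64-bit lane, so every vpmuludq
# (32x32->64-bit lane multiply) and the d0..d4 sums built from them
# stay comfortably below 2^64.  Digits that would land at 2^130 and
# above wrap around multiplied by 5 (2^130 == 5 mod 2^130-5), which is
# why the precomputed table also stores 5*r1..5*r4, the "s" values
# used below.
#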
This is because it967# doesn't depend on reduction in previous iteration.968################################################################969# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4970# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4971# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4972# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4973# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4974#975# though note that $Tx and $Hx are "reversed" in this section,976# and $D4 is preloaded with r0^2...977978vpmuludq $T0,$D4,$D0 # d0 = h0*r0979vpmuludq $T1,$D4,$D1 # d1 = h1*r0980vmovdqa $H2,0x20(%r11) # offload hash981vpmuludq $T2,$D4,$D2 # d3 = h2*r0982vmovdqa 0x10(%rsp),$H2 # r1^2983vpmuludq $T3,$D4,$D3 # d3 = h3*r0984vpmuludq $T4,$D4,$D4 # d4 = h4*r0985986vmovdqa $H0,0x00(%r11) #987vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1988vmovdqa $H1,0x10(%r11) #989vpmuludq $T3,$H2,$H1 # h3*r1990vpaddq $H0,$D0,$D0 # d0 += h4*s1991vpaddq $H1,$D4,$D4 # d4 += h3*r1992vmovdqa $H3,0x30(%r11) #993vpmuludq $T2,$H2,$H0 # h2*r1994vpmuludq $T1,$H2,$H1 # h1*r1995vpaddq $H0,$D3,$D3 # d3 += h2*r1996vmovdqa 0x30(%rsp),$H3 # r2^2997vpaddq $H1,$D2,$D2 # d2 += h1*r1998vmovdqa $H4,0x40(%r11) #999vpmuludq $T0,$H2,$H2 # h0*r11000vpmuludq $T2,$H3,$H0 # h2*r21001vpaddq $H2,$D1,$D1 # d1 += h0*r110021003vmovdqa 0x40(%rsp),$H4 # s2^21004vpaddq $H0,$D4,$D4 # d4 += h2*r21005vpmuludq $T1,$H3,$H1 # h1*r21006vpmuludq $T0,$H3,$H3 # h0*r21007vpaddq $H1,$D3,$D3 # d3 += h1*r21008vmovdqa 0x50(%rsp),$H2 # r3^21009vpaddq $H3,$D2,$D2 # d2 += h0*r21010vpmuludq $T4,$H4,$H0 # h4*s21011vpmuludq $T3,$H4,$H4 # h3*s21012vpaddq $H0,$D1,$D1 # d1 += h4*s21013vmovdqa 0x60(%rsp),$H3 # s3^21014vpaddq $H4,$D0,$D0 # d0 += h3*s210151016vmovdqa 0x80(%rsp),$H4 # s4^21017vpmuludq $T1,$H2,$H1 # h1*r31018vpmuludq $T0,$H2,$H2 # h0*r31019vpaddq $H1,$D4,$D4 # d4 += h1*r31020vpaddq $H2,$D3,$D3 # d3 += h0*r31021vpmuludq $T4,$H3,$H0 # h4*s31022vpmuludq $T3,$H3,$H1 # h3*s31023vpaddq $H0,$D2,$D2 # d2 += h4*s31024vmovdqu 16*0($inp),$H0 # load input1025vpaddq $H1,$D1,$D1 # d1 += h3*s31026vpmuludq $T2,$H3,$H3 # h2*s31027vpmuludq $T2,$H4,$T2 # h2*s41028vpaddq $H3,$D0,$D0 # d0 += h2*s310291030vmovdqu 16*1($inp),$H1 #1031vpaddq $T2,$D1,$D1 # d1 += h2*s41032vpmuludq $T3,$H4,$T3 # h3*s41033vpmuludq $T4,$H4,$T4 # h4*s41034vpsrldq \$6,$H0,$H2 # splat input1035vpaddq $T3,$D2,$D2 # d2 += h3*s41036vpaddq $T4,$D3,$D3 # d3 += h4*s41037vpsrldq \$6,$H1,$H3 #1038vpmuludq 0x70(%rsp),$T0,$T4 # h0*r41039vpmuludq $T1,$H4,$T0 # h1*s41040vpunpckhqdq $H1,$H0,$H4 # 41041vpaddq $T4,$D4,$D4 # d4 += h0*r41042vmovdqa -0x90(%r11),$T4 # r0^41043vpaddq $T0,$D0,$D0 # d0 += h1*s410441045vpunpcklqdq $H1,$H0,$H0 # 0:11046vpunpcklqdq $H3,$H2,$H3 # 2:310471048#vpsrlq \$40,$H4,$H4 # 41049vpsrldq \$`40/8`,$H4,$H4 # 41050vpsrlq \$26,$H0,$H11051vpand $MASK,$H0,$H0 # 01052vpsrlq \$4,$H3,$H21053vpand $MASK,$H1,$H1 # 11054vpand 0(%rcx),$H4,$H4 # .Lmask241055vpsrlq \$30,$H3,$H31056vpand $MASK,$H2,$H2 # 21057vpand $MASK,$H3,$H3 # 31058vpor 32(%rcx),$H4,$H4 # padbit, yes, always10591060vpaddq 0x00(%r11),$H0,$H0 # add hash value1061vpaddq 0x10(%r11),$H1,$H11062vpaddq 0x20(%r11),$H2,$H21063vpaddq 0x30(%r11),$H3,$H31064vpaddq 0x40(%r11),$H4,$H410651066lea 16*2($inp),%rax1067lea 16*4($inp),$inp1068sub \$64,$len1069cmovc %rax,$inp10701071################################################################1072# Now we accumulate (inp[0:1]+hash)*r^41073################################################################1074# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r41075# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r41076# d2 = h2*r0 + h1*r1 
+ h0*r2 + h4*5*r3 + h3*5*r41077# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r41078# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r410791080vpmuludq $H0,$T4,$T0 # h0*r01081vpmuludq $H1,$T4,$T1 # h1*r01082vpaddq $T0,$D0,$D01083vpaddq $T1,$D1,$D11084vmovdqa -0x80(%r11),$T2 # r1^41085vpmuludq $H2,$T4,$T0 # h2*r01086vpmuludq $H3,$T4,$T1 # h3*r01087vpaddq $T0,$D2,$D21088vpaddq $T1,$D3,$D31089vpmuludq $H4,$T4,$T4 # h4*r01090vpmuludq -0x70(%r11),$H4,$T0 # h4*s11091vpaddq $T4,$D4,$D410921093vpaddq $T0,$D0,$D0 # d0 += h4*s11094vpmuludq $H2,$T2,$T1 # h2*r11095vpmuludq $H3,$T2,$T0 # h3*r11096vpaddq $T1,$D3,$D3 # d3 += h2*r11097vmovdqa -0x60(%r11),$T3 # r2^41098vpaddq $T0,$D4,$D4 # d4 += h3*r11099vpmuludq $H1,$T2,$T1 # h1*r11100vpmuludq $H0,$T2,$T2 # h0*r11101vpaddq $T1,$D2,$D2 # d2 += h1*r11102vpaddq $T2,$D1,$D1 # d1 += h0*r111031104vmovdqa -0x50(%r11),$T4 # s2^41105vpmuludq $H2,$T3,$T0 # h2*r21106vpmuludq $H1,$T3,$T1 # h1*r21107vpaddq $T0,$D4,$D4 # d4 += h2*r21108vpaddq $T1,$D3,$D3 # d3 += h1*r21109vmovdqa -0x40(%r11),$T2 # r3^41110vpmuludq $H0,$T3,$T3 # h0*r21111vpmuludq $H4,$T4,$T0 # h4*s21112vpaddq $T3,$D2,$D2 # d2 += h0*r21113vpaddq $T0,$D1,$D1 # d1 += h4*s21114vmovdqa -0x30(%r11),$T3 # s3^41115vpmuludq $H3,$T4,$T4 # h3*s21116vpmuludq $H1,$T2,$T1 # h1*r31117vpaddq $T4,$D0,$D0 # d0 += h3*s211181119vmovdqa -0x10(%r11),$T4 # s4^41120vpaddq $T1,$D4,$D4 # d4 += h1*r31121vpmuludq $H0,$T2,$T2 # h0*r31122vpmuludq $H4,$T3,$T0 # h4*s31123vpaddq $T2,$D3,$D3 # d3 += h0*r31124vpaddq $T0,$D2,$D2 # d2 += h4*s31125vmovdqu 16*2($inp),$T0 # load input1126vpmuludq $H3,$T3,$T2 # h3*s31127vpmuludq $H2,$T3,$T3 # h2*s31128vpaddq $T2,$D1,$D1 # d1 += h3*s31129vmovdqu 16*3($inp),$T1 #1130vpaddq $T3,$D0,$D0 # d0 += h2*s311311132vpmuludq $H2,$T4,$H2 # h2*s41133vpmuludq $H3,$T4,$H3 # h3*s41134vpsrldq \$6,$T0,$T2 # splat input1135vpaddq $H2,$D1,$D1 # d1 += h2*s41136vpmuludq $H4,$T4,$H4 # h4*s41137vpsrldq \$6,$T1,$T3 #1138vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s41139vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s41140vpmuludq -0x20(%r11),$H0,$H4 # h0*r41141vpmuludq $H1,$T4,$H01142vpunpckhqdq $T1,$T0,$T4 # 41143vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r41144vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s411451146vpunpcklqdq $T1,$T0,$T0 # 0:11147vpunpcklqdq $T3,$T2,$T3 # 2:311481149#vpsrlq \$40,$T4,$T4 # 41150vpsrldq \$`40/8`,$T4,$T4 # 41151vpsrlq \$26,$T0,$T11152vmovdqa 0x00(%rsp),$D4 # preload r0^21153vpand $MASK,$T0,$T0 # 01154vpsrlq \$4,$T3,$T21155vpand $MASK,$T1,$T1 # 11156vpand 0(%rcx),$T4,$T4 # .Lmask241157vpsrlq \$30,$T3,$T31158vpand $MASK,$T2,$T2 # 21159vpand $MASK,$T3,$T3 # 31160vpor 32(%rcx),$T4,$T4 # padbit, yes, always11611162################################################################1163# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein1164# and P. 
Schwabe11651166vpsrlq \$26,$H3,$D31167vpand $MASK,$H3,$H31168vpaddq $D3,$H4,$H4 # h3 -> h411691170vpsrlq \$26,$H0,$D01171vpand $MASK,$H0,$H01172vpaddq $D0,$D1,$H1 # h0 -> h111731174vpsrlq \$26,$H4,$D01175vpand $MASK,$H4,$H411761177vpsrlq \$26,$H1,$D11178vpand $MASK,$H1,$H11179vpaddq $D1,$H2,$H2 # h1 -> h211801181vpaddq $D0,$H0,$H01182vpsllq \$2,$D0,$D01183vpaddq $D0,$H0,$H0 # h4 -> h011841185vpsrlq \$26,$H2,$D21186vpand $MASK,$H2,$H21187vpaddq $D2,$H3,$H3 # h2 -> h311881189vpsrlq \$26,$H0,$D01190vpand $MASK,$H0,$H01191vpaddq $D0,$H1,$H1 # h0 -> h111921193vpsrlq \$26,$H3,$D31194vpand $MASK,$H3,$H31195vpaddq $D3,$H4,$H4 # h3 -> h411961197ja .Loop_avx11981199.Lskip_loop_avx:1200################################################################1201# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^112021203vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x21204add \$32,$len1205jnz .Long_tail_avx12061207vpaddq $H2,$T2,$T21208vpaddq $H0,$T0,$T01209vpaddq $H1,$T1,$T11210vpaddq $H3,$T3,$T31211vpaddq $H4,$T4,$T412121213.Long_tail_avx:1214vmovdqa $H2,0x20(%r11)1215vmovdqa $H0,0x00(%r11)1216vmovdqa $H1,0x10(%r11)1217vmovdqa $H3,0x30(%r11)1218vmovdqa $H4,0x40(%r11)12191220# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r41221# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r41222# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r41223# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r41224# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r412251226vpmuludq $T2,$D4,$D2 # d2 = h2*r01227vpmuludq $T0,$D4,$D0 # d0 = h0*r01228vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n1229vpmuludq $T1,$D4,$D1 # d1 = h1*r01230vpmuludq $T3,$D4,$D3 # d3 = h3*r01231vpmuludq $T4,$D4,$D4 # d4 = h4*r012321233vpmuludq $T3,$H2,$H0 # h3*r11234vpaddq $H0,$D4,$D4 # d4 += h3*r11235vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n1236vpmuludq $T2,$H2,$H1 # h2*r11237vpaddq $H1,$D3,$D3 # d3 += h2*r11238vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n1239vpmuludq $T1,$H2,$H0 # h1*r11240vpaddq $H0,$D2,$D2 # d2 += h1*r11241vpmuludq $T0,$H2,$H2 # h0*r11242vpaddq $H2,$D1,$D1 # d1 += h0*r11243vpmuludq $T4,$H3,$H3 # h4*s11244vpaddq $H3,$D0,$D0 # d0 += h4*s112451246vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n1247vpmuludq $T2,$H4,$H1 # h2*r21248vpaddq $H1,$D4,$D4 # d4 += h2*r21249vpmuludq $T1,$H4,$H0 # h1*r21250vpaddq $H0,$D3,$D3 # d3 += h1*r21251vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n1252vpmuludq $T0,$H4,$H4 # h0*r21253vpaddq $H4,$D2,$D2 # d2 += h0*r21254vpmuludq $T4,$H2,$H1 # h4*s21255vpaddq $H1,$D1,$D1 # d1 += h4*s21256vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n1257vpmuludq $T3,$H2,$H2 # h3*s21258vpaddq $H2,$D0,$D0 # d0 += h3*s212591260vpmuludq $T1,$H3,$H0 # h1*r31261vpaddq $H0,$D4,$D4 # d4 += h1*r31262vpmuludq $T0,$H3,$H3 # h0*r31263vpaddq $H3,$D3,$D3 # d3 += h0*r31264vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n1265vpmuludq $T4,$H4,$H1 # h4*s31266vpaddq $H1,$D2,$D2 # d2 += h4*s31267vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n1268vpmuludq $T3,$H4,$H0 # h3*s31269vpaddq $H0,$D1,$D1 # d1 += h3*s31270vpmuludq $T2,$H4,$H4 # h2*s31271vpaddq $H4,$D0,$D0 # d0 += h2*s312721273vpmuludq $T0,$H2,$H2 # h0*r41274vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r41275vpmuludq $T4,$H3,$H1 # h4*s41276vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s41277vpmuludq $T3,$H3,$H0 # h3*s41278vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s41279vpmuludq $T2,$H3,$H1 # h2*s41280vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s41281vpmuludq $T1,$H3,$H3 # h1*s41282vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s412831284jz .Lshort_tail_avx12851286vmovdqu 16*0($inp),$H0 # load input1287vmovdqu 16*1($inp),$H112881289vpsrldq \$6,$H0,$H2 # splat input1290vpsrldq \$6,$H1,$H31291vpunpckhqdq $H1,$H0,$H4 
# 41292vpunpcklqdq $H1,$H0,$H0 # 0:11293vpunpcklqdq $H3,$H2,$H3 # 2:312941295vpsrlq \$40,$H4,$H4 # 41296vpsrlq \$26,$H0,$H11297vpand $MASK,$H0,$H0 # 01298vpsrlq \$4,$H3,$H21299vpand $MASK,$H1,$H1 # 11300vpsrlq \$30,$H3,$H31301vpand $MASK,$H2,$H2 # 21302vpand $MASK,$H3,$H3 # 31303vpor 32(%rcx),$H4,$H4 # padbit, yes, always13041305vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x41306vpaddq 0x00(%r11),$H0,$H01307vpaddq 0x10(%r11),$H1,$H11308vpaddq 0x20(%r11),$H2,$H21309vpaddq 0x30(%r11),$H3,$H31310vpaddq 0x40(%r11),$H4,$H413111312################################################################1313# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate13141315vpmuludq $H0,$T4,$T0 # h0*r01316vpaddq $T0,$D0,$D0 # d0 += h0*r01317vpmuludq $H1,$T4,$T1 # h1*r01318vpaddq $T1,$D1,$D1 # d1 += h1*r01319vpmuludq $H2,$T4,$T0 # h2*r01320vpaddq $T0,$D2,$D2 # d2 += h2*r01321vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n1322vpmuludq $H3,$T4,$T1 # h3*r01323vpaddq $T1,$D3,$D3 # d3 += h3*r01324vpmuludq $H4,$T4,$T4 # h4*r01325vpaddq $T4,$D4,$D4 # d4 += h4*r013261327vpmuludq $H3,$T2,$T0 # h3*r11328vpaddq $T0,$D4,$D4 # d4 += h3*r11329vpshufd \$0x32,`16*2-64`($ctx),$T3 # s11330vpmuludq $H2,$T2,$T1 # h2*r11331vpaddq $T1,$D3,$D3 # d3 += h2*r11332vpshufd \$0x32,`16*3-64`($ctx),$T4 # r21333vpmuludq $H1,$T2,$T0 # h1*r11334vpaddq $T0,$D2,$D2 # d2 += h1*r11335vpmuludq $H0,$T2,$T2 # h0*r11336vpaddq $T2,$D1,$D1 # d1 += h0*r11337vpmuludq $H4,$T3,$T3 # h4*s11338vpaddq $T3,$D0,$D0 # d0 += h4*s113391340vpshufd \$0x32,`16*4-64`($ctx),$T2 # s21341vpmuludq $H2,$T4,$T1 # h2*r21342vpaddq $T1,$D4,$D4 # d4 += h2*r21343vpmuludq $H1,$T4,$T0 # h1*r21344vpaddq $T0,$D3,$D3 # d3 += h1*r21345vpshufd \$0x32,`16*5-64`($ctx),$T3 # r31346vpmuludq $H0,$T4,$T4 # h0*r21347vpaddq $T4,$D2,$D2 # d2 += h0*r21348vpmuludq $H4,$T2,$T1 # h4*s21349vpaddq $T1,$D1,$D1 # d1 += h4*s21350vpshufd \$0x32,`16*6-64`($ctx),$T4 # s31351vpmuludq $H3,$T2,$T2 # h3*s21352vpaddq $T2,$D0,$D0 # d0 += h3*s213531354vpmuludq $H1,$T3,$T0 # h1*r31355vpaddq $T0,$D4,$D4 # d4 += h1*r31356vpmuludq $H0,$T3,$T3 # h0*r31357vpaddq $T3,$D3,$D3 # d3 += h0*r31358vpshufd \$0x32,`16*7-64`($ctx),$T2 # r41359vpmuludq $H4,$T4,$T1 # h4*s31360vpaddq $T1,$D2,$D2 # d2 += h4*s31361vpshufd \$0x32,`16*8-64`($ctx),$T3 # s41362vpmuludq $H3,$T4,$T0 # h3*s31363vpaddq $T0,$D1,$D1 # d1 += h3*s31364vpmuludq $H2,$T4,$T4 # h2*s31365vpaddq $T4,$D0,$D0 # d0 += h2*s313661367vpmuludq $H0,$T2,$T2 # h0*r41368vpaddq $T2,$D4,$D4 # d4 += h0*r41369vpmuludq $H4,$T3,$T1 # h4*s41370vpaddq $T1,$D3,$D3 # d3 += h4*s41371vpmuludq $H3,$T3,$T0 # h3*s41372vpaddq $T0,$D2,$D2 # d2 += h3*s41373vpmuludq $H2,$T3,$T1 # h2*s41374vpaddq $T1,$D1,$D1 # d1 += h2*s41375vpmuludq $H1,$T3,$T3 # h1*s41376vpaddq $T3,$D0,$D0 # d0 += h1*s413771378.Lshort_tail_avx:1379################################################################1380# horizontal addition13811382vpsrldq \$8,$D4,$T41383vpsrldq \$8,$D3,$T31384vpsrldq \$8,$D1,$T11385vpsrldq \$8,$D0,$T01386vpsrldq \$8,$D2,$T21387vpaddq $T3,$D3,$D31388vpaddq $T4,$D4,$D41389vpaddq $T0,$D0,$D01390vpaddq $T1,$D1,$D11391vpaddq $T2,$D2,$D213921393################################################################1394# lazy reduction13951396vpsrlq \$26,$D3,$H31397vpand $MASK,$D3,$D31398vpaddq $H3,$D4,$D4 # h3 -> h413991400vpsrlq \$26,$D0,$H01401vpand $MASK,$D0,$D01402vpaddq $H0,$D1,$D1 # h0 -> h114031404vpsrlq \$26,$D4,$H41405vpand $MASK,$D4,$D414061407vpsrlq \$26,$D1,$H11408vpand $MASK,$D1,$D11409vpaddq $H1,$D2,$D2 # h1 -> h214101411vpaddq $H4,$D0,$D01412vpsllq \$2,$H4,$H41413vpaddq $H4,$D0,$D0 # h4 -> 
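#
# (editor's note, not in the original source) This is the "lazy"
# reduction: each digit is only trimmed back to 26 bits and its carry
# added into the next digit rather than being reduced completely.  The
# carry out of the top digit re-enters at the bottom multiplied by 5,
# done above as "add, then shift left by 2 and add again"
# (x + 4*x = 5*x), once more because 2^130 == 5 (mod 2^130-5).
#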
h014141415vpsrlq \$26,$D2,$H21416vpand $MASK,$D2,$D21417vpaddq $H2,$D3,$D3 # h2 -> h314181419vpsrlq \$26,$D0,$H01420vpand $MASK,$D0,$D01421vpaddq $H0,$D1,$D1 # h0 -> h114221423vpsrlq \$26,$D3,$H31424vpand $MASK,$D3,$D31425vpaddq $H3,$D4,$D4 # h3 -> h414261427vmovd $D0,`4*0-48-64`($ctx) # save partially reduced1428vmovd $D1,`4*1-48-64`($ctx)1429vmovd $D2,`4*2-48-64`($ctx)1430vmovd $D3,`4*3-48-64`($ctx)1431vmovd $D4,`4*4-48-64`($ctx)1432___1433$code.=<<___ if ($win64);1434vmovdqa 0x50(%r11),%xmm61435vmovdqa 0x60(%r11),%xmm71436vmovdqa 0x70(%r11),%xmm81437vmovdqa 0x80(%r11),%xmm91438vmovdqa 0x90(%r11),%xmm101439vmovdqa 0xa0(%r11),%xmm111440vmovdqa 0xb0(%r11),%xmm121441vmovdqa 0xc0(%r11),%xmm131442vmovdqa 0xd0(%r11),%xmm141443vmovdqa 0xe0(%r11),%xmm151444lea 0xf8(%r11),%rsp1445.Ldo_avx_epilogue:1446___1447$code.=<<___ if (!$win64);1448lea -8(%r10),%rsp1449.cfi_def_cfa_register %rsp1450___1451$code.=<<___;1452vzeroupper1453RET1454.cfi_endproc1455___1456&end_function("poly1305_blocks_avx");14571458&declare_function("poly1305_emit_avx", 32, 3);1459$code.=<<___;1460cmpl \$0,20($ctx) # is_base2_26?1461je .Lemit14621463mov 0($ctx),%eax # load hash value base 2^261464mov 4($ctx),%ecx1465mov 8($ctx),%r8d1466mov 12($ctx),%r11d1467mov 16($ctx),%r10d14681469shl \$26,%rcx # base 2^26 -> base 2^641470mov %r8,%r91471shl \$52,%r81472add %rcx,%rax1473shr \$12,%r91474add %rax,%r8 # h01475adc \$0,%r914761477shl \$14,%r111478mov %r10,%rax1479shr \$24,%r101480add %r11,%r91481shl \$40,%rax1482add %rax,%r9 # h11483adc \$0,%r10 # h214841485mov %r10,%rax # could be partially reduced, so reduce1486mov %r10,%rcx1487and \$3,%r101488shr \$2,%rax1489and \$-4,%rcx1490add %rcx,%rax1491add %rax,%r81492adc \$0,%r91493adc \$0,%r1014941495mov %r8,%rax1496add \$5,%r8 # compare to modulus1497mov %r9,%rcx1498adc \$0,%r91499adc \$0,%r101500shr \$2,%r10 # did 130-bit value overflow?1501cmovnz %r8,%rax1502cmovnz %r9,%rcx15031504add 0($nonce),%rax # accumulate nonce1505adc 8($nonce),%rcx1506mov %rax,0($mac) # write result1507mov %rcx,8($mac)15081509RET1510___1511&end_function("poly1305_emit_avx");15121513if ($avx>1) {15141515my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =1516map("%ymm$_",(0..15));1517my $S4=$MASK;15181519sub poly1305_blocks_avxN {1520my ($avx512) = @_;1521my $suffix = $avx512 ? "_avx512" : "";1522$code.=<<___;1523.cfi_startproc1524mov 20($ctx),%r8d # is_base2_261525cmp \$128,$len1526jae .Lblocks_avx2$suffix1527test %r8d,%r8d1528jz .Lblocks15291530.Lblocks_avx2$suffix:1531and \$-16,$len1532jz .Lno_data_avx2$suffix15331534vzeroupper15351536test %r8d,%r8d1537jz .Lbase2_64_avx2$suffix15381539test \$63,$len1540jz .Leven_avx2$suffix15411542push %rbp1543.cfi_push %rbp1544mov %rsp,%rbp1545push %rbx1546.cfi_push %rbx1547push %r121548.cfi_push %r121549push %r131550.cfi_push %r131551push %r141552.cfi_push %r141553push %r151554.cfi_push %r151555.Lblocks_avx2_body$suffix:15561557mov $len,%r15 # reassign $len15581559mov 0($ctx),$d1 # load hash value1560mov 8($ctx),$d21561mov 16($ctx),$h2#d15621563mov 24($ctx),$r0 # load r1564mov 32($ctx),$s115651566################################# base 2^26 -> base 2^641567mov $d1#d,$h0#d1568and \$`-1*(1<<31)`,$d11569mov $d2,$r1 # borrow $r11570mov $d2#d,$h1#d1571and \$`-1*(1<<31)`,$d215721573shr \$6,$d11574shl \$52,$r11575add $d1,$h01576shr \$12,$h11577shr \$18,$d21578add $r1,$h01579adc $d2,$h115801581mov $h2,$d11582shl \$40,$d11583shr \$24,$h21584add $d1,$h11585adc \$0,$h2 # can be partially reduced...15861587mov \$-4,$d2 # ... 
so reduce1588mov $h2,$d11589and $h2,$d21590shr \$2,$d11591and \$3,$h21592add $d2,$d1 # =*51593add $d1,$h01594adc \$0,$h11595adc \$0,$h215961597mov $s1,$r11598mov $s1,%rax1599shr \$2,$s11600add $r1,$s1 # s1 = r1 + (r1 >> 2)16011602.Lbase2_26_pre_avx2$suffix:1603add 0($inp),$h0 # accumulate input1604adc 8($inp),$h11605lea 16($inp),$inp1606adc $padbit,$h21607sub \$16,%r1516081609call __poly1305_block1610mov $r1,%rax16111612test \$63,%r151613jnz .Lbase2_26_pre_avx2$suffix16141615test $padbit,$padbit # if $padbit is zero,1616jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format16171618################################# base 2^64 -> base 2^261619mov $h0,%rax1620mov $h0,%rdx1621shr \$52,$h01622mov $h1,$r01623mov $h1,$r11624shr \$26,%rdx1625and \$0x3ffffff,%rax # h[0]1626shl \$12,$r01627and \$0x3ffffff,%rdx # h[1]1628shr \$14,$h11629or $r0,$h01630shl \$24,$h21631and \$0x3ffffff,$h0 # h[2]1632shr \$40,$r11633and \$0x3ffffff,$h1 # h[3]1634or $r1,$h2 # h[4]16351636test %r15,%r151637jz .Lstore_base2_26_avx2$suffix16381639vmovd %rax#d,%x#$H01640vmovd %rdx#d,%x#$H11641vmovd $h0#d,%x#$H21642vmovd $h1#d,%x#$H31643vmovd $h2#d,%x#$H41644jmp .Lproceed_avx2$suffix16451646.align 321647.Lstore_base2_64_avx2$suffix:1648mov $h0,0($ctx)1649mov $h1,8($ctx)1650mov $h2,16($ctx) # note that is_base2_26 is zeroed1651jmp .Ldone_avx2$suffix16521653.align 161654.Lstore_base2_26_avx2$suffix:1655mov %rax#d,0($ctx) # store hash value base 2^261656mov %rdx#d,4($ctx)1657mov $h0#d,8($ctx)1658mov $h1#d,12($ctx)1659mov $h2#d,16($ctx)1660.align 161661.Ldone_avx2$suffix:1662pop %r151663.cfi_restore %r151664pop %r141665.cfi_restore %r141666pop %r131667.cfi_restore %r131668pop %r121669.cfi_restore %r121670pop %rbx1671.cfi_restore %rbx1672pop %rbp1673.cfi_restore %rbp1674.Lno_data_avx2$suffix:1675.Lblocks_avx2_epilogue$suffix:1676RET1677.cfi_endproc16781679.align 321680.Lbase2_64_avx2$suffix:1681.cfi_startproc1682push %rbp1683.cfi_push %rbp1684mov %rsp,%rbp1685push %rbx1686.cfi_push %rbx1687push %r121688.cfi_push %r121689push %r131690.cfi_push %r131691push %r141692.cfi_push %r141693push %r151694.cfi_push %r151695.Lbase2_64_avx2_body$suffix:16961697mov $len,%r15 # reassign $len16981699mov 24($ctx),$r0 # load r1700mov 32($ctx),$s117011702mov 0($ctx),$h0 # load hash value1703mov 8($ctx),$h11704mov 16($ctx),$h2#d17051706mov $s1,$r11707mov $s1,%rax1708shr \$2,$s11709add $r1,$s1 # s1 = r1 + (r1 >> 2)17101711test \$63,$len1712jz .Linit_avx2$suffix17131714.Lbase2_64_pre_avx2$suffix:1715add 0($inp),$h0 # accumulate input1716adc 8($inp),$h11717lea 16($inp),$inp1718adc $padbit,$h21719sub \$16,%r1517201721call __poly1305_block1722mov $r1,%rax17231724test \$63,%r151725jnz .Lbase2_64_pre_avx2$suffix17261727.Linit_avx2$suffix:1728################################# base 2^64 -> base 2^261729mov $h0,%rax1730mov $h0,%rdx1731shr \$52,$h01732mov $h1,$d11733mov $h1,$d21734shr \$26,%rdx1735and \$0x3ffffff,%rax # h[0]1736shl \$12,$d11737and \$0x3ffffff,%rdx # h[1]1738shr \$14,$h11739or $d1,$h01740shl \$24,$h21741and \$0x3ffffff,$h0 # h[2]1742shr \$40,$d21743and \$0x3ffffff,$h1 # h[3]1744or $d2,$h2 # h[4]17451746vmovd %rax#d,%x#$H01747vmovd %rdx#d,%x#$H11748vmovd $h0#d,%x#$H21749vmovd $h1#d,%x#$H31750vmovd $h2#d,%x#$H41751movl \$1,20($ctx) # set is_base2_2617521753call __poly1305_init_avx17541755.Lproceed_avx2$suffix:1756mov %r15,$len # restore $len1757___1758$code.=<<___ if (!$kernel);1759mov OPENSSL_ia32cap_P+8(%rip),%r9d1760mov \$`(1<<31|1<<30|1<<16)`,%r11d1761___1762$code.=<<___;1763pop %r151764.cfi_restore %r151765pop %r141766.cfi_restore 
%r141767pop %r131768.cfi_restore %r131769pop %r121770.cfi_restore %r121771pop %rbx1772.cfi_restore %rbx1773pop %rbp1774.cfi_restore %rbp1775.Lbase2_64_avx2_epilogue$suffix:1776jmp .Ldo_avx2$suffix1777.cfi_endproc17781779.align 321780.Leven_avx2$suffix:1781.cfi_startproc1782___1783$code.=<<___ if (!$kernel);1784mov OPENSSL_ia32cap_P+8(%rip),%r9d1785___1786$code.=<<___;1787vmovd 4*0($ctx),%x#$H0 # load hash value base 2^261788vmovd 4*1($ctx),%x#$H11789vmovd 4*2($ctx),%x#$H21790vmovd 4*3($ctx),%x#$H31791vmovd 4*4($ctx),%x#$H417921793.Ldo_avx2$suffix:1794___1795$code.=<<___ if (!$kernel && $avx>2);1796cmp \$512,$len1797jb .Lskip_avx5121798and %r11d,%r9d1799test \$`1<<16`,%r9d # check for AVX512F1800jnz .Lblocks_avx5121801.Lskip_avx512$suffix:1802___1803$code.=<<___ if ($avx > 2 && $avx512 && $kernel);1804cmp \$512,$len1805jae .Lblocks_avx5121806___1807$code.=<<___ if (!$win64);1808lea 8(%rsp),%r101809.cfi_def_cfa_register %r101810sub \$0x128,%rsp1811___1812$code.=<<___ if ($win64);1813lea 8(%rsp),%r101814sub \$0x1c8,%rsp1815vmovdqa %xmm6,-0xb0(%r10)1816vmovdqa %xmm7,-0xa0(%r10)1817vmovdqa %xmm8,-0x90(%r10)1818vmovdqa %xmm9,-0x80(%r10)1819vmovdqa %xmm10,-0x70(%r10)1820vmovdqa %xmm11,-0x60(%r10)1821vmovdqa %xmm12,-0x50(%r10)1822vmovdqa %xmm13,-0x40(%r10)1823vmovdqa %xmm14,-0x30(%r10)1824vmovdqa %xmm15,-0x20(%r10)1825.Ldo_avx2_body$suffix:1826___1827$code.=<<___;1828lea .Lconst(%rip),%rcx1829lea 48+64($ctx),$ctx # size optimization1830vmovdqa 96(%rcx),$T0 # .Lpermd_avx218311832# expand and copy pre-calculated table to stack1833vmovdqu `16*0-64`($ctx),%x#$T21834and \$-512,%rsp1835vmovdqu `16*1-64`($ctx),%x#$T31836vmovdqu `16*2-64`($ctx),%x#$T41837vmovdqu `16*3-64`($ctx),%x#$D01838vmovdqu `16*4-64`($ctx),%x#$D11839vmovdqu `16*5-64`($ctx),%x#$D21840lea 0x90(%rsp),%rax # size optimization1841vmovdqu `16*6-64`($ctx),%x#$D31842vpermd $T2,$T0,$T2 # 00003412 -> 142434441843vmovdqu `16*7-64`($ctx),%x#$D41844vpermd $T3,$T0,$T31845vmovdqu `16*8-64`($ctx),%x#$MASK1846vpermd $T4,$T0,$T41847vmovdqa $T2,0x00(%rsp)1848vpermd $D0,$T0,$D01849vmovdqa $T3,0x20-0x90(%rax)1850vpermd $D1,$T0,$D11851vmovdqa $T4,0x40-0x90(%rax)1852vpermd $D2,$T0,$D21853vmovdqa $D0,0x60-0x90(%rax)1854vpermd $D3,$T0,$D31855vmovdqa $D1,0x80-0x90(%rax)1856vpermd $D4,$T0,$D41857vmovdqa $D2,0xa0-0x90(%rax)1858vpermd $MASK,$T0,$MASK1859vmovdqa $D3,0xc0-0x90(%rax)1860vmovdqa $D4,0xe0-0x90(%rax)1861vmovdqa $MASK,0x100-0x90(%rax)1862vmovdqa 64(%rcx),$MASK # .Lmask2618631864################################################################1865# load input1866vmovdqu 16*0($inp),%x#$T01867vmovdqu 16*1($inp),%x#$T11868vinserti128 \$1,16*2($inp),$T0,$T01869vinserti128 \$1,16*3($inp),$T1,$T11870lea 16*4($inp),$inp18711872vpsrldq \$6,$T0,$T2 # splat input1873vpsrldq \$6,$T1,$T31874vpunpckhqdq $T1,$T0,$T4 # 41875vpunpcklqdq $T3,$T2,$T2 # 2:31876vpunpcklqdq $T1,$T0,$T0 # 0:118771878vpsrlq \$30,$T2,$T31879vpsrlq \$4,$T2,$T21880vpsrlq \$26,$T0,$T11881vpsrlq \$40,$T4,$T4 # 41882vpand $MASK,$T2,$T2 # 21883vpand $MASK,$T0,$T0 # 01884vpand $MASK,$T1,$T1 # 11885vpand $MASK,$T3,$T3 # 31886vpor 32(%rcx),$T4,$T4 # padbit, yes, always18871888vpaddq $H2,$T2,$H2 # accumulate input1889sub \$64,$len1890jz .Ltail_avx2$suffix1891jmp .Loop_avx2$suffix18921893.align 321894.Loop_avx2$suffix:1895################################################################1896# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^41897# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^31898# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^21899# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^11900# 
\________/\__________/1901################################################################1902#vpaddq $H2,$T2,$H2 # accumulate input1903vpaddq $H0,$T0,$H01904vmovdqa `32*0`(%rsp),$T0 # r0^41905vpaddq $H1,$T1,$H11906vmovdqa `32*1`(%rsp),$T1 # r1^41907vpaddq $H3,$T3,$H31908vmovdqa `32*3`(%rsp),$T2 # r2^41909vpaddq $H4,$T4,$H41910vmovdqa `32*6-0x90`(%rax),$T3 # s3^41911vmovdqa `32*8-0x90`(%rax),$S4 # s4^419121913# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r41914# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r41915# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r41916# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r41917# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r41918#1919# however, as h2 is "chronologically" first one available pull1920# corresponding operations up, so it's1921#1922# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r41923# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r41924# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r41925# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r31926# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r419271928vpmuludq $H2,$T0,$D2 # d2 = h2*r01929vpmuludq $H2,$T1,$D3 # d3 = h2*r11930vpmuludq $H2,$T2,$D4 # d4 = h2*r21931vpmuludq $H2,$T3,$D0 # d0 = h2*s31932vpmuludq $H2,$S4,$D1 # d1 = h2*s419331934vpmuludq $H0,$T1,$T4 # h0*r11935vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp1936vpaddq $T4,$D1,$D1 # d1 += h0*r11937vpaddq $H2,$D2,$D2 # d2 += h1*r11938vpmuludq $H3,$T1,$T4 # h3*r11939vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s11940vpaddq $T4,$D4,$D4 # d4 += h3*r11941vpaddq $H2,$D0,$D0 # d0 += h4*s11942vmovdqa `32*4-0x90`(%rax),$T1 # s219431944vpmuludq $H0,$T0,$T4 # h0*r01945vpmuludq $H1,$T0,$H2 # h1*r01946vpaddq $T4,$D0,$D0 # d0 += h0*r01947vpaddq $H2,$D1,$D1 # d1 += h1*r01948vpmuludq $H3,$T0,$T4 # h3*r01949vpmuludq $H4,$T0,$H2 # h4*r01950vmovdqu 16*0($inp),%x#$T0 # load input1951vpaddq $T4,$D3,$D3 # d3 += h3*r01952vpaddq $H2,$D4,$D4 # d4 += h4*r01953vinserti128 \$1,16*2($inp),$T0,$T019541955vpmuludq $H3,$T1,$T4 # h3*s21956vpmuludq $H4,$T1,$H2 # h4*s21957vmovdqu 16*1($inp),%x#$T11958vpaddq $T4,$D0,$D0 # d0 += h3*s21959vpaddq $H2,$D1,$D1 # d1 += h4*s21960vmovdqa `32*5-0x90`(%rax),$H2 # r31961vpmuludq $H1,$T2,$T4 # h1*r21962vpmuludq $H0,$T2,$T2 # h0*r21963vpaddq $T4,$D3,$D3 # d3 += h1*r21964vpaddq $T2,$D2,$D2 # d2 += h0*r21965vinserti128 \$1,16*3($inp),$T1,$T11966lea 16*4($inp),$inp19671968vpmuludq $H1,$H2,$T4 # h1*r31969vpmuludq $H0,$H2,$H2 # h0*r31970vpsrldq \$6,$T0,$T2 # splat input1971vpaddq $T4,$D4,$D4 # d4 += h1*r31972vpaddq $H2,$D3,$D3 # d3 += h0*r31973vpmuludq $H3,$T3,$T4 # h3*s31974vpmuludq $H4,$T3,$H2 # h4*s31975vpsrldq \$6,$T1,$T31976vpaddq $T4,$D1,$D1 # d1 += h3*s31977vpaddq $H2,$D2,$D2 # d2 += h4*s31978vpunpckhqdq $T1,$T0,$T4 # 419791980vpmuludq $H3,$S4,$H3 # h3*s41981vpmuludq $H4,$S4,$H4 # h4*s41982vpunpcklqdq $T1,$T0,$T0 # 0:11983vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r41984vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r41985vpunpcklqdq $T3,$T2,$T3 # 2:31986vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r41987vpmuludq $H1,$S4,$H0 # h1*s41988vmovdqa 64(%rcx),$MASK # .Lmask261989vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r41990vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s419911992################################################################1993# lazy reduction (interleaved with tail of input splat)19941995vpsrlq \$26,$H3,$D31996vpand $MASK,$H3,$H31997vpaddq $D3,$H4,$H4 # h3 -> h419981999vpsrlq \$26,$H0,$D02000vpand $MASK,$H0,$H02001vpaddq $D0,$D1,$H1 # h0 -> h120022003vpsrlq \$26,$H4,$D42004vpand $MASK,$H4,$H420052006vpsrlq \$4,$T3,$T220072008vpsrlq \$26,$H1,$D12009vpand $MASK,$H1,$H12010vpaddq 
$D1,$H2,$H2 # h1 -> h220112012vpaddq $D4,$H0,$H02013vpsllq \$2,$D4,$D42014vpaddq $D4,$H0,$H0 # h4 -> h020152016vpand $MASK,$T2,$T2 # 22017vpsrlq \$26,$T0,$T120182019vpsrlq \$26,$H2,$D22020vpand $MASK,$H2,$H22021vpaddq $D2,$H3,$H3 # h2 -> h320222023vpaddq $T2,$H2,$H2 # modulo-scheduled2024vpsrlq \$30,$T3,$T320252026vpsrlq \$26,$H0,$D02027vpand $MASK,$H0,$H02028vpaddq $D0,$H1,$H1 # h0 -> h120292030vpsrlq \$40,$T4,$T4 # 420312032vpsrlq \$26,$H3,$D32033vpand $MASK,$H3,$H32034vpaddq $D3,$H4,$H4 # h3 -> h420352036vpand $MASK,$T0,$T0 # 02037vpand $MASK,$T1,$T1 # 12038vpand $MASK,$T3,$T3 # 32039vpor 32(%rcx),$T4,$T4 # padbit, yes, always20402041sub \$64,$len2042jnz .Loop_avx2$suffix20432044.byte 0x66,0x902045.Ltail_avx2$suffix:2046################################################################2047# while above multiplications were by r^4 in all lanes, in last2048# iteration we multiply least significant lane by r^4 and most2049# significant one by r, so copy of above except that references2050# to the precomputed table are displaced by 4...20512052#vpaddq $H2,$T2,$H2 # accumulate input2053vpaddq $H0,$T0,$H02054vmovdqu `32*0+4`(%rsp),$T0 # r0^42055vpaddq $H1,$T1,$H12056vmovdqu `32*1+4`(%rsp),$T1 # r1^42057vpaddq $H3,$T3,$H32058vmovdqu `32*3+4`(%rsp),$T2 # r2^42059vpaddq $H4,$T4,$H42060vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^42061vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^420622063vpmuludq $H2,$T0,$D2 # d2 = h2*r02064vpmuludq $H2,$T1,$D3 # d3 = h2*r12065vpmuludq $H2,$T2,$D4 # d4 = h2*r22066vpmuludq $H2,$T3,$D0 # d0 = h2*s32067vpmuludq $H2,$S4,$D1 # d1 = h2*s420682069vpmuludq $H0,$T1,$T4 # h0*r12070vpmuludq $H1,$T1,$H2 # h1*r12071vpaddq $T4,$D1,$D1 # d1 += h0*r12072vpaddq $H2,$D2,$D2 # d2 += h1*r12073vpmuludq $H3,$T1,$T4 # h3*r12074vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s12075vpaddq $T4,$D4,$D4 # d4 += h3*r12076vpaddq $H2,$D0,$D0 # d0 += h4*s120772078vpmuludq $H0,$T0,$T4 # h0*r02079vpmuludq $H1,$T0,$H2 # h1*r02080vpaddq $T4,$D0,$D0 # d0 += h0*r02081vmovdqu `32*4+4-0x90`(%rax),$T1 # s22082vpaddq $H2,$D1,$D1 # d1 += h1*r02083vpmuludq $H3,$T0,$T4 # h3*r02084vpmuludq $H4,$T0,$H2 # h4*r02085vpaddq $T4,$D3,$D3 # d3 += h3*r02086vpaddq $H2,$D4,$D4 # d4 += h4*r020872088vpmuludq $H3,$T1,$T4 # h3*s22089vpmuludq $H4,$T1,$H2 # h4*s22090vpaddq $T4,$D0,$D0 # d0 += h3*s22091vpaddq $H2,$D1,$D1 # d1 += h4*s22092vmovdqu `32*5+4-0x90`(%rax),$H2 # r32093vpmuludq $H1,$T2,$T4 # h1*r22094vpmuludq $H0,$T2,$T2 # h0*r22095vpaddq $T4,$D3,$D3 # d3 += h1*r22096vpaddq $T2,$D2,$D2 # d2 += h0*r220972098vpmuludq $H1,$H2,$T4 # h1*r32099vpmuludq $H0,$H2,$H2 # h0*r32100vpaddq $T4,$D4,$D4 # d4 += h1*r32101vpaddq $H2,$D3,$D3 # d3 += h0*r32102vpmuludq $H3,$T3,$T4 # h3*s32103vpmuludq $H4,$T3,$H2 # h4*s32104vpaddq $T4,$D1,$D1 # d1 += h3*s32105vpaddq $H2,$D2,$D2 # d2 += h4*s321062107vpmuludq $H3,$S4,$H3 # h3*s42108vpmuludq $H4,$S4,$H4 # h4*s42109vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r42110vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r42111vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r42112vpmuludq $H1,$S4,$H0 # h1*s42113vmovdqa 64(%rcx),$MASK # .Lmask262114vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r42115vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s421162117################################################################2118# horizontal addition21192120vpsrldq \$8,$D1,$T12121vpsrldq \$8,$H2,$T22122vpsrldq \$8,$H3,$T32123vpsrldq \$8,$H4,$T42124vpsrldq \$8,$H0,$T02125vpaddq $T1,$D1,$D12126vpaddq $T2,$H2,$H22127vpaddq $T3,$H3,$H32128vpaddq $T4,$H4,$H42129vpaddq $T0,$H0,$H021302131vpermq \$0x2,$H3,$T32132vpermq \$0x2,$H4,$T42133vpermq \$0x2,$H0,$T02134vpermq \$0x2,$D1,$T12135vpermq 
\$0x2,$H2,$T22136vpaddq $T3,$H3,$H32137vpaddq $T4,$H4,$H42138vpaddq $T0,$H0,$H02139vpaddq $T1,$D1,$D12140vpaddq $T2,$H2,$H221412142################################################################2143# lazy reduction21442145vpsrlq \$26,$H3,$D32146vpand $MASK,$H3,$H32147vpaddq $D3,$H4,$H4 # h3 -> h421482149vpsrlq \$26,$H0,$D02150vpand $MASK,$H0,$H02151vpaddq $D0,$D1,$H1 # h0 -> h121522153vpsrlq \$26,$H4,$D42154vpand $MASK,$H4,$H421552156vpsrlq \$26,$H1,$D12157vpand $MASK,$H1,$H12158vpaddq $D1,$H2,$H2 # h1 -> h221592160vpaddq $D4,$H0,$H02161vpsllq \$2,$D4,$D42162vpaddq $D4,$H0,$H0 # h4 -> h021632164vpsrlq \$26,$H2,$D22165vpand $MASK,$H2,$H22166vpaddq $D2,$H3,$H3 # h2 -> h321672168vpsrlq \$26,$H0,$D02169vpand $MASK,$H0,$H02170vpaddq $D0,$H1,$H1 # h0 -> h121712172vpsrlq \$26,$H3,$D32173vpand $MASK,$H3,$H32174vpaddq $D3,$H4,$H4 # h3 -> h421752176vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced2177vmovd %x#$H1,`4*1-48-64`($ctx)2178vmovd %x#$H2,`4*2-48-64`($ctx)2179vmovd %x#$H3,`4*3-48-64`($ctx)2180vmovd %x#$H4,`4*4-48-64`($ctx)2181___2182$code.=<<___ if ($win64);2183vmovdqa -0xb0(%r10),%xmm62184vmovdqa -0xa0(%r10),%xmm72185vmovdqa -0x90(%r10),%xmm82186vmovdqa -0x80(%r10),%xmm92187vmovdqa -0x70(%r10),%xmm102188vmovdqa -0x60(%r10),%xmm112189vmovdqa -0x50(%r10),%xmm122190vmovdqa -0x40(%r10),%xmm132191vmovdqa -0x30(%r10),%xmm142192vmovdqa -0x20(%r10),%xmm152193lea -8(%r10),%rsp2194.Ldo_avx2_epilogue$suffix:2195___2196$code.=<<___ if (!$win64);2197lea -8(%r10),%rsp2198.cfi_def_cfa_register %rsp2199___2200$code.=<<___;2201vzeroupper2202RET2203.cfi_endproc2204___2205if($avx > 2 && $avx512) {2206my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));2207my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));2208my $PADBIT="%zmm30";22092210map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain2211map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));2212map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));2213map(s/%y/%z/,($MASK));22142215$code.=<<___;2216.cfi_startproc2217.Lblocks_avx512:2218mov \$15,%eax2219kmovw %eax,%k22220___2221$code.=<<___ if (!$win64);2222lea 8(%rsp),%r102223.cfi_def_cfa_register %r102224sub \$0x128,%rsp2225___2226$code.=<<___ if ($win64);2227lea 8(%rsp),%r102228sub \$0x1c8,%rsp2229vmovdqa %xmm6,-0xb0(%r10)2230vmovdqa %xmm7,-0xa0(%r10)2231vmovdqa %xmm8,-0x90(%r10)2232vmovdqa %xmm9,-0x80(%r10)2233vmovdqa %xmm10,-0x70(%r10)2234vmovdqa %xmm11,-0x60(%r10)2235vmovdqa %xmm12,-0x50(%r10)2236vmovdqa %xmm13,-0x40(%r10)2237vmovdqa %xmm14,-0x30(%r10)2238vmovdqa %xmm15,-0x20(%r10)2239.Ldo_avx512_body:2240___2241$code.=<<___;2242lea .Lconst(%rip),%rcx2243lea 48+64($ctx),$ctx # size optimization2244vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx222452246# expand pre-calculated table2247vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}2248and \$-512,%rsp2249vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}2250mov \$0x20,%rax2251vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}2252vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}2253vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}2254vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}2255vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}2256vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}2257vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
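#
# (editor's note, not in the original source) The AVX-512 path consumes
# eight 16-byte blocks per iteration.  The r^1..r^4 table loaded above
# is expanded into 512-bit registers and extended below to the 5th
# through 8th powers of r, so that each 64-bit lane can be multiplied
# by r^8 every round; the final round then applies r^8 down to r^1 to
# the individual lanes before the horizontal addition.
#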
${S4}2258vpermd $D0,$T2,$R0 # 00003412 -> 142434442259vpbroadcastq 64(%rcx),$MASK # .Lmask262260vpermd $D1,$T2,$R12261vpermd $T0,$T2,$S12262vpermd $D2,$T2,$R22263vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 02264vpsrlq \$32,$R0,$T0 # 14243444 -> 010203042265vpermd $T1,$T2,$S22266vmovdqu64 $R1,0x00(%rsp,%rax){%k2}2267vpsrlq \$32,$R1,$T12268vpermd $D3,$T2,$R32269vmovdqa64 $S1,0x40(%rsp){%k2}2270vpermd $T3,$T2,$S32271vpermd $D4,$T2,$R42272vmovdqu64 $R2,0x40(%rsp,%rax){%k2}2273vpermd $T4,$T2,$S42274vmovdqa64 $S2,0x80(%rsp){%k2}2275vmovdqu64 $R3,0x80(%rsp,%rax){%k2}2276vmovdqa64 $S3,0xc0(%rsp){%k2}2277vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}2278vmovdqa64 $S4,0x100(%rsp){%k2}22792280################################################################2281# calculate 5th through 8th powers of the key2282#2283# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r12284# d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r22285# d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r32286# d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r42287# d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r022882289vpmuludq $T0,$R0,$D0 # d0 = r0'*r02290vpmuludq $T0,$R1,$D1 # d1 = r0'*r12291vpmuludq $T0,$R2,$D2 # d2 = r0'*r22292vpmuludq $T0,$R3,$D3 # d3 = r0'*r32293vpmuludq $T0,$R4,$D4 # d4 = r0'*r42294vpsrlq \$32,$R2,$T222952296vpmuludq $T1,$S4,$M02297vpmuludq $T1,$R0,$M12298vpmuludq $T1,$R1,$M22299vpmuludq $T1,$R2,$M32300vpmuludq $T1,$R3,$M42301vpsrlq \$32,$R3,$T32302vpaddq $M0,$D0,$D0 # d0 += r1'*5*r42303vpaddq $M1,$D1,$D1 # d1 += r1'*r02304vpaddq $M2,$D2,$D2 # d2 += r1'*r12305vpaddq $M3,$D3,$D3 # d3 += r1'*r22306vpaddq $M4,$D4,$D4 # d4 += r1'*r323072308vpmuludq $T2,$S3,$M02309vpmuludq $T2,$S4,$M12310vpmuludq $T2,$R1,$M32311vpmuludq $T2,$R2,$M42312vpmuludq $T2,$R0,$M22313vpsrlq \$32,$R4,$T42314vpaddq $M0,$D0,$D0 # d0 += r2'*5*r32315vpaddq $M1,$D1,$D1 # d1 += r2'*5*r42316vpaddq $M3,$D3,$D3 # d3 += r2'*r12317vpaddq $M4,$D4,$D4 # d4 += r2'*r22318vpaddq $M2,$D2,$D2 # d2 += r2'*r023192320vpmuludq $T3,$S2,$M02321vpmuludq $T3,$R0,$M32322vpmuludq $T3,$R1,$M42323vpmuludq $T3,$S3,$M12324vpmuludq $T3,$S4,$M22325vpaddq $M0,$D0,$D0 # d0 += r3'*5*r22326vpaddq $M3,$D3,$D3 # d3 += r3'*r02327vpaddq $M4,$D4,$D4 # d4 += r3'*r12328vpaddq $M1,$D1,$D1 # d1 += r3'*5*r32329vpaddq $M2,$D2,$D2 # d2 += r3'*5*r423302331vpmuludq $T4,$S4,$M32332vpmuludq $T4,$R0,$M42333vpmuludq $T4,$S1,$M02334vpmuludq $T4,$S2,$M12335vpmuludq $T4,$S3,$M22336vpaddq $M3,$D3,$D3 # d3 += r2'*5*r42337vpaddq $M4,$D4,$D4 # d4 += r2'*r02338vpaddq $M0,$D0,$D0 # d0 += r2'*5*r12339vpaddq $M1,$D1,$D1 # d1 += r2'*5*r22340vpaddq $M2,$D2,$D2 # d2 += r2'*5*r323412342################################################################2343# load input2344vmovdqu64 16*0($inp),%z#$T32345vmovdqu64 16*4($inp),%z#$T42346lea 16*8($inp),$inp23472348################################################################2349# lazy reduction23502351vpsrlq \$26,$D3,$M32352vpandq $MASK,$D3,$D32353vpaddq $M3,$D4,$D4 # d3 -> d423542355vpsrlq \$26,$D0,$M02356vpandq $MASK,$D0,$D02357vpaddq $M0,$D1,$D1 # d0 -> d123582359vpsrlq \$26,$D4,$M42360vpandq $MASK,$D4,$D423612362vpsrlq \$26,$D1,$M12363vpandq $MASK,$D1,$D12364vpaddq $M1,$D2,$D2 # d1 -> d223652366vpaddq $M4,$D0,$D02367vpsllq \$2,$M4,$M42368vpaddq $M4,$D0,$D0 # d4 -> d023692370vpsrlq \$26,$D2,$M22371vpandq $MASK,$D2,$D22372vpaddq $M2,$D3,$D3 # d2 -> d323732374vpsrlq \$26,$D0,$M02375vpandq $MASK,$D0,$D02376vpaddq $M0,$D1,$D1 # d0 -> d123772378vpsrlq \$26,$D3,$M32379vpandq $MASK,$D3,$D32380vpaddq $M3,$D4,$D4 # d3 -> 
d423812382################################################################2383# at this point we have 14243444 in $R0-$S4 and 05060708 in2384# $D0-$D4, ...23852386vpunpcklqdq $T4,$T3,$T0 # transpose input2387vpunpckhqdq $T4,$T3,$T423882389# ... since input 64-bit lanes are ordered as 73625140, we could2390# "vperm" it to 76543210 (here and in each loop iteration), *or*2391# we could just flow along, hence the goal for $R0-$S4 is2392# 1858286838784888 ...23932394vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:2395mov \$0x7777,%eax2396kmovw %eax,%k123972398vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---2399vpermd $R1,$M0,$R12400vpermd $R2,$M0,$R22401vpermd $R3,$M0,$R32402vpermd $R4,$M0,$R424032404vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 18582868387848882405vpermd $D1,$M0,${R1}{%k1}2406vpermd $D2,$M0,${R2}{%k1}2407vpermd $D3,$M0,${R3}{%k1}2408vpermd $D4,$M0,${R4}{%k1}24092410vpslld \$2,$R1,$S1 # *52411vpslld \$2,$R2,$S22412vpslld \$2,$R3,$S32413vpslld \$2,$R4,$S42414vpaddd $R1,$S1,$S12415vpaddd $R2,$S2,$S22416vpaddd $R3,$S3,$S32417vpaddd $R4,$S4,$S424182419vpbroadcastq 32(%rcx),$PADBIT # .L12924202421vpsrlq \$52,$T0,$T2 # splat input2422vpsllq \$12,$T4,$T32423vporq $T3,$T2,$T22424vpsrlq \$26,$T0,$T12425vpsrlq \$14,$T4,$T32426vpsrlq \$40,$T4,$T4 # 42427vpandq $MASK,$T2,$T2 # 22428vpandq $MASK,$T0,$T0 # 02429#vpandq $MASK,$T1,$T1 # 12430#vpandq $MASK,$T3,$T3 # 32431#vporq $PADBIT,$T4,$T4 # padbit, yes, always24322433vpaddq $H2,$T2,$H2 # accumulate input2434sub \$192,$len2435jbe .Ltail_avx5122436jmp .Loop_avx51224372438.align 322439.Loop_avx512:2440################################################################2441# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^82442# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^72443# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^62444# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^52445# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^42446# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^32447# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^22448# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^12449# \________/\___________/2450################################################################2451#vpaddq $H2,$T2,$H2 # accumulate input24522453# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r42454# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r42455# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r42456# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r42457# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r42458#2459# however, as h2 is "chronologically" first one available pull2460# corresponding operations up, so it's2461#2462# d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r42463# d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r02464# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r12465# d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r22466# d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r324672468vpmuludq $H2,$R1,$D3 # d3 = h2*r12469vpaddq $H0,$T0,$H02470vpmuludq $H2,$R2,$D4 # d4 = h2*r22471vpandq $MASK,$T1,$T1 # 12472vpmuludq $H2,$S3,$D0 # d0 = h2*s32473vpandq $MASK,$T3,$T3 # 32474vpmuludq $H2,$S4,$D1 # d1 = h2*s42475vporq $PADBIT,$T4,$T4 # padbit, yes, always2476vpmuludq $H2,$R0,$D2 # d2 = h2*r02477vpaddq $H1,$T1,$H1 # accumulate input2478vpaddq $H3,$T3,$H32479vpaddq $H4,$T4,$H424802481vmovdqu64 16*0($inp),$T3 # load input2482vmovdqu64 16*4($inp),$T42483lea 16*8($inp),$inp2484vpmuludq $H0,$R3,$M32485vpmuludq $H0,$R4,$M42486vpmuludq $H0,$R0,$M02487vpmuludq $H0,$R1,$M12488vpaddq $M3,$D3,$D3 # d3 += h0*r32489vpaddq $M4,$D4,$D4 # d4 += h0*r42490vpaddq $M0,$D0,$D0 # d0 += h0*r02491vpaddq $M1,$D1,$D1 # d1 += h0*r124922493vpmuludq $H1,$R2,$M32494vpmuludq 
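################################################################
# The input "splat" above redistributes the two 64-bit halves of
# each 16-byte block into five 26-bit limbs, with the 2^128 pad
# bit landing at bit 24 of the top limb (.L129 is 1<<24). A
# commented-out Perl sketch of one lane (64-bit Perl assumed,
# illustrative only):
#
#	sub splat_block {
#	    my ($lo,$hi) = @_;			# little-endian 64-bit block halves
#	    my $M26 = (1<<26)-1;
#	    return ( $lo                         & $M26,	# bits   0.. 25
#	            ($lo>>26)                    & $M26,	# bits  26.. 51
#	            ($lo>>52) | ($hi&0x3fff)<<12,	# bits  52.. 77
#	            ($hi>>14)                    & $M26,	# bits  78..103
#	            ($hi>>40) | (1<<24) );		# bits 104..128, pad bit at 128
#	}
################################################################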
$H1,$R3,$M42495vpmuludq $H1,$S4,$M02496vpmuludq $H0,$R2,$M22497vpaddq $M3,$D3,$D3 # d3 += h1*r22498vpaddq $M4,$D4,$D4 # d4 += h1*r32499vpaddq $M0,$D0,$D0 # d0 += h1*s42500vpaddq $M2,$D2,$D2 # d2 += h0*r225012502vpunpcklqdq $T4,$T3,$T0 # transpose input2503vpunpckhqdq $T4,$T3,$T425042505vpmuludq $H3,$R0,$M32506vpmuludq $H3,$R1,$M42507vpmuludq $H1,$R0,$M12508vpmuludq $H1,$R1,$M22509vpaddq $M3,$D3,$D3 # d3 += h3*r02510vpaddq $M4,$D4,$D4 # d4 += h3*r12511vpaddq $M1,$D1,$D1 # d1 += h1*r02512vpaddq $M2,$D2,$D2 # d2 += h1*r125132514vpmuludq $H4,$S4,$M32515vpmuludq $H4,$R0,$M42516vpmuludq $H3,$S2,$M02517vpmuludq $H3,$S3,$M12518vpaddq $M3,$D3,$D3 # d3 += h4*s42519vpmuludq $H3,$S4,$M22520vpaddq $M4,$D4,$D4 # d4 += h4*r02521vpaddq $M0,$D0,$D0 # d0 += h3*s22522vpaddq $M1,$D1,$D1 # d1 += h3*s32523vpaddq $M2,$D2,$D2 # d2 += h3*s425242525vpmuludq $H4,$S1,$M02526vpmuludq $H4,$S2,$M12527vpmuludq $H4,$S3,$M22528vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s12529vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s22530vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s325312532################################################################2533# lazy reduction (interleaved with input splat)25342535vpsrlq \$52,$T0,$T2 # splat input2536vpsllq \$12,$T4,$T325372538vpsrlq \$26,$D3,$H32539vpandq $MASK,$D3,$D32540vpaddq $H3,$D4,$H4 # h3 -> h425412542vporq $T3,$T2,$T225432544vpsrlq \$26,$H0,$D02545vpandq $MASK,$H0,$H02546vpaddq $D0,$H1,$H1 # h0 -> h125472548vpandq $MASK,$T2,$T2 # 225492550vpsrlq \$26,$H4,$D42551vpandq $MASK,$H4,$H425522553vpsrlq \$26,$H1,$D12554vpandq $MASK,$H1,$H12555vpaddq $D1,$H2,$H2 # h1 -> h225562557vpaddq $D4,$H0,$H02558vpsllq \$2,$D4,$D42559vpaddq $D4,$H0,$H0 # h4 -> h025602561vpaddq $T2,$H2,$H2 # modulo-scheduled2562vpsrlq \$26,$T0,$T125632564vpsrlq \$26,$H2,$D22565vpandq $MASK,$H2,$H22566vpaddq $D2,$D3,$H3 # h2 -> h325672568vpsrlq \$14,$T4,$T325692570vpsrlq \$26,$H0,$D02571vpandq $MASK,$H0,$H02572vpaddq $D0,$H1,$H1 # h0 -> h125732574vpsrlq \$40,$T4,$T4 # 425752576vpsrlq \$26,$H3,$D32577vpandq $MASK,$H3,$H32578vpaddq $D3,$H4,$H4 # h3 -> h425792580vpandq $MASK,$T0,$T0 # 02581#vpandq $MASK,$T1,$T1 # 12582#vpandq $MASK,$T3,$T3 # 32583#vporq $PADBIT,$T4,$T4 # padbit, yes, always25842585sub \$128,$len2586ja .Loop_avx51225872588.Ltail_avx512:2589################################################################2590# while above multiplications were by r^8 in all lanes, in last2591# iteration we multiply least significant lane by r^8 and most2592# significant one by r, that's why table gets shifted...25932594vpsrlq \$32,$R0,$R0 # 01050206030704082595vpsrlq \$32,$R1,$R12596vpsrlq \$32,$R2,$R22597vpsrlq \$32,$S3,$S32598vpsrlq \$32,$S4,$S42599vpsrlq \$32,$R3,$R32600vpsrlq \$32,$R4,$R42601vpsrlq \$32,$S1,$S12602vpsrlq \$32,$S2,$S226032604################################################################2605# load either next or last 64 byte of input2606lea ($inp,$len),$inp26072608#vpaddq $H2,$T2,$H2 # accumulate input2609vpaddq $H0,$T0,$H026102611vpmuludq $H2,$R1,$D3 # d3 = h2*r12612vpmuludq $H2,$R2,$D4 # d4 = h2*r22613vpmuludq $H2,$S3,$D0 # d0 = h2*s32614vpandq $MASK,$T1,$T1 # 12615vpmuludq $H2,$S4,$D1 # d1 = h2*s42616vpandq $MASK,$T3,$T3 # 32617vpmuludq $H2,$R0,$D2 # d2 = h2*r02618vporq $PADBIT,$T4,$T4 # padbit, yes, always2619vpaddq $H1,$T1,$H1 # accumulate input2620vpaddq $H3,$T3,$H32621vpaddq $H4,$T4,$H426222623vmovdqu 16*0($inp),%x#$T02624vpmuludq $H0,$R3,$M32625vpmuludq $H0,$R4,$M42626vpmuludq $H0,$R0,$M02627vpmuludq $H0,$R1,$M12628vpaddq $M3,$D3,$D3 # d3 += h0*r32629vpaddq $M4,$D4,$D4 # d4 += h0*r42630vpaddq $M0,$D0,$D0 # d0 += 
h0*r02631vpaddq $M1,$D1,$D1 # d1 += h0*r126322633vmovdqu 16*1($inp),%x#$T12634vpmuludq $H1,$R2,$M32635vpmuludq $H1,$R3,$M42636vpmuludq $H1,$S4,$M02637vpmuludq $H0,$R2,$M22638vpaddq $M3,$D3,$D3 # d3 += h1*r22639vpaddq $M4,$D4,$D4 # d4 += h1*r32640vpaddq $M0,$D0,$D0 # d0 += h1*s42641vpaddq $M2,$D2,$D2 # d2 += h0*r226422643vinserti128 \$1,16*2($inp),%y#$T0,%y#$T02644vpmuludq $H3,$R0,$M32645vpmuludq $H3,$R1,$M42646vpmuludq $H1,$R0,$M12647vpmuludq $H1,$R1,$M22648vpaddq $M3,$D3,$D3 # d3 += h3*r02649vpaddq $M4,$D4,$D4 # d4 += h3*r12650vpaddq $M1,$D1,$D1 # d1 += h1*r02651vpaddq $M2,$D2,$D2 # d2 += h1*r126522653vinserti128 \$1,16*3($inp),%y#$T1,%y#$T12654vpmuludq $H4,$S4,$M32655vpmuludq $H4,$R0,$M42656vpmuludq $H3,$S2,$M02657vpmuludq $H3,$S3,$M12658vpmuludq $H3,$S4,$M22659vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s42660vpaddq $M4,$D4,$D4 # d4 += h4*r02661vpaddq $M0,$D0,$D0 # d0 += h3*s22662vpaddq $M1,$D1,$D1 # d1 += h3*s32663vpaddq $M2,$D2,$D2 # d2 += h3*s426642665vpmuludq $H4,$S1,$M02666vpmuludq $H4,$S2,$M12667vpmuludq $H4,$S3,$M22668vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s12669vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s22670vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s326712672################################################################2673# horizontal addition26742675mov \$1,%eax2676vpermq \$0xb1,$H3,$D32677vpermq \$0xb1,$D4,$H42678vpermq \$0xb1,$H0,$D02679vpermq \$0xb1,$H1,$D12680vpermq \$0xb1,$H2,$D22681vpaddq $D3,$H3,$H32682vpaddq $D4,$H4,$H42683vpaddq $D0,$H0,$H02684vpaddq $D1,$H1,$H12685vpaddq $D2,$H2,$H226862687kmovw %eax,%k32688vpermq \$0x2,$H3,$D32689vpermq \$0x2,$H4,$D42690vpermq \$0x2,$H0,$D02691vpermq \$0x2,$H1,$D12692vpermq \$0x2,$H2,$D22693vpaddq $D3,$H3,$H32694vpaddq $D4,$H4,$H42695vpaddq $D0,$H0,$H02696vpaddq $D1,$H1,$H12697vpaddq $D2,$H2,$H226982699vextracti64x4 \$0x1,$H3,%y#$D32700vextracti64x4 \$0x1,$H4,%y#$D42701vextracti64x4 \$0x1,$H0,%y#$D02702vextracti64x4 \$0x1,$H1,%y#$D12703vextracti64x4 \$0x1,$H2,%y#$D22704vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case2705vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx22706vpaddq $D0,$H0,${H0}{%k3}{z}2707vpaddq $D1,$H1,${H1}{%k3}{z}2708vpaddq $D2,$H2,${H2}{%k3}{z}2709___2710map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));2711map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));2712$code.=<<___;2713################################################################2714# lazy reduction (interleaved with input splat)27152716vpsrlq \$26,$H3,$D32717vpand $MASK,$H3,$H32718vpsrldq \$6,$T0,$T2 # splat input2719vpsrldq \$6,$T1,$T32720vpunpckhqdq $T1,$T0,$T4 # 42721vpaddq $D3,$H4,$H4 # h3 -> h427222723vpsrlq \$26,$H0,$D02724vpand $MASK,$H0,$H02725vpunpcklqdq $T3,$T2,$T2 # 2:32726vpunpcklqdq $T1,$T0,$T0 # 0:12727vpaddq $D0,$H1,$H1 # h0 -> h127282729vpsrlq \$26,$H4,$D42730vpand $MASK,$H4,$H427312732vpsrlq \$26,$H1,$D12733vpand $MASK,$H1,$H12734vpsrlq \$30,$T2,$T32735vpsrlq \$4,$T2,$T22736vpaddq $D1,$H2,$H2 # h1 -> h227372738vpaddq $D4,$H0,$H02739vpsllq \$2,$D4,$D42740vpsrlq \$26,$T0,$T12741vpsrlq \$40,$T4,$T4 # 42742vpaddq $D4,$H0,$H0 # h4 -> h027432744vpsrlq \$26,$H2,$D22745vpand $MASK,$H2,$H22746vpand $MASK,$T2,$T2 # 22747vpand $MASK,$T0,$T0 # 02748vpaddq $D2,$H3,$H3 # h2 -> h327492750vpsrlq \$26,$H0,$D02751vpand $MASK,$H0,$H02752vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx22753vpand $MASK,$T1,$T1 # 12754vpaddq $D0,$H1,$H1 # h0 -> h127552756vpsrlq \$26,$H3,$D32757vpand $MASK,$H3,$H32758vpand $MASK,$T3,$T3 # 32759vpor 32(%rcx),$T4,$T4 # padbit, yes, always2760vpaddq $D3,$H4,$H4 # h3 -> h427612762lea 0x90(%rsp),%rax # size optimization for 
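################################################################
# The horizontal addition above folds the per-lane accumulators
# into a single set of limbs: swap neighbouring 64-bit lanes and
# add, fold the opposite 128-bit half in, then the upper 256 bits,
# i.e. log2(#lanes) halving steps. A commented-out Perl sketch of
# the idea (one limb per lane, illustrative only):
#
#	sub fold_lanes {
#	    my @h = @_;				# one limb from each vector lane
#	    while (@h > 1) {
#	        my @hi = splice(@h, @h/2);	# split off the upper half
#	        $h[$_] += $hi[$_] for 0..$#h;	# add it onto the lower half
#	    }
#	    return $h[0];
#	}
################################################################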
.Ltail_avx22763add \$64,$len2764jnz .Ltail_avx2$suffix27652766vpsubq $T2,$H2,$H2 # undo input accumulation2767vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced2768vmovd %x#$H1,`4*1-48-64`($ctx)2769vmovd %x#$H2,`4*2-48-64`($ctx)2770vmovd %x#$H3,`4*3-48-64`($ctx)2771vmovd %x#$H4,`4*4-48-64`($ctx)2772vzeroall2773___2774$code.=<<___ if ($win64);2775movdqa -0xb0(%r10),%xmm62776movdqa -0xa0(%r10),%xmm72777movdqa -0x90(%r10),%xmm82778movdqa -0x80(%r10),%xmm92779movdqa -0x70(%r10),%xmm102780movdqa -0x60(%r10),%xmm112781movdqa -0x50(%r10),%xmm122782movdqa -0x40(%r10),%xmm132783movdqa -0x30(%r10),%xmm142784movdqa -0x20(%r10),%xmm152785lea -8(%r10),%rsp2786.Ldo_avx512_epilogue:2787___2788$code.=<<___ if (!$win64);2789lea -8(%r10),%rsp2790.cfi_def_cfa_register %rsp2791___2792$code.=<<___;2793RET2794.cfi_endproc2795___27962797}27982799}28002801&declare_function("poly1305_blocks_avx2", 32, 4);2802poly1305_blocks_avxN(0);2803&end_function("poly1305_blocks_avx2");28042805#######################################################################2806if ($avx>2) {2807# On entry we have input length divisible by 64. But since inner loop2808# processes 128 bytes per iteration, cases when length is not divisible2809# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this2810# reason stack layout is kept identical to poly1305_blocks_avx2. If not2811# for this tail, we wouldn't have to even allocate stack frame...28122813&declare_function("poly1305_blocks_avx512", 32, 4);2814poly1305_blocks_avxN(1);2815&end_function("poly1305_blocks_avx512");28162817if (!$kernel && $avx>3) {2818########################################################################2819# VPMADD52 version using 2^44 radix.2820#2821# One can argue that base 2^52 would be more natural. Well, even though2822# some operations would be more natural, one has to recognize couple of2823# things. Base 2^52 doesn't provide advantage over base 2^44 if you look2824# at amount of multiply-n-accumulate operations. Secondly, it makes it2825# impossible to pre-compute multiples of 5 [referred to as s[]/sN in2826# reference implementations], which means that more such operations2827# would have to be performed in inner loop, which in turn makes critical2828# path longer. 
In other words, even though base 2^44 reduction might2829# look less elegant, overall critical path is actually shorter...28302831########################################################################2832# Layout of opaque area is following.2833#2834# unsigned __int64 h[3]; # current hash value base 2^442835# unsigned __int64 s[2]; # key value*20 base 2^442836# unsigned __int64 r[3]; # key value base 2^442837# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];2838# # r^n positions reflect2839# # placement in register, not2840# # memory, R[3] is R[1]*2028412842$code.=<<___;2843.type poly1305_init_base2_44,\@function,32844.align 322845poly1305_init_base2_44:2846xor %eax,%eax2847mov %rax,0($ctx) # initialize hash value2848mov %rax,8($ctx)2849mov %rax,16($ctx)28502851.Linit_base2_44:2852lea poly1305_blocks_vpmadd52(%rip),%r102853lea poly1305_emit_base2_44(%rip),%r1128542855mov \$0x0ffffffc0fffffff,%rax2856mov \$0x0ffffffc0ffffffc,%rcx2857and 0($inp),%rax2858mov \$0x00000fffffffffff,%r82859and 8($inp),%rcx2860mov \$0x00000fffffffffff,%r92861and %rax,%r82862shrd \$44,%rcx,%rax2863mov %r8,40($ctx) # r02864and %r9,%rax2865shr \$24,%rcx2866mov %rax,48($ctx) # r12867lea (%rax,%rax,4),%rax # *52868mov %rcx,56($ctx) # r22869shl \$2,%rax # magic <<22870lea (%rcx,%rcx,4),%rcx # *52871shl \$2,%rcx # magic <<22872mov %rax,24($ctx) # s12873mov %rcx,32($ctx) # s22874movq \$-1,64($ctx) # write impossible value2875___2876$code.=<<___ if ($flavour !~ /elf32/);2877mov %r10,0(%rdx)2878mov %r11,8(%rdx)2879___2880$code.=<<___ if ($flavour =~ /elf32/);2881mov %r10d,0(%rdx)2882mov %r11d,4(%rdx)2883___2884$code.=<<___;2885mov \$1,%eax2886RET2887.size poly1305_init_base2_44,.-poly1305_init_base2_442888___2889{2890my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));2891my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));2892my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));28932894$code.=<<___;2895.type poly1305_blocks_vpmadd52,\@function,42896.align 322897poly1305_blocks_vpmadd52:2898shr \$4,$len2899jz .Lno_data_vpmadd52 # too short29002901shl \$40,$padbit2902mov 64($ctx),%r8 # peek on power of the key29032904# if powers of the key are not calculated yet, process up to 32905# blocks with this single-block subroutine, otherwise ensure that2906# length is divisible by 2 blocks and pass the rest down to next2907# subroutine...29082909mov \$3,%rax2910mov \$1,%r102911cmp \$4,$len # is input long2912cmovae %r10,%rax2913test %r8,%r8 # is power value impossible?2914cmovns %r10,%rax29152916and $len,%rax # is input of favourable length?2917jz .Lblocks_vpmadd52_4x29182919sub %rax,$len2920mov \$7,%r10d2921mov \$1,%r11d2922kmovw %r10d,%k72923lea .L2_44_inp_permd(%rip),%r102924kmovw %r11d,%k129252926vmovq $padbit,%x#$PAD2927vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd2928vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift2929vpermq \$0xcf,$PAD,$PAD2930vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask29312932vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value2933vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys2934vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}2935vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}29362937vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt2938vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft29392940jmp .Loop_vpmadd5229412942.align 322943.Loop_vpmadd52:2944vmovdqu32 0($inp),%x#$T0 # load input as ----32102945lea 16($inp),$inp29462947vpermd $T0,$inp_permd,$T0 # ----3210 -> --3221102948vpsrlvq $inp_shift,$T0,$T02949vpandq $reduc_mask,$T0,$T02950vporq $PAD,$T0,$T029512952vpaddq 
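################################################################
# For reference, poly1305_init_base2_44 above splits the clamped
# 128-bit key into base 2^44 limbs r0, r1, r2 and precomputes
# s1 = 20*r1 and s2 = 20*r2: the factor is 5 from 2^130 == 5 mod p
# times 2^2 of radix slack, since wrapped product terms carry
# weight 2^132. A commented-out Perl sketch (requires "use bigint";
# names are illustrative only):
#
#	my $M44 = (1<<44)-1;
#	sub key_base2_44 {
#	    my ($k0,$k1) = @_;			# clamped 64-bit key halves
#	    my $r0 =   $k0 & $M44;
#	    my $r1 = (($k0>>44) | ($k1<<20)) & $M44;
#	    my $r2 =   $k1>>24;			# clamping keeps this well below 2^42
#	    return ($r0, $r1, $r2, 20*$r1, 20*$r2);
#	}
################################################################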
$T0,$Dlo,$Dlo # accumulate input29532954vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value2955vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}2956vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}29572958vpxord $Dlo,$Dlo,$Dlo2959vpxord $Dhi,$Dhi,$Dhi29602961vpmadd52luq $r2r1r0,$H0,$Dlo2962vpmadd52huq $r2r1r0,$H0,$Dhi29632964vpmadd52luq $r1r0s2,$H1,$Dlo2965vpmadd52huq $r1r0s2,$H1,$Dhi29662967vpmadd52luq $r0s2s1,$H2,$Dlo2968vpmadd52huq $r0s2s1,$H2,$Dhi29692970vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword2971vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword2972vpandq $reduc_mask,$Dlo,$Dlo29732974vpaddq $T0,$Dhi,$Dhi29752976vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword29772978vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)29792980vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word2981vpandq $reduc_mask,$Dlo,$Dlo29822983vpermq \$0b10010011,$T0,$T029842985vpaddq $T0,$Dlo,$Dlo29862987vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}29882989vpaddq $T0,$Dlo,$Dlo2990vpsllq \$2,$T0,$T029912992vpaddq $T0,$Dlo,$Dlo29932994dec %rax # len-=162995jnz .Loop_vpmadd5229962997vmovdqu64 $Dlo,0($ctx){%k7} # store hash value29982999test $len,$len3000jnz .Lblocks_vpmadd52_4x30013002.Lno_data_vpmadd52:3003RET3004.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd523005___3006}3007{3008########################################################################3009# As implied by its name 4x subroutine processes 4 blocks in parallel3010# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power3011# and is handled in 256-bit %ymm registers.30123013my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));3014my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));3015my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));30163017$code.=<<___;3018.type poly1305_blocks_vpmadd52_4x,\@function,43019.align 323020poly1305_blocks_vpmadd52_4x:3021shr \$4,$len3022jz .Lno_data_vpmadd52_4x # too short30233024shl \$40,$padbit3025mov 64($ctx),%r8 # peek on power of the key30263027.Lblocks_vpmadd52_4x:3028vpbroadcastq $padbit,$PAD30293030vmovdqa64 .Lx_mask44(%rip),$mask443031mov \$5,%eax3032vmovdqa64 .Lx_mask42(%rip),$mask423033kmovw %eax,%k1 # used in 2x path30343035test %r8,%r8 # is power value impossible?3036js .Linit_vpmadd52 # if it is, then init R[4]30373038vmovq 0($ctx),%x#$H0 # load current hash value3039vmovq 8($ctx),%x#$H13040vmovq 16($ctx),%x#$H230413042test \$3,$len # is length 4*n+2?3043jnz .Lblocks_vpmadd52_2x_do30443045.Lblocks_vpmadd52_4x_do:3046vpbroadcastq 64($ctx),$R0 # load 4th power of the key3047vpbroadcastq 96($ctx),$R13048vpbroadcastq 128($ctx),$R23049vpbroadcastq 160($ctx),$S130503051.Lblocks_vpmadd52_4x_key_loaded:3052vpsllq \$2,$R2,$S2 # S2 = R2*5*43053vpaddq $R2,$S2,$S23054vpsllq \$2,$S2,$S230553056test \$7,$len # is len 8*n?3057jz .Lblocks_vpmadd52_8x30583059vmovdqu64 16*0($inp),$T2 # load data3060vmovdqu64 16*2($inp),$T33061lea 16*4($inp),$inp30623063vpunpcklqdq $T3,$T2,$T1 # transpose data3064vpunpckhqdq $T3,$T2,$T330653066# at this point 64-bit lanes are ordered as 3-1-2-030673068vpsrlq \$24,$T3,$T2 # splat the data3069vporq $PAD,$T2,$T23070vpaddq $T2,$H2,$H2 # accumulate input3071vpandq $mask44,$T1,$T03072vpsrlq \$44,$T1,$T13073vpsllq \$20,$T3,$T33074vporq $T3,$T1,$T13075vpandq $mask44,$T1,$T130763077sub \$4,$len3078jz .Ltail_vpmadd52_4x3079jmp .Loop_vpmadd52_4x3080ud230813082.align 323083.Linit_vpmadd52:3084vmovq 24($ctx),%x#$S1 # load key3085vmovq 56($ctx),%x#$H23086vmovq 32($ctx),%x#$S23087vmovq 40($ctx),%x#$R03088vmovq 48($ctx),%x#$R130893090vmovdqa $R0,$H03091vmovdqa 
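################################################################
# What one iteration above computes, in plain arithmetic: the
# 44/44/42-bit accumulator limbs are multiplied by the key limbs,
# terms past 2^130 are folded back through s1 = 20*r1 and
# s2 = 20*r2, and a single carry pass brings the limbs back into
# range. A commented-out Perl reference (requires "use bigint";
# names are illustrative only):
#
#	my ($M44,$M42) = ((1<<44)-1, (1<<42)-1);
#	sub mul_base2_44 {
#	    my ($h0,$h1,$h2, $r0,$r1,$r2, $s1,$s2) = @_;
#	    my $d0 = $h0*$r0 + $h1*$s2 + $h2*$s1;
#	    my $d1 = $h0*$r1 + $h1*$r0 + $h2*$s2;
#	    my $d2 = $h0*$r2 + $h1*$r1 + $h2*$r0;
#	    my $c;
#	    $c = $d0>>44; $h0 = $d0 & $M44; $d1 += $c;
#	    $c = $d1>>44; $h1 = $d1 & $M44; $d2 += $c;
#	    $c = $d2>>42; $h2 = $d2 & $M42; $h0 += 5*$c;	# 2^130 == 5 mod p
#	    $c = $h0>>44; $h0 &= $M44;      $h1 += $c;	# the "additional step"
#	    return ($h0,$h1,$h2);
#	}
#
# The vpmadd52 instructions deliver the low and high 52 bits of
# each 52x52-bit product separately (accumulating into the
# destination), which is why every d above appears as a lo/hi
# register pair in the code.
################################################################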
$R1,$H13092vmovdqa $H2,$R230933094mov \$2,%eax30953096.Lmul_init_vpmadd52:3097vpxorq $D0lo,$D0lo,$D0lo3098vpmadd52luq $H2,$S1,$D0lo3099vpxorq $D0hi,$D0hi,$D0hi3100vpmadd52huq $H2,$S1,$D0hi3101vpxorq $D1lo,$D1lo,$D1lo3102vpmadd52luq $H2,$S2,$D1lo3103vpxorq $D1hi,$D1hi,$D1hi3104vpmadd52huq $H2,$S2,$D1hi3105vpxorq $D2lo,$D2lo,$D2lo3106vpmadd52luq $H2,$R0,$D2lo3107vpxorq $D2hi,$D2hi,$D2hi3108vpmadd52huq $H2,$R0,$D2hi31093110vpmadd52luq $H0,$R0,$D0lo3111vpmadd52huq $H0,$R0,$D0hi3112vpmadd52luq $H0,$R1,$D1lo3113vpmadd52huq $H0,$R1,$D1hi3114vpmadd52luq $H0,$R2,$D2lo3115vpmadd52huq $H0,$R2,$D2hi31163117vpmadd52luq $H1,$S2,$D0lo3118vpmadd52huq $H1,$S2,$D0hi3119vpmadd52luq $H1,$R0,$D1lo3120vpmadd52huq $H1,$R0,$D1hi3121vpmadd52luq $H1,$R1,$D2lo3122vpmadd52huq $H1,$R1,$D2hi31233124################################################################3125# partial reduction3126vpsrlq \$44,$D0lo,$tmp3127vpsllq \$8,$D0hi,$D0hi3128vpandq $mask44,$D0lo,$H03129vpaddq $tmp,$D0hi,$D0hi31303131vpaddq $D0hi,$D1lo,$D1lo31323133vpsrlq \$44,$D1lo,$tmp3134vpsllq \$8,$D1hi,$D1hi3135vpandq $mask44,$D1lo,$H13136vpaddq $tmp,$D1hi,$D1hi31373138vpaddq $D1hi,$D2lo,$D2lo31393140vpsrlq \$42,$D2lo,$tmp3141vpsllq \$10,$D2hi,$D2hi3142vpandq $mask42,$D2lo,$H23143vpaddq $tmp,$D2hi,$D2hi31443145vpaddq $D2hi,$H0,$H03146vpsllq \$2,$D2hi,$D2hi31473148vpaddq $D2hi,$H0,$H031493150vpsrlq \$44,$H0,$tmp # additional step3151vpandq $mask44,$H0,$H031523153vpaddq $tmp,$H1,$H131543155dec %eax3156jz .Ldone_init_vpmadd5231573158vpunpcklqdq $R1,$H1,$R1 # 1,23159vpbroadcastq %x#$H1,%x#$H1 # 2,23160vpunpcklqdq $R2,$H2,$R23161vpbroadcastq %x#$H2,%x#$H23162vpunpcklqdq $R0,$H0,$R03163vpbroadcastq %x#$H0,%x#$H031643165vpsllq \$2,$R1,$S1 # S1 = R1*5*43166vpsllq \$2,$R2,$S2 # S2 = R2*5*43167vpaddq $R1,$S1,$S13168vpaddq $R2,$S2,$S23169vpsllq \$2,$S1,$S13170vpsllq \$2,$S2,$S231713172jmp .Lmul_init_vpmadd523173ud231743175.align 323176.Ldone_init_vpmadd52:3177vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,43178vinserti128 \$1,%x#$R2,$H2,$R23179vinserti128 \$1,%x#$R0,$H0,$R031803181vpermq \$0b11011000,$R1,$R1 # 1,3,2,43182vpermq \$0b11011000,$R2,$R23183vpermq \$0b11011000,$R0,$R031843185vpsllq \$2,$R1,$S1 # S1 = R1*5*43186vpaddq $R1,$S1,$S13187vpsllq \$2,$S1,$S131883189vmovq 0($ctx),%x#$H0 # load current hash value3190vmovq 8($ctx),%x#$H13191vmovq 16($ctx),%x#$H231923193test \$3,$len # is length 4*n+2?3194jnz .Ldone_init_vpmadd52_2x31953196vmovdqu64 $R0,64($ctx) # save key powers3197vpbroadcastq %x#$R0,$R0 # broadcast 4th power3198vmovdqu64 $R1,96($ctx)3199vpbroadcastq %x#$R1,$R13200vmovdqu64 $R2,128($ctx)3201vpbroadcastq %x#$R2,$R23202vmovdqu64 $S1,160($ctx)3203vpbroadcastq %x#$S1,$S132043205jmp .Lblocks_vpmadd52_4x_key_loaded3206ud232073208.align 323209.Ldone_init_vpmadd52_2x:3210vmovdqu64 $R0,64($ctx) # save key powers3211vpsrldq \$8,$R0,$R0 # 0-1-0-23212vmovdqu64 $R1,96($ctx)3213vpsrldq \$8,$R1,$R13214vmovdqu64 $R2,128($ctx)3215vpsrldq \$8,$R2,$R23216vmovdqu64 $S1,160($ctx)3217vpsrldq \$8,$S1,$S13218jmp .Lblocks_vpmadd52_2x_key_loaded3219ud232203221.align 323222.Lblocks_vpmadd52_2x_do:3223vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers3224vmovdqu64 160+8($ctx),${S1}{%k1}{z}3225vmovdqu64 64+8($ctx),${R0}{%k1}{z}3226vmovdqu64 96+8($ctx),${R1}{%k1}{z}32273228.Lblocks_vpmadd52_2x_key_loaded:3229vmovdqu64 16*0($inp),$T2 # load data3230vpxorq $T3,$T3,$T33231lea 16*2($inp),$inp32323233vpunpcklqdq $T3,$T2,$T1 # transpose data3234vpunpckhqdq $T3,$T2,$T332353236# at this point 64-bit lanes are ordered as x-1-x-032373238vpsrlq \$24,$T3,$T2 # splat the 
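################################################################
# The init path above builds the table of key powers (r^1..r^4,
# stored interleaved as 1-3-2-4) two lanes at a time. A sequential
# commented-out Perl sketch of the same table, reusing the
# mul_base2_44 reference above (requires "use bigint"; names are
# illustrative only):
#
#	sub key_powers {			# r^1..r^4 for the 4-lane path
#	    my ($r0,$r1,$r2,$s1,$s2) = @_;
#	    my @pow = ([ $r0,$r1,$r2 ]);	# r^1
#	    push @pow, [ mul_base2_44(@{$pow[-1]}, $r0,$r1,$r2, $s1,$s2) ] for 1..3;
#	    return @pow;			# each power also needs its own 20*r1, 20*r2
#	}
################################################################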
data3239vporq $PAD,$T2,$T23240vpaddq $T2,$H2,$H2 # accumulate input3241vpandq $mask44,$T1,$T03242vpsrlq \$44,$T1,$T13243vpsllq \$20,$T3,$T33244vporq $T3,$T1,$T13245vpandq $mask44,$T1,$T132463247jmp .Ltail_vpmadd52_2x3248ud232493250.align 323251.Loop_vpmadd52_4x:3252#vpaddq $T2,$H2,$H2 # accumulate input3253vpaddq $T0,$H0,$H03254vpaddq $T1,$H1,$H132553256vpxorq $D0lo,$D0lo,$D0lo3257vpmadd52luq $H2,$S1,$D0lo3258vpxorq $D0hi,$D0hi,$D0hi3259vpmadd52huq $H2,$S1,$D0hi3260vpxorq $D1lo,$D1lo,$D1lo3261vpmadd52luq $H2,$S2,$D1lo3262vpxorq $D1hi,$D1hi,$D1hi3263vpmadd52huq $H2,$S2,$D1hi3264vpxorq $D2lo,$D2lo,$D2lo3265vpmadd52luq $H2,$R0,$D2lo3266vpxorq $D2hi,$D2hi,$D2hi3267vpmadd52huq $H2,$R0,$D2hi32683269vmovdqu64 16*0($inp),$T2 # load data3270vmovdqu64 16*2($inp),$T33271lea 16*4($inp),$inp3272vpmadd52luq $H0,$R0,$D0lo3273vpmadd52huq $H0,$R0,$D0hi3274vpmadd52luq $H0,$R1,$D1lo3275vpmadd52huq $H0,$R1,$D1hi3276vpmadd52luq $H0,$R2,$D2lo3277vpmadd52huq $H0,$R2,$D2hi32783279vpunpcklqdq $T3,$T2,$T1 # transpose data3280vpunpckhqdq $T3,$T2,$T33281vpmadd52luq $H1,$S2,$D0lo3282vpmadd52huq $H1,$S2,$D0hi3283vpmadd52luq $H1,$R0,$D1lo3284vpmadd52huq $H1,$R0,$D1hi3285vpmadd52luq $H1,$R1,$D2lo3286vpmadd52huq $H1,$R1,$D2hi32873288################################################################3289# partial reduction (interleaved with data splat)3290vpsrlq \$44,$D0lo,$tmp3291vpsllq \$8,$D0hi,$D0hi3292vpandq $mask44,$D0lo,$H03293vpaddq $tmp,$D0hi,$D0hi32943295vpsrlq \$24,$T3,$T23296vporq $PAD,$T2,$T23297vpaddq $D0hi,$D1lo,$D1lo32983299vpsrlq \$44,$D1lo,$tmp3300vpsllq \$8,$D1hi,$D1hi3301vpandq $mask44,$D1lo,$H13302vpaddq $tmp,$D1hi,$D1hi33033304vpandq $mask44,$T1,$T03305vpsrlq \$44,$T1,$T13306vpsllq \$20,$T3,$T33307vpaddq $D1hi,$D2lo,$D2lo33083309vpsrlq \$42,$D2lo,$tmp3310vpsllq \$10,$D2hi,$D2hi3311vpandq $mask42,$D2lo,$H23312vpaddq $tmp,$D2hi,$D2hi33133314vpaddq $T2,$H2,$H2 # accumulate input3315vpaddq $D2hi,$H0,$H03316vpsllq \$2,$D2hi,$D2hi33173318vpaddq $D2hi,$H0,$H03319vporq $T3,$T1,$T13320vpandq $mask44,$T1,$T133213322vpsrlq \$44,$H0,$tmp # additional step3323vpandq $mask44,$H0,$H033243325vpaddq $tmp,$H1,$H133263327sub \$4,$len # len-=643328jnz .Loop_vpmadd52_4x33293330.Ltail_vpmadd52_4x:3331vmovdqu64 128($ctx),$R2 # load all key powers3332vmovdqu64 160($ctx),$S13333vmovdqu64 64($ctx),$R03334vmovdqu64 96($ctx),$R133353336.Ltail_vpmadd52_2x:3337vpsllq \$2,$R2,$S2 # S2 = R2*5*43338vpaddq $R2,$S2,$S23339vpsllq \$2,$S2,$S233403341#vpaddq $T2,$H2,$H2 # accumulate input3342vpaddq $T0,$H0,$H03343vpaddq $T1,$H1,$H133443345vpxorq $D0lo,$D0lo,$D0lo3346vpmadd52luq $H2,$S1,$D0lo3347vpxorq $D0hi,$D0hi,$D0hi3348vpmadd52huq $H2,$S1,$D0hi3349vpxorq $D1lo,$D1lo,$D1lo3350vpmadd52luq $H2,$S2,$D1lo3351vpxorq $D1hi,$D1hi,$D1hi3352vpmadd52huq $H2,$S2,$D1hi3353vpxorq $D2lo,$D2lo,$D2lo3354vpmadd52luq $H2,$R0,$D2lo3355vpxorq $D2hi,$D2hi,$D2hi3356vpmadd52huq $H2,$R0,$D2hi33573358vpmadd52luq $H0,$R0,$D0lo3359vpmadd52huq $H0,$R0,$D0hi3360vpmadd52luq $H0,$R1,$D1lo3361vpmadd52huq $H0,$R1,$D1hi3362vpmadd52luq $H0,$R2,$D2lo3363vpmadd52huq $H0,$R2,$D2hi33643365vpmadd52luq $H1,$S2,$D0lo3366vpmadd52huq $H1,$S2,$D0hi3367vpmadd52luq $H1,$R0,$D1lo3368vpmadd52huq $H1,$R0,$D1hi3369vpmadd52luq $H1,$R1,$D2lo3370vpmadd52huq $H1,$R1,$D2hi33713372################################################################3373# horizontal addition33743375mov \$1,%eax3376kmovw %eax,%k13377vpsrldq \$8,$D0lo,$T03378vpsrldq \$8,$D0hi,$H03379vpsrldq \$8,$D1lo,$T13380vpsrldq \$8,$D1hi,$H13381vpaddq $T0,$D0lo,$D0lo3382vpaddq $H0,$D0hi,$D0hi3383vpsrldq \$8,$D2lo,$T23384vpsrldq 
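################################################################
# The 4x (and 8x) subroutines evaluate the polynomial in
# independent lanes: every lane is multiplied by r^4 (r^8) each
# iteration, and in the tail each lane gets one final multiply by
# its own key power so that the lane sums line up with the
# ordinary Horner form. A commented-out Perl sketch of the math
# only, for an input of exactly 4*n blocks (requires "use bigint";
# names are illustrative):
#
#	my $p = (1<<130) - 5;
#	sub poly_4way {
#	    my ($r, @m) = @_;			# @m = padded blocks, 4*n of them
#	    my $r4 = $r**4 % $p;
#	    my @h  = (0) x 4;
#	    while (@m > 4) {
#	        my @blk = splice(@m, 0, 4);
#	        $h[$_] = ($h[$_] + $blk[$_]) * $r4 % $p for 0..3;
#	    }
#	    $h[$_] = ($h[$_] + $m[$_]) * ($r**(4-$_) % $p) % $p for 0..3;
#	    my $acc = 0;
#	    $acc = ($acc + $_) % $p for @h;	# horizontal addition
#	    return $acc;
#	}
################################################################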
\$8,$D2hi,$H23385vpaddq $T1,$D1lo,$D1lo3386vpaddq $H1,$D1hi,$D1hi3387vpermq \$0x2,$D0lo,$T03388vpermq \$0x2,$D0hi,$H03389vpaddq $T2,$D2lo,$D2lo3390vpaddq $H2,$D2hi,$D2hi33913392vpermq \$0x2,$D1lo,$T13393vpermq \$0x2,$D1hi,$H13394vpaddq $T0,$D0lo,${D0lo}{%k1}{z}3395vpaddq $H0,$D0hi,${D0hi}{%k1}{z}3396vpermq \$0x2,$D2lo,$T23397vpermq \$0x2,$D2hi,$H23398vpaddq $T1,$D1lo,${D1lo}{%k1}{z}3399vpaddq $H1,$D1hi,${D1hi}{%k1}{z}3400vpaddq $T2,$D2lo,${D2lo}{%k1}{z}3401vpaddq $H2,$D2hi,${D2hi}{%k1}{z}34023403################################################################3404# partial reduction3405vpsrlq \$44,$D0lo,$tmp3406vpsllq \$8,$D0hi,$D0hi3407vpandq $mask44,$D0lo,$H03408vpaddq $tmp,$D0hi,$D0hi34093410vpaddq $D0hi,$D1lo,$D1lo34113412vpsrlq \$44,$D1lo,$tmp3413vpsllq \$8,$D1hi,$D1hi3414vpandq $mask44,$D1lo,$H13415vpaddq $tmp,$D1hi,$D1hi34163417vpaddq $D1hi,$D2lo,$D2lo34183419vpsrlq \$42,$D2lo,$tmp3420vpsllq \$10,$D2hi,$D2hi3421vpandq $mask42,$D2lo,$H23422vpaddq $tmp,$D2hi,$D2hi34233424vpaddq $D2hi,$H0,$H03425vpsllq \$2,$D2hi,$D2hi34263427vpaddq $D2hi,$H0,$H034283429vpsrlq \$44,$H0,$tmp # additional step3430vpandq $mask44,$H0,$H034313432vpaddq $tmp,$H1,$H13433# at this point $len is3434# either 4*n+2 or 0...3435sub \$2,$len # len-=323436ja .Lblocks_vpmadd52_4x_do34373438vmovq %x#$H0,0($ctx)3439vmovq %x#$H1,8($ctx)3440vmovq %x#$H2,16($ctx)3441vzeroall34423443.Lno_data_vpmadd52_4x:3444RET3445.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x3446___3447}3448{3449########################################################################3450# As implied by its name 8x subroutine processes 8 blocks in parallel...3451# This is intermediate version, as it's used only in cases when input3452# length is either 8*n, 8*n+1 or 8*n+2...34533454my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));3455my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));3456my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));3457my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));34583459$code.=<<___;3460.type poly1305_blocks_vpmadd52_8x,\@function,43461.align 323462poly1305_blocks_vpmadd52_8x:3463shr \$4,$len3464jz .Lno_data_vpmadd52_8x # too short34653466shl \$40,$padbit3467mov 64($ctx),%r8 # peek on power of the key34683469vmovdqa64 .Lx_mask44(%rip),$mask443470vmovdqa64 .Lx_mask42(%rip),$mask4234713472test %r8,%r8 # is power value impossible?3473js .Linit_vpmadd52 # if it is, then init R[4]34743475vmovq 0($ctx),%x#$H0 # load current hash value3476vmovq 8($ctx),%x#$H13477vmovq 16($ctx),%x#$H234783479.Lblocks_vpmadd52_8x:3480################################################################3481# fist we calculate more key powers34823483vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers3484vmovdqu64 160($ctx),$S13485vmovdqu64 64($ctx),$R03486vmovdqu64 96($ctx),$R134873488vpsllq \$2,$R2,$S2 # S2 = R2*5*43489vpaddq $R2,$S2,$S23490vpsllq \$2,$S2,$S234913492vpbroadcastq %x#$R2,$RR2 # broadcast 4th power3493vpbroadcastq %x#$R0,$RR03494vpbroadcastq %x#$R1,$RR134953496vpxorq $D0lo,$D0lo,$D0lo3497vpmadd52luq $RR2,$S1,$D0lo3498vpxorq $D0hi,$D0hi,$D0hi3499vpmadd52huq $RR2,$S1,$D0hi3500vpxorq $D1lo,$D1lo,$D1lo3501vpmadd52luq $RR2,$S2,$D1lo3502vpxorq $D1hi,$D1hi,$D1hi3503vpmadd52huq $RR2,$S2,$D1hi3504vpxorq $D2lo,$D2lo,$D2lo3505vpmadd52luq $RR2,$R0,$D2lo3506vpxorq $D2hi,$D2hi,$D2hi3507vpmadd52huq $RR2,$R0,$D2hi35083509vpmadd52luq $RR0,$R0,$D0lo3510vpmadd52huq $RR0,$R0,$D0hi3511vpmadd52luq $RR0,$R1,$D1lo3512vpmadd52huq $RR0,$R1,$D1hi3513vpmadd52luq $RR0,$R2,$D2lo3514vpmadd52huq 
$RR0,$R2,$D2hi35153516vpmadd52luq $RR1,$S2,$D0lo3517vpmadd52huq $RR1,$S2,$D0hi3518vpmadd52luq $RR1,$R0,$D1lo3519vpmadd52huq $RR1,$R0,$D1hi3520vpmadd52luq $RR1,$R1,$D2lo3521vpmadd52huq $RR1,$R1,$D2hi35223523################################################################3524# partial reduction3525vpsrlq \$44,$D0lo,$tmp3526vpsllq \$8,$D0hi,$D0hi3527vpandq $mask44,$D0lo,$RR03528vpaddq $tmp,$D0hi,$D0hi35293530vpaddq $D0hi,$D1lo,$D1lo35313532vpsrlq \$44,$D1lo,$tmp3533vpsllq \$8,$D1hi,$D1hi3534vpandq $mask44,$D1lo,$RR13535vpaddq $tmp,$D1hi,$D1hi35363537vpaddq $D1hi,$D2lo,$D2lo35383539vpsrlq \$42,$D2lo,$tmp3540vpsllq \$10,$D2hi,$D2hi3541vpandq $mask42,$D2lo,$RR23542vpaddq $tmp,$D2hi,$D2hi35433544vpaddq $D2hi,$RR0,$RR03545vpsllq \$2,$D2hi,$D2hi35463547vpaddq $D2hi,$RR0,$RR035483549vpsrlq \$44,$RR0,$tmp # additional step3550vpandq $mask44,$RR0,$RR035513552vpaddq $tmp,$RR1,$RR135533554################################################################3555# At this point Rx holds 1324 powers, RRx - 5768, and the goal3556# is 15263748, which reflects how data is loaded...35573558vpunpcklqdq $R2,$RR2,$T2 # 37483559vpunpckhqdq $R2,$RR2,$R2 # 15263560vpunpcklqdq $R0,$RR0,$T03561vpunpckhqdq $R0,$RR0,$R03562vpunpcklqdq $R1,$RR1,$T13563vpunpckhqdq $R1,$RR1,$R13564___3565######## switch to %zmm3566map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);3567map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);3568map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);3569map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);35703571$code.=<<___;3572vshufi64x2 \$0x44,$R2,$T2,$RR2 # 152637483573vshufi64x2 \$0x44,$R0,$T0,$RR03574vshufi64x2 \$0x44,$R1,$T1,$RR135753576vmovdqu64 16*0($inp),$T2 # load data3577vmovdqu64 16*4($inp),$T33578lea 16*8($inp),$inp35793580vpsllq \$2,$RR2,$SS2 # S2 = R2*5*43581vpsllq \$2,$RR1,$SS1 # S1 = R1*5*43582vpaddq $RR2,$SS2,$SS23583vpaddq $RR1,$SS1,$SS13584vpsllq \$2,$SS2,$SS23585vpsllq \$2,$SS1,$SS135863587vpbroadcastq $padbit,$PAD3588vpbroadcastq %x#$mask44,$mask443589vpbroadcastq %x#$mask42,$mask4235903591vpbroadcastq %x#$SS1,$S1 # broadcast 8th power3592vpbroadcastq %x#$SS2,$S23593vpbroadcastq %x#$RR0,$R03594vpbroadcastq %x#$RR1,$R13595vpbroadcastq %x#$RR2,$R235963597vpunpcklqdq $T3,$T2,$T1 # transpose data3598vpunpckhqdq $T3,$T2,$T335993600# at this point 64-bit lanes are ordered as 7362514036013602vpsrlq \$24,$T3,$T2 # splat the data3603vporq $PAD,$T2,$T23604vpaddq $T2,$H2,$H2 # accumulate input3605vpandq $mask44,$T1,$T03606vpsrlq \$44,$T1,$T13607vpsllq \$20,$T3,$T33608vporq $T3,$T1,$T13609vpandq $mask44,$T1,$T136103611sub \$8,$len3612jz .Ltail_vpmadd52_8x3613jmp .Loop_vpmadd52_8x36143615.align 323616.Loop_vpmadd52_8x:3617#vpaddq $T2,$H2,$H2 # accumulate input3618vpaddq $T0,$H0,$H03619vpaddq $T1,$H1,$H136203621vpxorq $D0lo,$D0lo,$D0lo3622vpmadd52luq $H2,$S1,$D0lo3623vpxorq $D0hi,$D0hi,$D0hi3624vpmadd52huq $H2,$S1,$D0hi3625vpxorq $D1lo,$D1lo,$D1lo3626vpmadd52luq $H2,$S2,$D1lo3627vpxorq $D1hi,$D1hi,$D1hi3628vpmadd52huq $H2,$S2,$D1hi3629vpxorq $D2lo,$D2lo,$D2lo3630vpmadd52luq $H2,$R0,$D2lo3631vpxorq $D2hi,$D2hi,$D2hi3632vpmadd52huq $H2,$R0,$D2hi36333634vmovdqu64 16*0($inp),$T2 # load data3635vmovdqu64 16*4($inp),$T33636lea 16*8($inp),$inp3637vpmadd52luq $H0,$R0,$D0lo3638vpmadd52huq $H0,$R0,$D0hi3639vpmadd52luq $H0,$R1,$D1lo3640vpmadd52huq $H0,$R1,$D1hi3641vpmadd52luq $H0,$R2,$D2lo3642vpmadd52huq $H0,$R2,$D2hi36433644vpunpcklqdq $T3,$T2,$T1 # transpose data3645vpunpckhqdq $T3,$T2,$T33646vpmadd52luq $H1,$S2,$D0lo3647vpmadd52huq $H1,$S2,$D0hi3648vpmadd52luq $H1,$R0,$D1lo3649vpmadd52huq 
$H1,$R0,$D1hi3650vpmadd52luq $H1,$R1,$D2lo3651vpmadd52huq $H1,$R1,$D2hi36523653################################################################3654# partial reduction (interleaved with data splat)3655vpsrlq \$44,$D0lo,$tmp3656vpsllq \$8,$D0hi,$D0hi3657vpandq $mask44,$D0lo,$H03658vpaddq $tmp,$D0hi,$D0hi36593660vpsrlq \$24,$T3,$T23661vporq $PAD,$T2,$T23662vpaddq $D0hi,$D1lo,$D1lo36633664vpsrlq \$44,$D1lo,$tmp3665vpsllq \$8,$D1hi,$D1hi3666vpandq $mask44,$D1lo,$H13667vpaddq $tmp,$D1hi,$D1hi36683669vpandq $mask44,$T1,$T03670vpsrlq \$44,$T1,$T13671vpsllq \$20,$T3,$T33672vpaddq $D1hi,$D2lo,$D2lo36733674vpsrlq \$42,$D2lo,$tmp3675vpsllq \$10,$D2hi,$D2hi3676vpandq $mask42,$D2lo,$H23677vpaddq $tmp,$D2hi,$D2hi36783679vpaddq $T2,$H2,$H2 # accumulate input3680vpaddq $D2hi,$H0,$H03681vpsllq \$2,$D2hi,$D2hi36823683vpaddq $D2hi,$H0,$H03684vporq $T3,$T1,$T13685vpandq $mask44,$T1,$T136863687vpsrlq \$44,$H0,$tmp # additional step3688vpandq $mask44,$H0,$H036893690vpaddq $tmp,$H1,$H136913692sub \$8,$len # len-=1283693jnz .Loop_vpmadd52_8x36943695.Ltail_vpmadd52_8x:3696#vpaddq $T2,$H2,$H2 # accumulate input3697vpaddq $T0,$H0,$H03698vpaddq $T1,$H1,$H136993700vpxorq $D0lo,$D0lo,$D0lo3701vpmadd52luq $H2,$SS1,$D0lo3702vpxorq $D0hi,$D0hi,$D0hi3703vpmadd52huq $H2,$SS1,$D0hi3704vpxorq $D1lo,$D1lo,$D1lo3705vpmadd52luq $H2,$SS2,$D1lo3706vpxorq $D1hi,$D1hi,$D1hi3707vpmadd52huq $H2,$SS2,$D1hi3708vpxorq $D2lo,$D2lo,$D2lo3709vpmadd52luq $H2,$RR0,$D2lo3710vpxorq $D2hi,$D2hi,$D2hi3711vpmadd52huq $H2,$RR0,$D2hi37123713vpmadd52luq $H0,$RR0,$D0lo3714vpmadd52huq $H0,$RR0,$D0hi3715vpmadd52luq $H0,$RR1,$D1lo3716vpmadd52huq $H0,$RR1,$D1hi3717vpmadd52luq $H0,$RR2,$D2lo3718vpmadd52huq $H0,$RR2,$D2hi37193720vpmadd52luq $H1,$SS2,$D0lo3721vpmadd52huq $H1,$SS2,$D0hi3722vpmadd52luq $H1,$RR0,$D1lo3723vpmadd52huq $H1,$RR0,$D1hi3724vpmadd52luq $H1,$RR1,$D2lo3725vpmadd52huq $H1,$RR1,$D2hi37263727################################################################3728# horizontal addition37293730mov \$1,%eax3731kmovw %eax,%k13732vpsrldq \$8,$D0lo,$T03733vpsrldq \$8,$D0hi,$H03734vpsrldq \$8,$D1lo,$T13735vpsrldq \$8,$D1hi,$H13736vpaddq $T0,$D0lo,$D0lo3737vpaddq $H0,$D0hi,$D0hi3738vpsrldq \$8,$D2lo,$T23739vpsrldq \$8,$D2hi,$H23740vpaddq $T1,$D1lo,$D1lo3741vpaddq $H1,$D1hi,$D1hi3742vpermq \$0x2,$D0lo,$T03743vpermq \$0x2,$D0hi,$H03744vpaddq $T2,$D2lo,$D2lo3745vpaddq $H2,$D2hi,$D2hi37463747vpermq \$0x2,$D1lo,$T13748vpermq \$0x2,$D1hi,$H13749vpaddq $T0,$D0lo,$D0lo3750vpaddq $H0,$D0hi,$D0hi3751vpermq \$0x2,$D2lo,$T23752vpermq \$0x2,$D2hi,$H23753vpaddq $T1,$D1lo,$D1lo3754vpaddq $H1,$D1hi,$D1hi3755vextracti64x4 \$1,$D0lo,%y#$T03756vextracti64x4 \$1,$D0hi,%y#$H03757vpaddq $T2,$D2lo,$D2lo3758vpaddq $H2,$D2hi,$D2hi37593760vextracti64x4 \$1,$D1lo,%y#$T13761vextracti64x4 \$1,$D1hi,%y#$H13762vextracti64x4 \$1,$D2lo,%y#$T23763vextracti64x4 \$1,$D2hi,%y#$H23764___3765######## switch back to %ymm3766map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);3767map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);3768map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);37693770$code.=<<___;3771vpaddq $T0,$D0lo,${D0lo}{%k1}{z}3772vpaddq $H0,$D0hi,${D0hi}{%k1}{z}3773vpaddq $T1,$D1lo,${D1lo}{%k1}{z}3774vpaddq $H1,$D1hi,${D1hi}{%k1}{z}3775vpaddq $T2,$D2lo,${D2lo}{%k1}{z}3776vpaddq $H2,$D2hi,${D2hi}{%k1}{z}37773778################################################################3779# partial reduction3780vpsrlq \$44,$D0lo,$tmp3781vpsllq \$8,$D0hi,$D0hi3782vpandq $mask44,$D0lo,$H03783vpaddq $tmp,$D0hi,$D0hi37843785vpaddq $D0hi,$D1lo,$D1lo37863787vpsrlq \$44,$D1lo,$tmp3788vpsllq 
\$8,$D1hi,$D1hi3789vpandq $mask44,$D1lo,$H13790vpaddq $tmp,$D1hi,$D1hi37913792vpaddq $D1hi,$D2lo,$D2lo37933794vpsrlq \$42,$D2lo,$tmp3795vpsllq \$10,$D2hi,$D2hi3796vpandq $mask42,$D2lo,$H23797vpaddq $tmp,$D2hi,$D2hi37983799vpaddq $D2hi,$H0,$H03800vpsllq \$2,$D2hi,$D2hi38013802vpaddq $D2hi,$H0,$H038033804vpsrlq \$44,$H0,$tmp # additional step3805vpandq $mask44,$H0,$H038063807vpaddq $tmp,$H1,$H138083809################################################################38103811vmovq %x#$H0,0($ctx)3812vmovq %x#$H1,8($ctx)3813vmovq %x#$H2,16($ctx)3814vzeroall38153816.Lno_data_vpmadd52_8x:3817RET3818.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x3819___3820}3821$code.=<<___;3822.type poly1305_emit_base2_44,\@function,33823.align 323824poly1305_emit_base2_44:3825mov 0($ctx),%r8 # load hash value3826mov 8($ctx),%r93827mov 16($ctx),%r1038283829mov %r9,%rax3830shr \$20,%r93831shl \$44,%rax3832mov %r10,%rcx3833shr \$40,%r103834shl \$24,%rcx38353836add %rax,%r83837adc %rcx,%r93838adc \$0,%r1038393840mov %r8,%rax3841add \$5,%r8 # compare to modulus3842mov %r9,%rcx3843adc \$0,%r93844adc \$0,%r103845shr \$2,%r10 # did 130-bit value overflow?3846cmovnz %r8,%rax3847cmovnz %r9,%rcx38483849add 0($nonce),%rax # accumulate nonce3850adc 8($nonce),%rcx3851mov %rax,0($mac) # write result3852mov %rcx,8($mac)38533854RET3855.size poly1305_emit_base2_44,.-poly1305_emit_base2_443856___3857} } }3858}38593860if (!$kernel)3861{ # chacha20-poly1305 helpers3862my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order3863("%rdi","%rsi","%rdx","%rcx"); # Unix order3864$code.=<<___;3865.globl xor128_encrypt_n_pad3866.type xor128_encrypt_n_pad,\@abi-omnipotent3867.align 163868xor128_encrypt_n_pad:3869sub $otp,$inp3870sub $otp,$out3871mov $len,%r10 # put len aside3872shr \$4,$len # len / 163873jz .Ltail_enc3874nop3875.Loop_enc_xmm:3876movdqu ($inp,$otp),%xmm03877pxor ($otp),%xmm03878movdqu %xmm0,($out,$otp)3879movdqa %xmm0,($otp)3880lea 16($otp),$otp3881dec $len3882jnz .Loop_enc_xmm38833884and \$15,%r10 # len % 163885jz .Ldone_enc38863887.Ltail_enc:3888mov \$16,$len3889sub %r10,$len3890xor %eax,%eax3891.Loop_enc_byte:3892mov ($inp,$otp),%al3893xor ($otp),%al3894mov %al,($out,$otp)3895mov %al,($otp)3896lea 1($otp),$otp3897dec %r103898jnz .Loop_enc_byte38993900xor %eax,%eax3901.Loop_enc_pad:3902mov %al,($otp)3903lea 1($otp),$otp3904dec $len3905jnz .Loop_enc_pad39063907.Ldone_enc:3908mov $otp,%rax3909RET3910.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad39113912.globl xor128_decrypt_n_pad3913.type xor128_decrypt_n_pad,\@abi-omnipotent3914.align 163915xor128_decrypt_n_pad:3916sub $otp,$inp3917sub $otp,$out3918mov $len,%r10 # put len aside3919shr \$4,$len # len / 163920jz .Ltail_dec3921nop3922.Loop_dec_xmm:3923movdqu ($inp,$otp),%xmm03924movdqa ($otp),%xmm13925pxor %xmm0,%xmm13926movdqu %xmm1,($out,$otp)3927movdqa %xmm0,($otp)3928lea 16($otp),$otp3929dec $len3930jnz .Loop_dec_xmm39313932pxor %xmm1,%xmm13933and \$15,%r10 # len % 163934jz .Ldone_dec39353936.Ltail_dec:3937mov \$16,$len3938sub %r10,$len3939xor %eax,%eax3940xor %r11d,%r11d3941.Loop_dec_byte:3942mov ($inp,$otp),%r11b3943mov ($otp),%al3944xor %r11b,%al3945mov %al,($out,$otp)3946mov %r11b,($otp)3947lea 1($otp),$otp3948dec %r103949jnz .Loop_dec_byte39503951xor %eax,%eax3952.Loop_dec_pad:3953mov %al,($otp)3954lea 1($otp),$otp3955dec $len3956jnz .Loop_dec_pad39573958.Ldone_dec:3959mov $otp,%rax3960RET3961.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad3962___3963}39643965# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 
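################################################################
# poly1305_emit_base2_44 above recombines the three base 2^44
# limbs into one value, performs the final conditional subtraction
# of p = 2^130-5 (done branchlessly by adding 5 and testing bit
# 130), and adds the nonce modulo 2^128. A commented-out Perl
# reference of the same computation (requires "use bigint"; names
# are illustrative only):
#
#	sub emit_base2_44 {
#	    my ($h0,$h1,$h2, $n0,$n1) = @_;	# hash limbs, nonce 64-bit halves
#	    my $p = (1<<130) - 5;
#	    my $h = $h0 + ($h1<<44) + ($h2<<88);
#	    $h -= $p if $h >= $p;		# partially reduced h is already < 2*p
#	    return ($h + $n0 + ($n1<<64)) % (1<<128);	# the 16-byte tag
#	}
################################################################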
frame,3966# CONTEXT *context,DISPATCHER_CONTEXT *disp)3967if ($win64) {3968$rec="%rcx";3969$frame="%rdx";3970$context="%r8";3971$disp="%r9";39723973$code.=<<___;3974.extern __imp_RtlVirtualUnwind3975.type se_handler,\@abi-omnipotent3976.align 163977se_handler:3978push %rsi3979push %rdi3980push %rbx3981push %rbp3982push %r123983push %r133984push %r143985push %r153986pushfq3987sub \$64,%rsp39883989mov 120($context),%rax # pull context->Rax3990mov 248($context),%rbx # pull context->Rip39913992mov 8($disp),%rsi # disp->ImageBase3993mov 56($disp),%r11 # disp->HandlerData39943995mov 0(%r11),%r10d # HandlerData[0]3996lea (%rsi,%r10),%r10 # prologue label3997cmp %r10,%rbx # context->Rip<.Lprologue3998jb .Lcommon_seh_tail39994000mov 152($context),%rax # pull context->Rsp40014002mov 4(%r11),%r10d # HandlerData[1]4003lea (%rsi,%r10),%r10 # epilogue label4004cmp %r10,%rbx # context->Rip>=.Lepilogue4005jae .Lcommon_seh_tail40064007lea 48(%rax),%rax40084009mov -8(%rax),%rbx4010mov -16(%rax),%rbp4011mov -24(%rax),%r124012mov -32(%rax),%r134013mov -40(%rax),%r144014mov -48(%rax),%r154015mov %rbx,144($context) # restore context->Rbx4016mov %rbp,160($context) # restore context->Rbp4017mov %r12,216($context) # restore context->R124018mov %r13,224($context) # restore context->R134019mov %r14,232($context) # restore context->R144020mov %r15,240($context) # restore context->R1440214022jmp .Lcommon_seh_tail4023.size se_handler,.-se_handler40244025.type avx_handler,\@abi-omnipotent4026.align 164027avx_handler:4028push %rsi4029push %rdi4030push %rbx4031push %rbp4032push %r124033push %r134034push %r144035push %r154036pushfq4037sub \$64,%rsp40384039mov 120($context),%rax # pull context->Rax4040mov 248($context),%rbx # pull context->Rip40414042mov 8($disp),%rsi # disp->ImageBase4043mov 56($disp),%r11 # disp->HandlerData40444045mov 0(%r11),%r10d # HandlerData[0]4046lea (%rsi,%r10),%r10 # prologue label4047cmp %r10,%rbx # context->Rip<prologue label4048jb .Lcommon_seh_tail40494050mov 152($context),%rax # pull context->Rsp40514052mov 4(%r11),%r10d # HandlerData[1]4053lea (%rsi,%r10),%r10 # epilogue label4054cmp %r10,%rbx # context->Rip>=epilogue label4055jae .Lcommon_seh_tail40564057mov 208($context),%rax # pull context->R1140584059lea 0x50(%rax),%rsi4060lea 0xf8(%rax),%rax4061lea 512($context),%rdi # &context.Xmm64062mov \$20,%ecx4063.long 0xa548f3fc # cld; rep movsq40644065.Lcommon_seh_tail:4066mov 8(%rax),%rdi4067mov 16(%rax),%rsi4068mov %rax,152($context) # restore context->Rsp4069mov %rsi,168($context) # restore context->Rsi4070mov %rdi,176($context) # restore context->Rdi40714072mov 40($disp),%rdi # disp->ContextRecord4073mov $context,%rsi # context4074mov \$154,%ecx # sizeof(CONTEXT)4075.long 0xa548f3fc # cld; rep movsq40764077mov $disp,%rsi4078xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER4079mov 8(%rsi),%rdx # arg2, disp->ImageBase4080mov 0(%rsi),%r8 # arg3, disp->ControlPc4081mov 16(%rsi),%r9 # arg4, disp->FunctionEntry4082mov 40(%rsi),%r10 # disp->ContextRecord4083lea 56(%rsi),%r11 # &disp->HandlerData4084lea 24(%rsi),%r12 # &disp->EstablisherFrame4085mov %r10,32(%rsp) # arg54086mov %r11,40(%rsp) # arg64087mov %r12,48(%rsp) # arg74088mov %rcx,56(%rsp) # arg8, (NULL)4089call *__imp_RtlVirtualUnwind(%rip)40904091mov \$1,%eax # ExceptionContinueSearch4092add \$64,%rsp4093popfq4094pop %r154095pop %r144096pop %r134097pop %r124098pop %rbp4099pop %rbx4100pop %rdi4101pop %rsi4102RET4103.size avx_handler,.-avx_handler41044105.section .pdata4106.align 44107.rva .LSEH_begin_poly1305_init_x86_644108.rva 
.LSEH_end_poly1305_init_x86_644109.rva .LSEH_info_poly1305_init_x86_6441104111.rva .LSEH_begin_poly1305_blocks_x86_644112.rva .LSEH_end_poly1305_blocks_x86_644113.rva .LSEH_info_poly1305_blocks_x86_6441144115.rva .LSEH_begin_poly1305_emit_x86_644116.rva .LSEH_end_poly1305_emit_x86_644117.rva .LSEH_info_poly1305_emit_x86_644118___4119$code.=<<___ if ($avx);4120.rva .LSEH_begin_poly1305_blocks_avx4121.rva .Lbase2_64_avx4122.rva .LSEH_info_poly1305_blocks_avx_141234124.rva .Lbase2_64_avx4125.rva .Leven_avx4126.rva .LSEH_info_poly1305_blocks_avx_241274128.rva .Leven_avx4129.rva .LSEH_end_poly1305_blocks_avx4130.rva .LSEH_info_poly1305_blocks_avx_341314132.rva .LSEH_begin_poly1305_emit_avx4133.rva .LSEH_end_poly1305_emit_avx4134.rva .LSEH_info_poly1305_emit_avx4135___4136$code.=<<___ if ($avx>1);4137.rva .LSEH_begin_poly1305_blocks_avx24138.rva .Lbase2_64_avx24139.rva .LSEH_info_poly1305_blocks_avx2_141404141.rva .Lbase2_64_avx24142.rva .Leven_avx24143.rva .LSEH_info_poly1305_blocks_avx2_241444145.rva .Leven_avx24146.rva .LSEH_end_poly1305_blocks_avx24147.rva .LSEH_info_poly1305_blocks_avx2_34148___4149$code.=<<___ if ($avx>2);4150.rva .LSEH_begin_poly1305_blocks_avx5124151.rva .LSEH_end_poly1305_blocks_avx5124152.rva .LSEH_info_poly1305_blocks_avx5124153___4154$code.=<<___;4155.section .xdata4156.align 84157.LSEH_info_poly1305_init_x86_64:4158.byte 9,0,0,04159.rva se_handler4160.rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_6441614162.LSEH_info_poly1305_blocks_x86_64:4163.byte 9,0,0,04164.rva se_handler4165.rva .Lblocks_body,.Lblocks_epilogue41664167.LSEH_info_poly1305_emit_x86_64:4168.byte 9,0,0,04169.rva se_handler4170.rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_644171___4172$code.=<<___ if ($avx);4173.LSEH_info_poly1305_blocks_avx_1:4174.byte 9,0,0,04175.rva se_handler4176.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]41774178.LSEH_info_poly1305_blocks_avx_2:4179.byte 9,0,0,04180.rva se_handler4181.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]41824183.LSEH_info_poly1305_blocks_avx_3:4184.byte 9,0,0,04185.rva avx_handler4186.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]41874188.LSEH_info_poly1305_emit_avx:4189.byte 9,0,0,04190.rva se_handler4191.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx4192___4193$code.=<<___ if ($avx>1);4194.LSEH_info_poly1305_blocks_avx2_1:4195.byte 9,0,0,04196.rva se_handler4197.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]41984199.LSEH_info_poly1305_blocks_avx2_2:4200.byte 9,0,0,04201.rva se_handler4202.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]42034204.LSEH_info_poly1305_blocks_avx2_3:4205.byte 9,0,0,04206.rva avx_handler4207.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]4208___4209$code.=<<___ if ($avx>2);4210.LSEH_info_poly1305_blocks_avx512:4211.byte 9,0,0,04212.rva avx_handler4213.rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]4214___4215}42164217open SELF,$0;4218while(<SELF>) {4219next if (/^#!/);4220last if (!s/^#/\/\// and !/^$/);4221print;4222}4223close SELF;42244225foreach (split('\n',$code)) {4226s/\`([^\`]*)\`/eval($1)/ge;4227s/%r([a-z]+)#d/%e$1/g;4228s/%r([0-9]+)#d/%r$1d/g;4229s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;42304231if ($kernel) {4232s/(^\.type.*),[0-9]+$/\1/;4233s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;4234next if /^\.cfi.*/;4235}42364237print $_,"\n";4238}4239close STDOUT;424042414242
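# For reference, the substitutions in the output loop above expand the
# shorthand register notation used throughout this file; a few assumed
# examples of what they rewrite:
#
#	mov   %rax#d,%r11#d      ->  mov   %eax,%r11d     # #d selects the 32-bit form
#	vmovd %x#%ymm7,(%rdi)    ->  vmovd %xmm7,(%rdi)   # %x#%y picks the XMM alias
#	lea   `16*4`(%rsi),%rsi  ->  lea   64(%rsi),%rsi  # backticked expressions eval'd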