#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
# ====================================================================
#
# Poly1305 hash for RISC-V.
#
# February 2019
#
# In essence it's a pretty straightforward transliteration of the MIPS
# module [without the big-endian option].
#
# 1.8 cycles per byte on U74, >100% faster than compiler-generated
# code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69%
# improvement.
#
# June 2024
#
# Add CHERI support.
#
######################################################################
#
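# For reference, Poly1305 processes the input in 16-byte blocks m[i],
# carrying a 130-bit accumulator h between blocks:
#
#    h = (h + m[i] + padbit*2^128) * r  mod  2^130-5
#
# and emits tag = (h + nonce) mod 2^128. Both code paths below keep h
# and r in machine-word limbs: base 2^64 (64-bit) or base 2^32 (32-bit).
#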
($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
#
######################################################################

$flavour = shift || "64";

for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";

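# $flavour selects the code path: "64" (default) or "32", optionally
# prefixed with "cheri" (e.g. "cheri64") to trigger the capability-mode
# rewriting done by the post-processing loop at the end of this file.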
$code.=<<___;
#ifdef __KERNEL__
# ifdef __riscv_zicfilp
#  undef __riscv_zicfilp // calls are expected to be direct
# endif
#endif

#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
# define __riscv_misaligned_fast 1
#endif
___

if ($flavour =~ /64/) {{{
######################################################################
# 64-bit code path...
#
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);

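# Context layout on the 64-bit path, as established by poly1305_init:
#
#    0(ctx):  h0 \
#    8(ctx):  h1  > accumulator h, base 2^64
#    16(ctx): h2 /
#    24(ctx): r0
#    32(ctx): r1
#    40(ctx): s1 = r1 + (r1>>2) = (5*r1)/4
#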
$code.=<<___;
#if __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH csc
#  define POP clc
# else
#  define PUSH sd
#  define POP ld
# endif
#else
# error "unsupported __riscv_xlen"
#endif

.option pic
.text

.globl poly1305_init
.type poly1305_init,\@function
poly1305_init:
#ifdef __riscv_zicfilp
lpad 0
#endif
sd $zero,0($ctx)
sd $zero,8($ctx)
sd $zero,16($ctx)

beqz $inp,.Lno_key

#ifndef __riscv_misaligned_fast
andi $tmp0,$inp,7 # $inp % 8
andi $inp,$inp,-8 # align $inp
slli $tmp0,$tmp0,3 # byte to bit offset
#endif
ld $in0,0($inp)
ld $in1,8($inp)
#ifndef __riscv_misaligned_fast
beqz $tmp0,.Laligned_key

ld $tmp2,16($inp)
neg $tmp1,$tmp0 # implicit &63 in sll
srl $in0,$in0,$tmp0
sll $tmp3,$in1,$tmp1
srl $in1,$in1,$tmp0
sll $tmp2,$tmp2,$tmp1
or $in0,$in0,$tmp3
or $in1,$in1,$tmp2

.Laligned_key:
#endif
li $tmp0,1
slli $tmp0,$tmp0,32 # 0x0000000100000000
addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff

and $in0,$in0,$tmp0
addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
and $in1,$in1,$tmp0

sd $in0,24($ctx)
srli $tmp0,$in1,2
sd $in1,32($ctx)
add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
sd $tmp0,40($ctx)

.Lno_key:
li $a0,0 # return 0
ret
.size poly1305_init,.-poly1305_init
___
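# The mask pair above implements Poly1305 key clamping (RFC 8439): the
# top four bits of every 32-bit word of r and the low two bits of its
# three upper words are cleared. Because r1's low two bits are zero,
# s1 = r1 + (r1>>2) is exactly (5*r1)/4, which poly1305_blocks uses to
# fold bits of h above 2^130 back in, since 2^130 == 5 (mod 2^130-5).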
{
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
my ($shr,$shl) = ($t5,$t6); # used on the misaligned-input path

$code.=<<___;
.globl poly1305_blocks
.type poly1305_blocks,\@function
poly1305_blocks:
#ifdef __riscv_zicfilp
lpad 0
#endif
andi $len,$len,-16 # complete blocks only
beqz $len,.Lno_data

caddi $sp,$sp,-4*__SIZEOF_POINTER__
PUSH $s0,3*__SIZEOF_POINTER__($sp)
PUSH $s1,2*__SIZEOF_POINTER__($sp)
PUSH $s2,1*__SIZEOF_POINTER__($sp)
PUSH $s3,0*__SIZEOF_POINTER__($sp)

#ifndef __riscv_misaligned_fast
andi $shr,$inp,7
andi $inp,$inp,-8 # align $inp
slli $shr,$shr,3 # byte to bit offset
neg $shl,$shr # implicit &63 in sll
#endif

ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)

ld $r0,24($ctx) # load key
ld $r1,32($ctx)
ld $rs1,40($ctx)

add $len,$len,$inp # end of buffer

.Loop:
ld $in0,0($inp) # load input
ld $in1,8($inp)
#ifndef __riscv_misaligned_fast
beqz $shr,.Laligned_inp

ld $tmp2,16($inp)
srl $in0,$in0,$shr
sll $tmp3,$in1,$shl
srl $in1,$in1,$shr
sll $tmp2,$tmp2,$shl
or $in0,$in0,$tmp3
or $in1,$in1,$tmp2

.Laligned_inp:
#endif
caddi $inp,$inp,16

andi $tmp0,$h2,-4 # modulo-scheduled reduction
srli $tmp1,$h2,2
andi $h2,$h2,3

add $d0,$h0,$in0 # accumulate input
add $tmp1,$tmp1,$tmp0
sltu $tmp0,$d0,$h0
add $d0,$d0,$tmp1 # ... and residue
sltu $tmp1,$d0,$tmp1
add $d1,$h1,$in1
add $tmp0,$tmp0,$tmp1
sltu $tmp1,$d1,$h1
add $d1,$d1,$tmp0

add $d2,$h2,$padbit
sltu $tmp0,$d1,$tmp0
mulhu $h1,$r0,$d0 # h0*r0
mul $h0,$r0,$d0

add $d2,$d2,$tmp1
add $d2,$d2,$tmp0
mulhu $tmp1,$rs1,$d1 # h1*5*r1
mul $tmp0,$rs1,$d1

mulhu $h2,$r1,$d0 # h0*r1
mul $tmp2,$r1,$d0
add $h0,$h0,$tmp0
add $h1,$h1,$tmp1
sltu $tmp0,$h0,$tmp0

add $h1,$h1,$tmp0
add $h1,$h1,$tmp2
mulhu $tmp1,$r0,$d1 # h1*r0
mul $tmp0,$r0,$d1

sltu $tmp2,$h1,$tmp2
add $h2,$h2,$tmp2
mul $tmp2,$rs1,$d2 # h2*5*r1

add $h1,$h1,$tmp0
add $h2,$h2,$tmp1
mul $tmp3,$r0,$d2 # h2*r0
sltu $tmp0,$h1,$tmp0
add $h2,$h2,$tmp0

add $h1,$h1,$tmp2
sltu $tmp2,$h1,$tmp2
add $h2,$h2,$tmp2
add $h2,$h2,$tmp3

bne $inp,$len,.Loop

sd $h0,0($ctx) # store hash value
sd $h1,8($ctx)
sd $h2,16($ctx)

POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
POP $s1,2*__SIZEOF_POINTER__($sp)
POP $s2,1*__SIZEOF_POINTER__($sp)
POP $s3,0*__SIZEOF_POINTER__($sp)
caddi $sp,$sp,4*__SIZEOF_POINTER__

.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
___
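# Note the modulo-scheduled reduction at the top of .Loop above: the
# previous iteration leaves h2 unreduced, so its bits above position 1
# are folded into the low limb as (h2 & ~3) + (h2>>2) = 5*(h2>>2),
# using 2^130 == 5 (mod 2^130-5); only h2 & 3 is carried forward.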
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

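# poly1305_emit's final reduction is branch-free: it computes h + 5 and
# propagates carries; if the sum reaches 2^130, bit 2 of the top limb
# yields an all-ones mask (via neg) and the xor/and/xor idiom selects
# h + 5 (== h - p mod 2^128, with p = 2^130-5) over h before the nonce
# is added.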
$code.=<<___;
.globl poly1305_emit
.type poly1305_emit,\@function
poly1305_emit:
#ifdef __riscv_zicfilp
lpad 0
#endif
ld $tmp2,16($ctx)
ld $tmp0,0($ctx)
ld $tmp1,8($ctx)

andi $in0,$tmp2,-4 # final reduction
srl $in1,$tmp2,2
andi $tmp2,$tmp2,3
add $in0,$in0,$in1

add $tmp0,$tmp0,$in0
sltu $in1,$tmp0,$in0
addi $in0,$tmp0,5 # compare to modulus
add $tmp1,$tmp1,$in1
sltiu $tmp3,$in0,5
sltu $tmp4,$tmp1,$in1
add $in1,$tmp1,$tmp3
add $tmp2,$tmp2,$tmp4
sltu $tmp3,$in1,$tmp3
add $tmp2,$tmp2,$tmp3

srli $tmp2,$tmp2,2 # see if it carried/borrowed
neg $tmp2,$tmp2

xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1
and $in0,$in0,$tmp2
and $in1,$in1,$tmp2
xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1

lwu $tmp0,0($nonce) # load nonce
lwu $tmp1,4($nonce)
lwu $tmp2,8($nonce)
lwu $tmp3,12($nonce)
slli $tmp1,$tmp1,32
slli $tmp3,$tmp3,32
or $tmp0,$tmp0,$tmp1
or $tmp2,$tmp2,$tmp3

add $in0,$in0,$tmp0 # accumulate nonce
add $in1,$in1,$tmp2
sltu $tmp0,$in0,$tmp0
add $in1,$in1,$tmp0

#ifdef __riscv_misaligned_fast
sd $in0,0($mac) # write mac value
sd $in1,8($mac)
#else
srli $tmp0,$in0,8 # write mac value
srli $tmp1,$in0,16
srli $tmp2,$in0,24
sb $in0,0($mac)
srli $tmp3,$in0,32
sb $tmp0,1($mac)
srli $tmp0,$in0,40
sb $tmp1,2($mac)
srli $tmp1,$in0,48
sb $tmp2,3($mac)
srli $tmp2,$in0,56
sb $tmp3,4($mac)
srli $tmp3,$in1,8
sb $tmp0,5($mac)
srli $tmp0,$in1,16
sb $tmp1,6($mac)
srli $tmp1,$in1,24
sb $tmp2,7($mac)

sb $in1,8($mac)
srli $tmp2,$in1,32
sb $tmp3,9($mac)
srli $tmp3,$in1,40
sb $tmp0,10($mac)
srli $tmp0,$in1,48
sb $tmp1,11($mac)
srli $tmp1,$in1,56
sb $tmp2,12($mac)
sb $tmp3,13($mac)
sb $tmp0,14($mac)
sb $tmp1,15($mac)
#endif

ret
.size poly1305_emit,.-poly1305_emit
.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}} else {{{
######################################################################
# 32-bit code path
#
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);

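# Context layout on the 32-bit path: five 32-bit accumulator limbs
# h0-h4 at offsets 0-16, r0-r3 at 20-32, and s1-s3 = r1-r3 + (r1-r3>>2)
# at offsets 36-44.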
$code.=<<___;
#if __riscv_xlen == 32
# if __SIZEOF_POINTER__ == 8
#  define PUSH csc
#  define POP clc
# else
#  define PUSH sw
#  define POP lw
# endif
# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
# define srliw srli
# define srlw srl
# define sllw sll
# define addw add
# define addiw addi
# define mulw mul
#elif __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH csc
#  define POP clc
# else
#  define PUSH sd
#  define POP ld
# endif
# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
#else
# error "unsupported __riscv_xlen"
#endif

.option pic
.text

.globl poly1305_init
.type poly1305_init,\@function
poly1305_init:
#ifdef __riscv_zicfilp
lpad 0
#endif
sw $zero,0($ctx)
sw $zero,4($ctx)
sw $zero,8($ctx)
sw $zero,12($ctx)
sw $zero,16($ctx)

beqz $inp,.Lno_key

#ifndef __riscv_misaligned_fast
andi $tmp0,$inp,3 # $inp % 4
sub $inp,$inp,$tmp0 # align $inp
sll $tmp0,$tmp0,3 # byte to bit offset
#endif
lw $in0,0($inp)
lw $in1,4($inp)
lw $in2,8($inp)
lw $in3,12($inp)
#ifndef __riscv_misaligned_fast
beqz $tmp0,.Laligned_key

lw $tmp2,16($inp)
sub $tmp1,$zero,$tmp0
srlw $in0,$in0,$tmp0
sllw $tmp3,$in1,$tmp1
srlw $in1,$in1,$tmp0
or $in0,$in0,$tmp3
sllw $tmp3,$in2,$tmp1
srlw $in2,$in2,$tmp0
or $in1,$in1,$tmp3
sllw $tmp3,$in3,$tmp1
srlw $in3,$in3,$tmp0
or $in2,$in2,$tmp3
sllw $tmp2,$tmp2,$tmp1
or $in3,$in3,$tmp2
.Laligned_key:
#endif

lui $tmp0,0x10000
addi $tmp0,$tmp0,-1 # 0x0fffffff
and $in0,$in0,$tmp0
addi $tmp0,$tmp0,-3 # 0x0ffffffc
and $in1,$in1,$tmp0
and $in2,$in2,$tmp0
and $in3,$in3,$tmp0

sw $in0,20($ctx)
sw $in1,24($ctx)
sw $in2,28($ctx)
sw $in3,32($ctx)

srlw $tmp1,$in1,2
srlw $tmp2,$in2,2
srlw $tmp3,$in3,2
addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
addw $in2,$in2,$tmp2
addw $in3,$in3,$tmp3
sw $in1,36($ctx)
sw $in2,40($ctx)
sw $in3,44($ctx)
.Lno_key:
li $a0,0
ret
.size poly1305_init,.-poly1305_init
___
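# MULX(hi,lo,a,b) above yields the full 64-bit product of two 32-bit
# operands: on rv32 as a mulhu/mul pair; with 64-bit xlen by
# zero-extending b, issuing a single mul, and splitting the result with
# addiw/srai. The operands used here are small enough that the product
# never sets bit 63, so the arithmetic split is safe.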
{
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $ra; # used on the misaligned-input path

$code.=<<___;
.globl poly1305_blocks
.type poly1305_blocks,\@function
poly1305_blocks:
#ifdef __riscv_zicfilp
lpad 0
#endif
andi $len,$len,-16 # complete blocks only
beqz $len,.Labort

#ifdef __riscv_zcmp
cm.push {ra,s0-s8}, -48
#else
caddi $sp,$sp,-__SIZEOF_POINTER__*12
PUSH $ra, __SIZEOF_POINTER__*11($sp)
PUSH $s0, __SIZEOF_POINTER__*10($sp)
PUSH $s1, __SIZEOF_POINTER__*9($sp)
PUSH $s2, __SIZEOF_POINTER__*8($sp)
PUSH $s3, __SIZEOF_POINTER__*7($sp)
PUSH $s4, __SIZEOF_POINTER__*6($sp)
PUSH $s5, __SIZEOF_POINTER__*5($sp)
PUSH $s6, __SIZEOF_POINTER__*4($sp)
PUSH $s7, __SIZEOF_POINTER__*3($sp)
PUSH $s8, __SIZEOF_POINTER__*2($sp)
#endif

#ifndef __riscv_misaligned_fast
andi $shr,$inp,3
andi $inp,$inp,-4 # align $inp
slli $shr,$shr,3 # byte to bit offset
#endif

lw $h0,0($ctx) # load hash value
lw $h1,4($ctx)
lw $h2,8($ctx)
lw $h3,12($ctx)
lw $h4,16($ctx)

lw $r0,20($ctx) # load key
lw $r1,24($ctx)
lw $r2,28($ctx)
lw $r3,32($ctx)
lw $rs1,36($ctx)
lw $rs2,40($ctx)
lw $rs3,44($ctx)

add $len,$len,$inp # end of buffer

.Loop:
lw $d0,0($inp) # load input
lw $d1,4($inp)
lw $d2,8($inp)
lw $d3,12($inp)
#ifndef __riscv_misaligned_fast
beqz $shr,.Laligned_inp

lw $t4,16($inp)
sub $t5,$zero,$shr
srlw $d0,$d0,$shr
sllw $t3,$d1,$t5
srlw $d1,$d1,$shr
or $d0,$d0,$t3
sllw $t3,$d2,$t5
srlw $d2,$d2,$shr
or $d1,$d1,$t3
sllw $t3,$d3,$t5
srlw $d3,$d3,$shr
or $d2,$d2,$t3
sllw $t4,$t4,$t5
or $d3,$d3,$t4

.Laligned_inp:
#endif
srliw $t3,$h4,2 # modulo-scheduled reduction
andi $t4,$h4,-4
andi $h4,$h4,3

addw $d0,$d0,$h0 # accumulate input
addw $t4,$t4,$t3
sltu $h0,$d0,$h0
addw $d0,$d0,$t4 # ... and residue
sltu $t4,$d0,$t4

addw $d1,$d1,$h1
addw $h0,$h0,$t4 # carry
sltu $h1,$d1,$h1
addw $d1,$d1,$h0
sltu $h0,$d1,$h0

addw $d2,$d2,$h2
addw $h1,$h1,$h0 # carry
sltu $h2,$d2,$h2
addw $d2,$d2,$h1
sltu $h1,$d2,$h1

addw $d3,$d3,$h3
addw $h2,$h2,$h1 # carry
sltu $h3,$d3,$h3
addw $d3,$d3,$h2

MULX ($h1,$h0,$r0,$d0) # d0*r0

sltu $h2,$d3,$h2
addw $h3,$h3,$h2 # carry

MULX ($t4,$t3,$rs3,$d1) # d1*s3

addw $h4,$h4,$padbit
caddi $inp,$inp,16
addw $h4,$h4,$h3

MULX ($t6,$a3,$rs2,$d2) # d2*s2
addw $h0,$h0,$t3
addw $h1,$h1,$t4
sltu $t3,$h0,$t3
addw $h1,$h1,$t3

MULX ($t4,$t3,$rs1,$d3) # d3*s1
addw $h0,$h0,$a3
addw $h1,$h1,$t6
sltu $a3,$h0,$a3
addw $h1,$h1,$a3


MULX ($h2,$a3,$r1,$d0) # d0*r1
addw $h0,$h0,$t3
addw $h1,$h1,$t4
sltu $t3,$h0,$t3
addw $h1,$h1,$t3

MULX ($t4,$t3,$r0,$d1) # d1*r0
addw $h1,$h1,$a3
sltu $a3,$h1,$a3
addw $h2,$h2,$a3

MULX ($t6,$a3,$rs3,$d2) # d2*s3
addw $h1,$h1,$t3
addw $h2,$h2,$t4
sltu $t3,$h1,$t3
addw $h2,$h2,$t3

MULX ($t4,$t3,$rs2,$d3) # d3*s2
addw $h1,$h1,$a3
addw $h2,$h2,$t6
sltu $a3,$h1,$a3
addw $h2,$h2,$a3

mulw $a3,$rs1,$h4 # h4*s1
addw $h1,$h1,$t3
addw $h2,$h2,$t4
sltu $t3,$h1,$t3
addw $h2,$h2,$t3


MULX ($h3,$t3,$r2,$d0) # d0*r2
addw $h1,$h1,$a3
sltu $a3,$h1,$a3
addw $h2,$h2,$a3

MULX ($t6,$a3,$r1,$d1) # d1*r1
addw $h2,$h2,$t3
sltu $t3,$h2,$t3
addw $h3,$h3,$t3

MULX ($t4,$t3,$r0,$d2) # d2*r0
addw $h2,$h2,$a3
addw $h3,$h3,$t6
sltu $a3,$h2,$a3
addw $h3,$h3,$a3

MULX ($t6,$a3,$rs3,$d3) # d3*s3
addw $h2,$h2,$t3
addw $h3,$h3,$t4
sltu $t3,$h2,$t3
addw $h3,$h3,$t3

mulw $t3,$rs2,$h4 # h4*s2
addw $h2,$h2,$a3
addw $h3,$h3,$t6
sltu $a3,$h2,$a3
addw $h3,$h3,$a3


MULX ($t6,$a3,$r3,$d0) # d0*r3
addw $h2,$h2,$t3
sltu $t3,$h2,$t3
addw $h3,$h3,$t3

MULX ($t4,$t3,$r2,$d1) # d1*r2
addw $h3,$h3,$a3
sltu $a3,$h3,$a3
addw $t6,$t6,$a3

MULX ($a3,$d3,$r0,$d3) # d3*r0
addw $h3,$h3,$t3
addw $t6,$t6,$t4
sltu $t3,$h3,$t3
addw $t6,$t6,$t3

MULX ($t4,$t3,$r1,$d2) # d2*r1
addw $h3,$h3,$d3
addw $t6,$t6,$a3
sltu $d3,$h3,$d3
addw $t6,$t6,$d3

mulw $a3,$rs3,$h4 # h4*s3
addw $h3,$h3,$t3
addw $t6,$t6,$t4
sltu $t3,$h3,$t3
addw $t6,$t6,$t3


mulw $h4,$r0,$h4 # h4*r0
addw $h3,$h3,$a3
sltu $a3,$h3,$a3
addw $t6,$t6,$a3
addw $h4,$t6,$h4

li $padbit,1 # if we loop, padbit is 1

bne $inp,$len,.Loop

sw $h0,0($ctx) # store hash value
sw $h1,4($ctx)
sw $h2,8($ctx)
sw $h3,12($ctx)
sw $h4,16($ctx)

#ifdef __riscv_zcmp
cm.popret {ra,s0-s8}, 48
#else
POP $ra, __SIZEOF_POINTER__*11($sp)
POP $s0, __SIZEOF_POINTER__*10($sp)
POP $s1, __SIZEOF_POINTER__*9($sp)
POP $s2, __SIZEOF_POINTER__*8($sp)
POP $s3, __SIZEOF_POINTER__*7($sp)
POP $s4, __SIZEOF_POINTER__*6($sp)
POP $s5, __SIZEOF_POINTER__*5($sp)
POP $s6, __SIZEOF_POINTER__*4($sp)
POP $s7, __SIZEOF_POINTER__*3($sp)
POP $s8, __SIZEOF_POINTER__*2($sp)
caddi $sp,$sp,__SIZEOF_POINTER__*12
#endif
.Labort:
ret
.size poly1305_blocks,.-poly1305_blocks
___
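# The block above multiplies the accumulated value by r in base 2^32,
# column by column: limb k collects d_i*r_j with i+j == k, and terms
# with i+j >= 4 wrap around through s_j = (5*r_j)/4, since
# 4*2^128 = 2^130 == 5 (mod 2^130-5). h4 stays a few bits wide thanks
# to the modulo-scheduled fold, so plain 32-bit mulw suffices for it.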
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

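# As in the 64-bit path, the final reduction below is branch-free; note
# that once the hash value is loaded, $ctx is reused as carry scratch,
# and $tmp4 occupies $padbit's register ($a3).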
$code.=<<___;
.globl poly1305_emit
.type poly1305_emit,\@function
poly1305_emit:
#ifdef __riscv_zicfilp
lpad 0
#endif
lw $tmp4,16($ctx)
lw $tmp0,0($ctx)
lw $tmp1,4($ctx)
lw $tmp2,8($ctx)
lw $tmp3,12($ctx)

srliw $ctx,$tmp4,2 # final reduction
andi $in0,$tmp4,-4
andi $tmp4,$tmp4,3
addw $ctx,$ctx,$in0

addw $tmp0,$tmp0,$ctx
sltu $ctx,$tmp0,$ctx
addiw $in0,$tmp0,5 # compare to modulus
addw $tmp1,$tmp1,$ctx
sltiu $in1,$in0,5
sltu $ctx,$tmp1,$ctx
addw $in1,$in1,$tmp1
addw $tmp2,$tmp2,$ctx
sltu $in2,$in1,$tmp1
sltu $ctx,$tmp2,$ctx
addw $in2,$in2,$tmp2
addw $tmp3,$tmp3,$ctx
sltu $in3,$in2,$tmp2
sltu $ctx,$tmp3,$ctx
addw $in3,$in3,$tmp3
addw $tmp4,$tmp4,$ctx
sltu $ctx,$in3,$tmp3
addw $ctx,$ctx,$tmp4

srl $ctx,$ctx,2 # see if it carried/borrowed
sub $ctx,$zero,$ctx

xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1
xor $in2,$in2,$tmp2
xor $in3,$in3,$tmp3
and $in0,$in0,$ctx
and $in1,$in1,$ctx
and $in2,$in2,$ctx
and $in3,$in3,$ctx
xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1
xor $in2,$in2,$tmp2
xor $in3,$in3,$tmp3

lw $tmp0,0($nonce) # load nonce
lw $tmp1,4($nonce)
lw $tmp2,8($nonce)
lw $tmp3,12($nonce)

addw $in0,$in0,$tmp0 # accumulate nonce
sltu $ctx,$in0,$tmp0

addw $in1,$in1,$tmp1
sltu $tmp1,$in1,$tmp1
addw $in1,$in1,$ctx
sltu $ctx,$in1,$ctx
addw $ctx,$ctx,$tmp1

addw $in2,$in2,$tmp2
sltu $tmp2,$in2,$tmp2
addw $in2,$in2,$ctx
sltu $ctx,$in2,$ctx
addw $ctx,$ctx,$tmp2

addw $in3,$in3,$tmp3
addw $in3,$in3,$ctx

#ifdef __riscv_misaligned_fast
sw $in0,0($mac) # write mac value
sw $in1,4($mac)
sw $in2,8($mac)
sw $in3,12($mac)
#else
srl $tmp0,$in0,8 # write mac value
srl $tmp1,$in0,16
srl $tmp2,$in0,24
sb $in0, 0($mac)
sb $tmp0,1($mac)
srl $tmp0,$in1,8
sb $tmp1,2($mac)
srl $tmp1,$in1,16
sb $tmp2,3($mac)
srl $tmp2,$in1,24
sb $in1, 4($mac)
sb $tmp0,5($mac)
srl $tmp0,$in2,8
sb $tmp1,6($mac)
srl $tmp1,$in2,16
sb $tmp2,7($mac)
srl $tmp2,$in2,24
sb $in2, 8($mac)
sb $tmp0,9($mac)
srl $tmp0,$in3,8
sb $tmp1,10($mac)
srl $tmp1,$in3,16
sb $tmp2,11($mac)
srl $tmp2,$in3,24
sb $in3, 12($mac)
sb $tmp0,13($mac)
sb $tmp1,14($mac)
sb $tmp2,15($mac)
#endif

ret
.size poly1305_emit,.-poly1305_emit
.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}}

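# Post-process the generated code: for "cheri" flavours, loads, stores,
# PUSH/POP and ret/jal are rewritten to their capability-mode forms
# (c-prefixed registers and mnemonics, caddi? -> cincoffset); for plain
# flavours the caddi?/cmove placeholders lower to add/mv.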
foreach (split("\n", $code)) {
    if ($flavour =~ /^cheri/) {
        s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
        s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
        s/\b(ret|jal)\b/c$1/;
        s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
        m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
    } else {
        s/\bcaddi?\b/add/ or
        s/\bcmove\b/mv/;
    }
    print $_, "\n";
}

close STDOUT;