($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
$flavour = shift || "64";
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
$output and open STDOUT,">$output";
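#
# Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm. Two code paths are
# emitted depending on $flavour: a 64-bit-limb version for rv64, and an
# xlen-agnostic version that keeps the hash in 32-bit limbs and runs on
# both rv32 and rv64. "cheri*" flavours are handled by the text
# substitution pass at the bottom of the script, which rewrites pointer
# arithmetic and memory accesses into their capability forms.
#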
$code.=<<___;
#ifdef __KERNEL__
# ifdef __riscv_zicfilp
# undef __riscv_zicfilp // calls are expected to be direct
# endif
#endif
#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
# define __riscv_misaligned_fast 1
#endif
___
if ($flavour =~ /64/) {{{
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
$code.=<<___;
#if __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
# define PUSH csc
# define POP clc
# else
# define PUSH sd
# define POP ld
# endif
#else
# error "unsupported __riscv_xlen"
#endif
.option pic
.text
.globl poly1305_init
.type poly1305_init,\@function
poly1305_init:
#ifdef __riscv_zicfilp
lpad 0
#endif
sd $zero,0($ctx)
sd $zero,8($ctx)
sd $zero,16($ctx)
beqz $inp,.Lno_key
#ifndef __riscv_misaligned_fast
andi $tmp0,$inp,7 # $inp % 8
andi $inp,$inp,-8 # align $inp
slli $tmp0,$tmp0,3 # byte to bit offset
#endif
ld $in0,0($inp)
ld $in1,8($inp)
#ifndef __riscv_misaligned_fast
beqz $tmp0,.Laligned_key
ld $tmp2,16($inp)
neg $tmp1,$tmp0 # implicit &63 in sll
srl $in0,$in0,$tmp0
sll $tmp3,$in1,$tmp1
srl $in1,$in1,$tmp0
sll $tmp2,$tmp2,$tmp1
or $in0,$in0,$tmp3
or $in1,$in1,$tmp2
.Laligned_key:
#endif
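	// Clamp the key as Poly1305 requires: the low 128 bits become
	// r &= 0x0ffffffc0ffffffc0ffffffc0fffffff. The mask constants are
	// synthesized with li/slli/addi to avoid a literal pool, and
	// s1 = r1 + (r1>>2) = 5*r1/4 is precomputed for the reduction
	// modulo 2^130-5.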
li $tmp0,1
slli $tmp0,$tmp0,32 # 0x0000000100000000
addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff
and $in0,$in0,$tmp0
addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
and $in1,$in1,$tmp0
sd $in0,24($ctx)
srli $tmp0,$in1,2
sd $in1,32($ctx)
add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
sd $tmp0,40($ctx)
.Lno_key:
li $a0,0 # return 0
ret
.size poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
my ($shr,$shl) = ($t5,$t6);
$code.=<<___;
.globl poly1305_blocks
.type poly1305_blocks,\@function
poly1305_blocks:
#ifdef __riscv_zicfilp
lpad 0
#endif
andi $len,$len,-16 # complete blocks only
beqz $len,.Lno_data
caddi $sp,$sp,-4*__SIZEOF_POINTER__
PUSH $s0,3*__SIZEOF_POINTER__($sp)
PUSH $s1,2*__SIZEOF_POINTER__($sp)
PUSH $s2,1*__SIZEOF_POINTER__($sp)
PUSH $s3,0*__SIZEOF_POINTER__($sp)
#ifndef __riscv_misaligned_fast
andi $shr,$inp,7
andi $inp,$inp,-8 # align $inp
slli $shr,$shr,3 # byte to bit offset
neg $shl,$shr # implicit &63 in sll
#endif
ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)
ld $r0,24($ctx) # load key
ld $r1,32($ctx)
ld $rs1,40($ctx)
add $len,$len,$inp # end of buffer
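	// Each iteration first folds the previous overflow above 2^130
	// back into the low limb (the "modulo-scheduled reduction":
	// h0 += 5*(h2>>2), h2 &= 3), then accumulates the next 16-byte
	// block plus the padding bit, and finally multiplies the 130-bit
	// h by r modulo 2^130-5, using s1 = r1 + (r1>>2) = 5*r1/4 for the
	// partial products that wrap past 2^128.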
.Loop:
ld $in0,0($inp) # load input
ld $in1,8($inp)
#ifndef __riscv_misaligned_fast
beqz $shr,.Laligned_inp
ld $tmp2,16($inp)
srl $in0,$in0,$shr
sll $tmp3,$in1,$shl
srl $in1,$in1,$shr
sll $tmp2,$tmp2,$shl
or $in0,$in0,$tmp3
or $in1,$in1,$tmp2
.Laligned_inp:
#endif
caddi $inp,$inp,16
andi $tmp0,$h2,-4 # modulo-scheduled reduction
srli $tmp1,$h2,2
andi $h2,$h2,3
add $d0,$h0,$in0 # accumulate input
add $tmp1,$tmp1,$tmp0
sltu $tmp0,$d0,$h0
add $d0,$d0,$tmp1 # ... and residue
sltu $tmp1,$d0,$tmp1
add $d1,$h1,$in1
add $tmp0,$tmp0,$tmp1
sltu $tmp1,$d1,$h1
add $d1,$d1,$tmp0
add $d2,$h2,$padbit
sltu $tmp0,$d1,$tmp0
mulhu $h1,$r0,$d0 # h0*r0
mul $h0,$r0,$d0
add $d2,$d2,$tmp1
add $d2,$d2,$tmp0
mulhu $tmp1,$rs1,$d1 # h1*5*r1
mul $tmp0,$rs1,$d1
mulhu $h2,$r1,$d0 # h0*r1
mul $tmp2,$r1,$d0
add $h0,$h0,$tmp0
add $h1,$h1,$tmp1
sltu $tmp0,$h0,$tmp0
add $h1,$h1,$tmp0
add $h1,$h1,$tmp2
mulhu $tmp1,$r0,$d1 # h1*r0
mul $tmp0,$r0,$d1
sltu $tmp2,$h1,$tmp2
add $h2,$h2,$tmp2
mul $tmp2,$rs1,$d2 # h2*5*r1
add $h1,$h1,$tmp0
add $h2,$h2,$tmp1
mul $tmp3,$r0,$d2 # h2*r0
sltu $tmp0,$h1,$tmp0
add $h2,$h2,$tmp0
add $h1,$h1,$tmp2
sltu $tmp2,$h1,$tmp2
add $h2,$h2,$tmp2
add $h2,$h2,$tmp3
bne $inp,$len,.Loop
sd $h0,0($ctx) # store hash value
sd $h1,8($ctx)
sd $h2,16($ctx)
POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
POP $s1,2*__SIZEOF_POINTER__($sp)
POP $s2,1*__SIZEOF_POINTER__($sp)
POP $s3,0*__SIZEOF_POINTER__($sp)
caddi $sp,$sp,4*__SIZEOF_POINTER__
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
$code.=<<___;
.globl poly1305_emit
.type poly1305_emit,\@function
poly1305_emit:
#ifdef __riscv_zicfilp
lpad 0
#endif
ld $tmp2,16($ctx)
ld $tmp0,0($ctx)
ld $tmp1,8($ctx)
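	// Final reduction: fold the bits of h above 2^130 back in as *5,
	// compute h+5 and test bit 130 of the result, i.e. whether
	// h >= 2^130-5. The resulting 0/-1 mask selects between h and
	// h-(2^130-5) in constant time.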
andi $in0,$tmp2,-4 # final reduction
srl $in1,$tmp2,2
andi $tmp2,$tmp2,3
add $in0,$in0,$in1
add $tmp0,$tmp0,$in0
sltu $in1,$tmp0,$in0
addi $in0,$tmp0,5 # compare to modulus
add $tmp1,$tmp1,$in1
sltiu $tmp3,$in0,5
sltu $tmp4,$tmp1,$in1
add $in1,$tmp1,$tmp3
add $tmp2,$tmp2,$tmp4
sltu $tmp3,$in1,$tmp3
add $tmp2,$tmp2,$tmp3
srli $tmp2,$tmp2,2 # see if it carried/borrowed
neg $tmp2,$tmp2
xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1
and $in0,$in0,$tmp2
and $in1,$in1,$tmp2
xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1
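	// Add the 128-bit nonce (the s half of the one-time key) and
	// write the 16-byte tag in little-endian order; byte stores are
	// used when misaligned accesses are not known to be fast.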
lwu $tmp0,0($nonce) # load nonce
lwu $tmp1,4($nonce)
lwu $tmp2,8($nonce)
lwu $tmp3,12($nonce)
slli $tmp1,$tmp1,32
slli $tmp3,$tmp3,32
or $tmp0,$tmp0,$tmp1
or $tmp2,$tmp2,$tmp3
add $in0,$in0,$tmp0 # accumulate nonce
add $in1,$in1,$tmp2
sltu $tmp0,$in0,$tmp0
add $in1,$in1,$tmp0
#ifdef __riscv_misaligned_fast
sd $in0,0($mac) # write mac value
sd $in1,8($mac)
#else
srli $tmp0,$in0,8 # write mac value
srli $tmp1,$in0,16
srli $tmp2,$in0,24
sb $in0,0($mac)
srli $tmp3,$in0,32
sb $tmp0,1($mac)
srli $tmp0,$in0,40
sb $tmp1,2($mac)
srli $tmp1,$in0,48
sb $tmp2,3($mac)
srli $tmp2,$in0,56
sb $tmp3,4($mac)
srli $tmp3,$in1,8
sb $tmp0,5($mac)
srli $tmp0,$in1,16
sb $tmp1,6($mac)
srli $tmp1,$in1,24
sb $tmp2,7($mac)
sb $in1,8($mac)
srli $tmp2,$in1,32
sb $tmp3,9($mac)
srli $tmp3,$in1,40
sb $tmp0,10($mac)
srli $tmp0,$in1,48
sb $tmp1,11($mac)
srli $tmp1,$in1,56
sb $tmp2,12($mac)
sb $tmp3,13($mac)
sb $tmp0,14($mac)
sb $tmp1,15($mac)
#endif
ret
.size poly1305_emit,.-poly1305_emit
.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}} else {{{
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
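#
# The xlen-agnostic code path below keeps the hash in 32-bit limbs. On
# rv32 the *w mnemonics and MULX are mapped to the base instructions;
# on rv64 MULX synthesizes the full 32x32->64-bit product.
#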
$code.=<<___;
#if __riscv_xlen == 32
# if __SIZEOF_POINTER__ == 8
# define PUSH csc
# define POP clc
# else
# define PUSH sw
# define POP lw
# endif
# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
# define srliw srli
# define srlw srl
# define sllw sll
# define addw add
# define addiw addi
# define mulw mul
#elif __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
# define PUSH csc
# define POP clc
# else
# define PUSH sd
# define POP ld
# endif
# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
#else
# error "unsupported __riscv_xlen"
#endif
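	// MULX(hi,lo,a,b) splits the 64-bit product of the 32-bit values
	// a and b into hi:lo. The rv64 variant zero-extends b and relies
	// on a being a clamped (hence small and non-negative) key limb,
	// so a single mul suffices; addiw/srai then extract the
	// sign-extended low and high words.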
.option pic
.text
.globl poly1305_init
.type poly1305_init,\@function
poly1305_init:
#ifdef __riscv_zicfilp
lpad 0
#endif
sw $zero,0($ctx)
sw $zero,4($ctx)
sw $zero,8($ctx)
sw $zero,12($ctx)
sw $zero,16($ctx)
beqz $inp,.Lno_key
#ifndef __riscv_misaligned_fast
andi $tmp0,$inp,3 # $inp % 4
sub $inp,$inp,$tmp0 # align $inp
sll $tmp0,$tmp0,3 # byte to bit offset
#endif
lw $in0,0($inp)
lw $in1,4($inp)
lw $in2,8($inp)
lw $in3,12($inp)
#ifndef __riscv_misaligned_fast
beqz $tmp0,.Laligned_key
lw $tmp2,16($inp)
sub $tmp1,$zero,$tmp0
srlw $in0,$in0,$tmp0
sllw $tmp3,$in1,$tmp1
srlw $in1,$in1,$tmp0
or $in0,$in0,$tmp3
sllw $tmp3,$in2,$tmp1
srlw $in2,$in2,$tmp0
or $in1,$in1,$tmp3
sllw $tmp3,$in3,$tmp1
srlw $in3,$in3,$tmp0
or $in2,$in2,$tmp3
sllw $tmp2,$tmp2,$tmp1
or $in3,$in3,$tmp2
.Laligned_key:
#endif
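	// Clamp the key limb-wise: r0 &= 0x0fffffff, r1..r3 &= 0x0ffffffc
	// (together r &= 0x0ffffffc0ffffffc0ffffffc0fffffff), and
	// precompute s_i = r_i + (r_i>>2) = 5*r_i/4 for the reduction
	// modulo 2^130-5.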
lui $tmp0,0x10000
addi $tmp0,$tmp0,-1 # 0x0fffffff
and $in0,$in0,$tmp0
addi $tmp0,$tmp0,-3 # 0x0ffffffc
and $in1,$in1,$tmp0
and $in2,$in2,$tmp0
and $in3,$in3,$tmp0
sw $in0,20($ctx)
sw $in1,24($ctx)
sw $in2,28($ctx)
sw $in3,32($ctx)
srlw $tmp1,$in1,2
srlw $tmp2,$in2,2
srlw $tmp3,$in3,2
addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
addw $in2,$in2,$tmp2
addw $in3,$in3,$tmp3
sw $in1,36($ctx)
sw $in2,40($ctx)
sw $in3,44($ctx)
.Lno_key:
li $a0,0
ret
.size poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
my ($d0,$d1,$d2,$d3) =
($a4,$a5,$a6,$a7);
my $shr = $ra;
$code.=<<___;
.globl poly1305_blocks
.type poly1305_blocks,\@function
poly1305_blocks:
#ifdef __riscv_zicfilp
lpad 0
#endif
andi $len,$len,-16 # complete blocks only
beqz $len,.Labort
#ifdef __riscv_zcmp
cm.push {ra,s0-s8}, -48
#else
caddi $sp,$sp,-__SIZEOF_POINTER__*12
PUSH $ra, __SIZEOF_POINTER__*11($sp)
PUSH $s0, __SIZEOF_POINTER__*10($sp)
PUSH $s1, __SIZEOF_POINTER__*9($sp)
PUSH $s2, __SIZEOF_POINTER__*8($sp)
PUSH $s3, __SIZEOF_POINTER__*7($sp)
PUSH $s4, __SIZEOF_POINTER__*6($sp)
PUSH $s5, __SIZEOF_POINTER__*5($sp)
PUSH $s6, __SIZEOF_POINTER__*4($sp)
PUSH $s7, __SIZEOF_POINTER__*3($sp)
PUSH $s8, __SIZEOF_POINTER__*2($sp)
#endif
#ifndef __riscv_misaligned_fast
andi $shr,$inp,3
andi $inp,$inp,-4 # align $inp
slli $shr,$shr,3 # byte to bit offset
#endif
lw $h0,0($ctx) # load hash value
lw $h1,4($ctx)
lw $h2,8($ctx)
lw $h3,12($ctx)
lw $h4,16($ctx)
lw $r0,20($ctx) # load key
lw $r1,24($ctx)
lw $r2,28($ctx)
lw $r3,32($ctx)
lw $rs1,36($ctx)
lw $rs2,40($ctx)
lw $rs3,44($ctx)
add $len,$len,$inp # end of buffer
.Loop:
lw $d0,0($inp) # load input
lw $d1,4($inp)
lw $d2,8($inp)
lw $d3,12($inp)
#ifndef __riscv_misaligned_fast
beqz $shr,.Laligned_inp
lw $t4,16($inp)
sub $t5,$zero,$shr
srlw $d0,$d0,$shr
sllw $t3,$d1,$t5
srlw $d1,$d1,$shr
or $d0,$d0,$t3
sllw $t3,$d2,$t5
srlw $d2,$d2,$shr
or $d1,$d1,$t3
sllw $t3,$d3,$t5
srlw $d3,$d3,$shr
or $d2,$d2,$t3
sllw $t4,$t4,$t5
or $d3,$d3,$t4
.Laligned_inp:
#endif
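	// Modulo-scheduled reduction of the previous iteration's overflow
	// (h0 += 5*(h4>>2), h4 &= 3), followed by accumulation of the
	// 16-byte block with carries propagated across the four 32-bit
	// limbs. In the 4x4 schoolbook multiply by r that follows, the
	// precomputed s_j = r_j + (r_j>>2) = 5*r_j/4 absorb the factor of
	// 5 for partial products folded back from above 2^128.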
srliw $t3,$h4,2 # modulo-scheduled reduction
andi $t4,$h4,-4
andi $h4,$h4,3
addw $d0,$d0,$h0 # accumulate input
addw $t4,$t4,$t3
sltu $h0,$d0,$h0
addw $d0,$d0,$t4 # ... and residue
sltu $t4,$d0,$t4
addw $d1,$d1,$h1
addw $h0,$h0,$t4 # carry
sltu $h1,$d1,$h1
addw $d1,$d1,$h0
sltu $h0,$d1,$h0
addw $d2,$d2,$h2
addw $h1,$h1,$h0 # carry
sltu $h2,$d2,$h2
addw $d2,$d2,$h1
sltu $h1,$d2,$h1
addw $d3,$d3,$h3
addw $h2,$h2,$h1 # carry
sltu $h3,$d3,$h3
addw $d3,$d3,$h2
MULX ($h1,$h0,$r0,$d0) # d0*r0
sltu $h2,$d3,$h2
addw $h3,$h3,$h2 # carry
MULX ($t4,$t3,$rs3,$d1) # d1*s3
addw $h4,$h4,$padbit
caddi $inp,$inp,16
addw $h4,$h4,$h3
MULX ($t6,$a3,$rs2,$d2) # d2*s2
addw $h0,$h0,$t3
addw $h1,$h1,$t4
sltu $t3,$h0,$t3
addw $h1,$h1,$t3
MULX ($t4,$t3,$rs1,$d3) # d3*s1
addw $h0,$h0,$a3
addw $h1,$h1,$t6
sltu $a3,$h0,$a3
addw $h1,$h1,$a3
MULX ($h2,$a3,$r1,$d0) # d0*r1
addw $h0,$h0,$t3
addw $h1,$h1,$t4
sltu $t3,$h0,$t3
addw $h1,$h1,$t3
MULX ($t4,$t3,$r0,$d1) # d1*r0
addw $h1,$h1,$a3
sltu $a3,$h1,$a3
addw $h2,$h2,$a3
MULX ($t6,$a3,$rs3,$d2) # d2*s3
addw $h1,$h1,$t3
addw $h2,$h2,$t4
sltu $t3,$h1,$t3
addw $h2,$h2,$t3
MULX ($t4,$t3,$rs2,$d3) # d3*s2
addw $h1,$h1,$a3
addw $h2,$h2,$t6
sltu $a3,$h1,$a3
addw $h2,$h2,$a3
mulw $a3,$rs1,$h4 # h4*s1
addw $h1,$h1,$t3
addw $h2,$h2,$t4
sltu $t3,$h1,$t3
addw $h2,$h2,$t3
MULX ($h3,$t3,$r2,$d0) # d0*r2
addw $h1,$h1,$a3
sltu $a3,$h1,$a3
addw $h2,$h2,$a3
MULX ($t6,$a3,$r1,$d1) # d1*r1
addw $h2,$h2,$t3
sltu $t3,$h2,$t3
addw $h3,$h3,$t3
MULX ($t4,$t3,$r0,$d2) # d2*r0
addw $h2,$h2,$a3
addw $h3,$h3,$t6
sltu $a3,$h2,$a3
addw $h3,$h3,$a3
MULX ($t6,$a3,$rs3,$d3) # d3*s3
addw $h2,$h2,$t3
addw $h3,$h3,$t4
sltu $t3,$h2,$t3
addw $h3,$h3,$t3
mulw $t3,$rs2,$h4 # h4*s2
addw $h2,$h2,$a3
addw $h3,$h3,$t6
sltu $a3,$h2,$a3
addw $h3,$h3,$a3
MULX ($t6,$a3,$r3,$d0) # d0*r3
addw $h2,$h2,$t3
sltu $t3,$h2,$t3
addw $h3,$h3,$t3
MULX ($t4,$t3,$r2,$d1) # d1*r2
addw $h3,$h3,$a3
sltu $a3,$h3,$a3
addw $t6,$t6,$a3
MULX ($a3,$d3,$r0,$d3) # d3*r0
addw $h3,$h3,$t3
addw $t6,$t6,$t4
sltu $t3,$h3,$t3
addw $t6,$t6,$t3
MULX ($t4,$t3,$r1,$d2) # d2*r1
addw $h3,$h3,$d3
addw $t6,$t6,$a3
sltu $d3,$h3,$d3
addw $t6,$t6,$d3
mulw $a3,$rs3,$h4 # h4*s3
addw $h3,$h3,$t3
addw $t6,$t6,$t4
sltu $t3,$h3,$t3
addw $t6,$t6,$t3
mulw $h4,$r0,$h4 # h4*r0
addw $h3,$h3,$a3
sltu $a3,$h3,$a3
addw $t6,$t6,$a3
addw $h4,$t6,$h4
li $padbit,1 # if we loop, padbit is 1
bne $inp,$len,.Loop
sw $h0,0($ctx) # store hash value
sw $h1,4($ctx)
sw $h2,8($ctx)
sw $h3,12($ctx)
sw $h4,16($ctx)
#ifdef __riscv_zcmp
cm.popret {ra,s0-s8}, 48
#else
POP $ra, __SIZEOF_POINTER__*11($sp)
POP $s0, __SIZEOF_POINTER__*10($sp)
POP $s1, __SIZEOF_POINTER__*9($sp)
POP $s2, __SIZEOF_POINTER__*8($sp)
POP $s3, __SIZEOF_POINTER__*7($sp)
POP $s4, __SIZEOF_POINTER__*6($sp)
POP $s5, __SIZEOF_POINTER__*5($sp)
POP $s6, __SIZEOF_POINTER__*4($sp)
POP $s7, __SIZEOF_POINTER__*3($sp)
POP $s8, __SIZEOF_POINTER__*2($sp)
caddi $sp,$sp,__SIZEOF_POINTER__*12
#endif
.Labort:
ret
.size poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
$code.=<<___;
.globl poly1305_emit
.type poly1305_emit,\@function
poly1305_emit:
#ifdef __riscv_zicfilp
lpad 0
#endif
lw $tmp4,16($ctx)
lw $tmp0,0($ctx)
lw $tmp1,4($ctx)
lw $tmp2,8($ctx)
lw $tmp3,12($ctx)
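	// Same constant-time final reduction as in the 64-bit path,
	// carried across four 32-bit limbs: fold h4's excess, add 5, and
	// use the resulting bit 130 to build a mask that selects either h
	// or h-(2^130-5).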
srliw $ctx,$tmp4,2 # final reduction
andi $in0,$tmp4,-4
andi $tmp4,$tmp4,3
addw $ctx,$ctx,$in0
addw $tmp0,$tmp0,$ctx
sltu $ctx,$tmp0,$ctx
addiw $in0,$tmp0,5 # compare to modulus
addw $tmp1,$tmp1,$ctx
sltiu $in1,$in0,5
sltu $ctx,$tmp1,$ctx
addw $in1,$in1,$tmp1
addw $tmp2,$tmp2,$ctx
sltu $in2,$in1,$tmp1
sltu $ctx,$tmp2,$ctx
addw $in2,$in2,$tmp2
addw $tmp3,$tmp3,$ctx
sltu $in3,$in2,$tmp2
sltu $ctx,$tmp3,$ctx
addw $in3,$in3,$tmp3
addw $tmp4,$tmp4,$ctx
sltu $ctx,$in3,$tmp3
addw $ctx,$ctx,$tmp4
srl $ctx,$ctx,2 # see if it carried/borrowed
sub $ctx,$zero,$ctx
xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1
xor $in2,$in2,$tmp2
xor $in3,$in3,$tmp3
and $in0,$in0,$ctx
and $in1,$in1,$ctx
and $in2,$in2,$ctx
and $in3,$in3,$ctx
xor $in0,$in0,$tmp0
xor $in1,$in1,$tmp1
xor $in2,$in2,$tmp2
xor $in3,$in3,$tmp3
lw $tmp0,0($nonce) # load nonce
lw $tmp1,4($nonce)
lw $tmp2,8($nonce)
lw $tmp3,12($nonce)
addw $in0,$in0,$tmp0 # accumulate nonce
sltu $ctx,$in0,$tmp0
addw $in1,$in1,$tmp1
sltu $tmp1,$in1,$tmp1
addw $in1,$in1,$ctx
sltu $ctx,$in1,$ctx
addw $ctx,$ctx,$tmp1
addw $in2,$in2,$tmp2
sltu $tmp2,$in2,$tmp2
addw $in2,$in2,$ctx
sltu $ctx,$in2,$ctx
addw $ctx,$ctx,$tmp2
addw $in3,$in3,$tmp3
addw $in3,$in3,$ctx
#ifdef __riscv_misaligned_fast
sw $in0,0($mac) # write mac value
sw $in1,4($mac)
sw $in2,8($mac)
sw $in3,12($mac)
#else
srl $tmp0,$in0,8 # write mac value
srl $tmp1,$in0,16
srl $tmp2,$in0,24
sb $in0, 0($mac)
sb $tmp0,1($mac)
srl $tmp0,$in1,8
sb $tmp1,2($mac)
srl $tmp1,$in1,16
sb $tmp2,3($mac)
srl $tmp2,$in1,24
sb $in1, 4($mac)
sb $tmp0,5($mac)
srl $tmp0,$in2,8
sb $tmp1,6($mac)
srl $tmp1,$in2,16
sb $tmp2,7($mac)
srl $tmp2,$in2,24
sb $in2, 8($mac)
sb $tmp0,9($mac)
srl $tmp0,$in3,8
sb $tmp1,10($mac)
srl $tmp1,$in3,16
sb $tmp2,11($mac)
srl $tmp2,$in3,24
sb $in3, 12($mac)
sb $tmp0,13($mac)
sb $tmp1,14($mac)
sb $tmp2,15($mac)
#endif
ret
.size poly1305_emit,.-poly1305_emit
.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}}
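#
# Post-process the generated code. For "cheri*" flavours, memory
# operands and pointer arithmetic are rewritten to their capability
# forms: x registers become c registers in address operands, loads and
# stores gain a "c" prefix, caddi/cadd become cincoffset, and ret/jal
# get their capability spellings. For other flavours the
# capability-neutral caddi/cmove spellings are lowered to plain add/mv.
#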
foreach (split("\n", $code)) {
if ($flavour =~ /^cheri/) {
s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
s/\b(ret|jal)\b/c$1/;
s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
} else {
s/\bcaddi?\b/add/ or
s/\bcmove\b/mv/;
}
print $_, "\n";
}
close STDOUT or die "error closing STDOUT: $!";