#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc
# R1x000 ~5.5/+130% (big-endian)
# Octeon II 2.50/+70% (little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling the reduction makes it possible to omit the
# dependency chain at the end of the inner loop and improves
# performance. Also optimize the MIPS32R2 code path for the MIPS 1004K
# core. Per René von Dorst's suggestions.
#
# IALU/gcc
# R1x000 ~9.8/? (big-endian)
# Octeon II 3.65/+140% (little-endian)
# MT7621/1004K 4.75/? (little-endian)
#
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference, here is the register layout for the N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <[email protected]>
#
######################################################################
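
# Documentation-only sketch (added for exposition; not part of the
# original CRYPTOGAMS code and never called by this script): a
# big-integer model of the per-block update the assembly below
# implements, h = ((h + m + padbit*2^128) * r) mod (2^130 - 5).
# The limb splitting, clamping and carry handling in the generated
# code are just 64-bit (or 32-bit) evaluations of this one step.
sub __poly1305_ref_step_demo {
	my ($h, $r, $m, $padbit) = @_;	# Math::BigInt values
	require Math::BigInt;
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);	# 2^130 - 5
	my $pad = Math::BigInt->new($padbit)->blsft(128);	# pad bit above the block
	return (($h + $m + $pad) * $r) % $p;
}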

$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
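#
# Argument handling note (added for exposition): the first command-line
# argument selects the flavour above, and the last argument, popped at
# the very end of this script, names the file the generated assembly is
# written to. A purely illustrative invocation might look like
# "perl poly1305-mips.pl 64 poly1305-core.S"; the output file name here
# is an assumption, not something this script prescribes.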

$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

$code.=<<___;
79
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
80
defined(_MIPS_ARCH_MIPS64R6)) \\
81
&& !defined(_MIPS_ARCH_MIPS64R2)
82
# define _MIPS_ARCH_MIPS64R2
83
#endif
84
85
#if defined(_MIPS_ARCH_MIPS64R6)
86
# define dmultu(rs,rt)
87
# define mflo(rd,rs,rt) dmulu rd,rs,rt
88
# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
89
#else
90
# define dmultu(rs,rt) dmultu rs,rt
91
# define mflo(rd,rs,rt) mflo rd
92
# define mfhi(rd,rs,rt) mfhi rd
93
#endif
94
95
#ifdef __KERNEL__
96
# define poly1305_init poly1305_block_init
97
#endif
98
99
#if defined(__MIPSEB__) && !defined(MIPSEB)
100
# define MIPSEB
101
#endif
102
103
#ifdef MIPSEB
104
# define MSB 0
105
# define LSB 7
106
#else
107
# define MSB 7
108
# define LSB 0
109
#endif
110
111
.text
112
.set noat
113
.set noreorder
114
115
.align 5
116
.globl poly1305_init
117
.ent poly1305_init
118
poly1305_init:
119
.frame $sp,0,$ra
120
.set reorder
121
122
sd $zero,0($ctx)
123
sd $zero,8($ctx)
124
sd $zero,16($ctx)
125
126
beqz $inp,.Lno_key
127
128
#if defined(_MIPS_ARCH_MIPS64R6)
129
andi $tmp0,$inp,7 # $inp % 8
130
dsubu $inp,$inp,$tmp0 # align $inp
131
sll $tmp0,$tmp0,3 # byte to bit offset
132
ld $in0,0($inp)
133
ld $in1,8($inp)
134
beqz $tmp0,.Laligned_key
135
ld $tmp2,16($inp)
136
137
subu $tmp1,$zero,$tmp0
138
# ifdef MIPSEB
139
dsllv $in0,$in0,$tmp0
140
dsrlv $tmp3,$in1,$tmp1
141
dsllv $in1,$in1,$tmp0
142
dsrlv $tmp2,$tmp2,$tmp1
143
# else
144
dsrlv $in0,$in0,$tmp0
145
dsllv $tmp3,$in1,$tmp1
146
dsrlv $in1,$in1,$tmp0
147
dsllv $tmp2,$tmp2,$tmp1
148
# endif
149
or $in0,$in0,$tmp3
150
or $in1,$in1,$tmp2
151
.Laligned_key:
152
#else
153
ldl $in0,0+MSB($inp)
154
ldl $in1,8+MSB($inp)
155
ldr $in0,0+LSB($inp)
156
ldr $in1,8+LSB($inp)
157
#endif
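# Note (added for exposition): both paths above implement the same
# unaligned 16-byte key load. MIPS64R6 dropped the ldl/ldr pair, so the
# R6 path rounds the pointer down, loads aligned doublewords and
# recombines them with variable shifts; pre-R6 cores simply use
# ldl/ldr, which handle misalignment in hardware.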
158
#ifdef MIPSEB
159
# if defined(_MIPS_ARCH_MIPS64R2)
160
dsbh $in0,$in0 # byte swap
161
dsbh $in1,$in1
162
dshd $in0,$in0
163
dshd $in1,$in1
164
# else
165
ori $tmp0,$zero,0xFF
166
dsll $tmp2,$tmp0,32
167
or $tmp0,$tmp2 # 0x000000FF000000FF
168
169
and $tmp1,$in0,$tmp0 # byte swap
170
and $tmp3,$in1,$tmp0
171
dsrl $tmp2,$in0,24
172
dsrl $tmp4,$in1,24
173
dsll $tmp1,24
174
dsll $tmp3,24
175
and $tmp2,$tmp0
176
and $tmp4,$tmp0
177
dsll $tmp0,8 # 0x0000FF000000FF00
178
or $tmp1,$tmp2
179
or $tmp3,$tmp4
180
and $tmp2,$in0,$tmp0
181
and $tmp4,$in1,$tmp0
182
dsrl $in0,8
183
dsrl $in1,8
184
dsll $tmp2,8
185
dsll $tmp4,8
186
and $in0,$tmp0
187
and $in1,$tmp0
188
or $tmp1,$tmp2
189
or $tmp3,$tmp4
190
or $in0,$tmp1
191
or $in1,$tmp3
192
dsrl $tmp1,$in0,32
193
dsrl $tmp3,$in1,32
194
dsll $in0,32
195
dsll $in1,32
196
or $in0,$tmp1
197
or $in1,$tmp3
198
# endif
199
#endif
200
li $tmp0,1
201
dsll $tmp0,32 # 0x0000000100000000
202
daddiu $tmp0,-63 # 0x00000000ffffffc1
203
dsll $tmp0,28 # 0x0ffffffc10000000
204
daddiu $tmp0,-1 # 0x0ffffffc0fffffff
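# Note (added for exposition): the four instructions above build the
# clamp mask 0x0ffffffc0fffffff without a load. Poly1305 requires the
# key half r to be "clamped" - the top four bits of every 32-bit word
# and the low two bits of the three upper words must be zero - so the
# low doubleword r0 is masked with 0x0ffffffc0fffffff and, after the
# daddiu of -3 below, the high doubleword r1 with 0x0ffffffc0ffffffc.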
205
206
and $in0,$tmp0
207
daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
208
and $in1,$tmp0
209
210
sd $in0,24($ctx)
211
dsrl $tmp0,$in1,2
212
sd $in1,32($ctx)
213
daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
214
sd $tmp0,40($ctx)
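# Note (added for exposition): s1 = r1 + (r1 >> 2) equals 5*r1/4 and is
# exact because clamping cleared the two low bits of r1. Since
# 2^130 = 5 (mod 2^130-5), a partial product that would land at bit 128
# can instead be taken at bit 0 after multiplying by 5/4, i.e. by using
# s1 in place of r1; poly1305_blocks below relies on this to keep every
# product within two 64-bit limbs.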
215
216
.Lno_key:
217
li $v0,0 # return 0
218
jr $ra
219
.end poly1305_init
220
___
221
{
222
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
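# Note: the .mask value advertises which callee-saved registers the
# function stores on the stack (one bit per register number).
# 0x00030000 covers $16-$17 ($s4/$s5 in the layout above), which every
# flavour saves; the NUBI value 0x0003f000 additionally covers $12-$15
# ($s0-$s3); the R6 path below ORs in 0x000c0000 for $18-$19 ($s6/$s7).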
223
224
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
225
($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
226
my ($shr,$shl) = ($s6,$s7); # used on R6
227
228
$code.=<<___;
229
.align 5
230
.globl poly1305_blocks
231
.ent poly1305_blocks
232
poly1305_blocks:
233
.set noreorder
234
dsrl $len,4 # number of complete blocks
235
bnez $len,poly1305_blocks_internal
236
nop
237
jr $ra
238
nop
239
.end poly1305_blocks
240
241
.align 5
242
.ent poly1305_blocks_internal
243
poly1305_blocks_internal:
244
.set noreorder
245
#if defined(_MIPS_ARCH_MIPS64R6)
246
.frame $sp,8*8,$ra
247
.mask $SAVED_REGS_MASK|0x000c0000,-8
248
dsubu $sp,8*8
249
sd $s7,56($sp)
250
sd $s6,48($sp)
251
#else
252
.frame $sp,6*8,$ra
253
.mask $SAVED_REGS_MASK,-8
254
dsubu $sp,6*8
255
#endif
256
sd $s5,40($sp)
257
sd $s4,32($sp)
258
___
259
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
260
sd $s3,24($sp)
261
sd $s2,16($sp)
262
sd $s1,8($sp)
263
sd $s0,0($sp)
264
___
265
$code.=<<___;
266
.set reorder
267
268
#if defined(_MIPS_ARCH_MIPS64R6)
269
andi $shr,$inp,7
270
dsubu $inp,$inp,$shr # align $inp
271
sll $shr,$shr,3 # byte to bit offset
272
subu $shl,$zero,$shr
273
#endif
274
275
ld $h0,0($ctx) # load hash value
276
ld $h1,8($ctx)
277
ld $h2,16($ctx)
278
279
ld $r0,24($ctx) # load key
280
ld $r1,32($ctx)
281
ld $rs1,40($ctx)
282
283
dsll $len,4
284
daddu $len,$inp # end of buffer
285
b .Loop
286
287
.align 4
288
.Loop:
289
#if defined(_MIPS_ARCH_MIPS64R6)
290
ld $in0,0($inp) # load input
291
ld $in1,8($inp)
292
beqz $shr,.Laligned_inp
293
294
ld $tmp2,16($inp)
295
# ifdef MIPSEB
296
dsllv $in0,$in0,$shr
297
dsrlv $tmp3,$in1,$shl
298
dsllv $in1,$in1,$shr
299
dsrlv $tmp2,$tmp2,$shl
300
# else
301
dsrlv $in0,$in0,$shr
302
dsllv $tmp3,$in1,$shl
303
dsrlv $in1,$in1,$shr
304
dsllv $tmp2,$tmp2,$shl
305
# endif
306
or $in0,$in0,$tmp3
307
or $in1,$in1,$tmp2
308
.Laligned_inp:
309
#else
310
ldl $in0,0+MSB($inp) # load input
311
ldl $in1,8+MSB($inp)
312
ldr $in0,0+LSB($inp)
313
ldr $in1,8+LSB($inp)
314
#endif
315
daddiu $inp,16
316
#ifdef MIPSEB
317
# if defined(_MIPS_ARCH_MIPS64R2)
318
dsbh $in0,$in0 # byte swap
319
dsbh $in1,$in1
320
dshd $in0,$in0
321
dshd $in1,$in1
322
# else
323
ori $tmp0,$zero,0xFF
324
dsll $tmp2,$tmp0,32
325
or $tmp0,$tmp2 # 0x000000FF000000FF
326
327
and $tmp1,$in0,$tmp0 # byte swap
328
and $tmp3,$in1,$tmp0
329
dsrl $tmp2,$in0,24
330
dsrl $tmp4,$in1,24
331
dsll $tmp1,24
332
dsll $tmp3,24
333
and $tmp2,$tmp0
334
and $tmp4,$tmp0
335
dsll $tmp0,8 # 0x0000FF000000FF00
336
or $tmp1,$tmp2
337
or $tmp3,$tmp4
338
and $tmp2,$in0,$tmp0
339
and $tmp4,$in1,$tmp0
340
dsrl $in0,8
341
dsrl $in1,8
342
dsll $tmp2,8
343
dsll $tmp4,8
344
and $in0,$tmp0
345
and $in1,$tmp0
346
or $tmp1,$tmp2
347
or $tmp3,$tmp4
348
or $in0,$tmp1
349
or $in1,$tmp3
350
dsrl $tmp1,$in0,32
351
dsrl $tmp3,$in1,32
352
dsll $in0,32
353
dsll $in1,32
354
or $in0,$tmp1
355
or $in1,$tmp3
356
# endif
357
#endif
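# Modulo-scheduled reduction (added note): h2 still carries the excess
# bits left over from the previous iteration. Rather than folding them
# back at the bottom of the loop - which would put a long carry chain
# on the critical path - the excess (h2 >> 2) is multiplied by 5
# (computed as x + 4*x below) and absorbed while the next input block
# is being accumulated.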
358
dsrl $tmp1,$h2,2 # modulo-scheduled reduction
359
andi $h2,$h2,3
360
dsll $tmp0,$tmp1,2
361
362
daddu $d0,$h0,$in0 # accumulate input
363
daddu $tmp1,$tmp0
364
sltu $tmp0,$d0,$h0
365
daddu $d0,$d0,$tmp1 # ... and residue
366
sltu $tmp1,$d0,$tmp1
367
daddu $d1,$h1,$in1
368
daddu $tmp0,$tmp1
369
sltu $tmp1,$d1,$h1
370
daddu $d1,$tmp0
371
372
dmultu ($r0,$d0) # h0*r0
373
daddu $d2,$h2,$padbit
374
sltu $tmp0,$d1,$tmp0
375
mflo ($h0,$r0,$d0)
376
mfhi ($h1,$r0,$d0)
377
378
dmultu ($rs1,$d1) # h1*5*r1
379
daddu $d2,$tmp1
380
daddu $d2,$tmp0
381
mflo ($tmp0,$rs1,$d1)
382
mfhi ($tmp1,$rs1,$d1)
383
384
dmultu ($r1,$d0) # h0*r1
385
mflo ($tmp2,$r1,$d0)
386
mfhi ($h2,$r1,$d0)
387
daddu $h0,$tmp0
388
daddu $h1,$tmp1
389
sltu $tmp0,$h0,$tmp0
390
391
dmultu ($r0,$d1) # h1*r0
392
daddu $h1,$tmp0
393
daddu $h1,$tmp2
394
mflo ($tmp0,$r0,$d1)
395
mfhi ($tmp1,$r0,$d1)
396
397
dmultu ($rs1,$d2) # h2*5*r1
398
sltu $tmp2,$h1,$tmp2
399
daddu $h2,$tmp2
400
mflo ($tmp2,$rs1,$d2)
401
402
dmultu ($r0,$d2) # h2*r0
403
daddu $h1,$tmp0
404
daddu $h2,$tmp1
405
mflo ($tmp3,$r0,$d2)
406
sltu $tmp0,$h1,$tmp0
407
daddu $h2,$tmp0
408
409
daddu $h1,$tmp2
410
sltu $tmp2,$h1,$tmp2
411
daddu $h2,$tmp2
412
daddu $h2,$tmp3
413
414
bne $inp,$len,.Loop
415
416
sd $h0,0($ctx) # store hash value
417
sd $h1,8($ctx)
418
sd $h2,16($ctx)
419
420
.set noreorder
421
#if defined(_MIPS_ARCH_MIPS64R6)
422
ld $s7,56($sp)
423
ld $s6,48($sp)
424
#endif
425
ld $s5,40($sp) # epilogue
426
ld $s4,32($sp)
427
___
428
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
429
ld $s3,24($sp)
430
ld $s2,16($sp)
431
ld $s1,8($sp)
432
ld $s0,0($sp)
433
___
434
$code.=<<___;
435
jr $ra
436
#if defined(_MIPS_ARCH_MIPS64R6)
437
daddu $sp,8*8
438
#else
439
daddu $sp,6*8
440
#endif
441
.end poly1305_blocks_internal
442
___
443
}
444
{
445
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
446
447
$code.=<<___;
448
.align 5
449
.globl poly1305_emit
450
.ent poly1305_emit
451
poly1305_emit:
452
.frame $sp,0,$ra
453
.set reorder
454
455
ld $tmp2,16($ctx)
456
ld $tmp0,0($ctx)
457
ld $tmp1,8($ctx)
458
459
li $in0,-4 # final reduction
460
dsrl $in1,$tmp2,2
461
and $in0,$tmp2
462
andi $tmp2,$tmp2,3
463
daddu $in0,$in1
464
465
daddu $tmp0,$tmp0,$in0
466
sltu $in1,$tmp0,$in0
467
daddiu $in0,$tmp0,5 # compare to modulus
468
daddu $tmp1,$tmp1,$in1
469
sltiu $tmp3,$in0,5
470
sltu $tmp4,$tmp1,$in1
471
daddu $in1,$tmp1,$tmp3
472
daddu $tmp2,$tmp2,$tmp4
473
sltu $tmp3,$in1,$tmp3
474
daddu $tmp2,$tmp2,$tmp3
475
476
dsrl $tmp2,2 # see if it carried/borrowed
477
dsubu $tmp2,$zero,$tmp2
478
479
xor $in0,$tmp0
480
xor $in1,$tmp1
481
and $in0,$tmp2
482
and $in1,$tmp2
483
xor $in0,$tmp0
484
xor $in1,$tmp1
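# (Added note) tmp0:tmp1 hold the folded h and in0:in1 hold h+5. The
# dsrl/dsubu pair above turns the overflow of h+5 past bit 129 into an
# all-ones or all-zero mask, and the xor/and/xor sequence uses it as a
# branch-free select: the result is h+5 (that is, h minus the modulus,
# taken mod 2^128) when h was not already fully reduced, and plain h
# otherwise - a constant-time conditional move.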
485
486
lwu $tmp0,0($nonce) # load nonce
487
lwu $tmp1,4($nonce)
488
lwu $tmp2,8($nonce)
489
lwu $tmp3,12($nonce)
490
dsll $tmp1,32
491
dsll $tmp3,32
492
or $tmp0,$tmp1
493
or $tmp2,$tmp3
494
495
daddu $in0,$tmp0 # accumulate nonce
496
daddu $in1,$tmp2
497
sltu $tmp0,$in0,$tmp0
498
daddu $in1,$tmp0
499
500
dsrl $tmp0,$in0,8 # write mac value
501
dsrl $tmp1,$in0,16
502
dsrl $tmp2,$in0,24
503
sb $in0,0($mac)
504
dsrl $tmp3,$in0,32
505
sb $tmp0,1($mac)
506
dsrl $tmp0,$in0,40
507
sb $tmp1,2($mac)
508
dsrl $tmp1,$in0,48
509
sb $tmp2,3($mac)
510
dsrl $tmp2,$in0,56
511
sb $tmp3,4($mac)
512
dsrl $tmp3,$in1,8
513
sb $tmp0,5($mac)
514
dsrl $tmp0,$in1,16
515
sb $tmp1,6($mac)
516
dsrl $tmp1,$in1,24
517
sb $tmp2,7($mac)
518
519
sb $in1,8($mac)
520
dsrl $tmp2,$in1,32
521
sb $tmp3,9($mac)
522
dsrl $tmp3,$in1,40
523
sb $tmp0,10($mac)
524
dsrl $tmp0,$in1,48
525
sb $tmp1,11($mac)
526
dsrl $tmp1,$in1,56
527
sb $tmp2,12($mac)
528
sb $tmp3,13($mac)
529
sb $tmp0,14($mac)
530
sb $tmp1,15($mac)
531
532
jr $ra
533
.end poly1305_emit
534
.rdata
535
.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
536
.align 2
537
___
538
}
539
}}} else {{{
540
######################################################################
541
# 32-bit code path
542
#
543
544
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
545
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
546
($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
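# Note on the 32-bit representation: the clamped key is kept as four
# 32-bit words r0-r3 (with si = ri + (ri >> 2) precomputed for i=1..3,
# as in the 64-bit path), and the running hash as five words h0-h4,
# with h4 holding the bits above 2^128. Two multiplication strategies
# are emitted below: an R2 path that chains multu/maddu through the
# HI:LO accumulator, and a second path built from the multu/mflo/mfhi
# macros (plain multiplies pre-R2, mulu/muhu on R6) with explicit
# carry propagation.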
547
548
$code.=<<___;
549
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
550
defined(_MIPS_ARCH_MIPS32R6)) \\
551
&& !defined(_MIPS_ARCH_MIPS32R2)
552
# define _MIPS_ARCH_MIPS32R2
553
#endif
554
555
#if defined(_MIPS_ARCH_MIPS32R6)
556
# define multu(rs,rt)
557
# define mflo(rd,rs,rt) mulu rd,rs,rt
558
# define mfhi(rd,rs,rt) muhu rd,rs,rt
559
#else
560
# define multu(rs,rt) multu rs,rt
561
# define mflo(rd,rs,rt) mflo rd
562
# define mfhi(rd,rs,rt) mfhi rd
563
#endif
564
565
#ifdef __KERNEL__
566
# define poly1305_init poly1305_block_init
567
#endif
568
569
#if defined(__MIPSEB__) && !defined(MIPSEB)
570
# define MIPSEB
571
#endif
572
573
#ifdef MIPSEB
574
# define MSB 0
575
# define LSB 3
576
#else
577
# define MSB 3
578
# define LSB 0
579
#endif
580
581
.text
582
.set noat
583
.set noreorder
584
585
.align 5
586
.globl poly1305_init
587
.ent poly1305_init
588
poly1305_init:
589
.frame $sp,0,$ra
590
.set reorder
591
592
sw $zero,0($ctx)
593
sw $zero,4($ctx)
594
sw $zero,8($ctx)
595
sw $zero,12($ctx)
596
sw $zero,16($ctx)
597
598
beqz $inp,.Lno_key
599
600
#if defined(_MIPS_ARCH_MIPS32R6)
601
andi $tmp0,$inp,3 # $inp % 4
602
subu $inp,$inp,$tmp0 # align $inp
603
sll $tmp0,$tmp0,3 # byte to bit offset
604
lw $in0,0($inp)
605
lw $in1,4($inp)
606
lw $in2,8($inp)
607
lw $in3,12($inp)
608
beqz $tmp0,.Laligned_key
609
610
lw $tmp2,16($inp)
611
subu $tmp1,$zero,$tmp0
612
# ifdef MIPSEB
613
sllv $in0,$in0,$tmp0
614
srlv $tmp3,$in1,$tmp1
615
sllv $in1,$in1,$tmp0
616
or $in0,$in0,$tmp3
617
srlv $tmp3,$in2,$tmp1
618
sllv $in2,$in2,$tmp0
619
or $in1,$in1,$tmp3
620
srlv $tmp3,$in3,$tmp1
621
sllv $in3,$in3,$tmp0
622
or $in2,$in2,$tmp3
623
srlv $tmp2,$tmp2,$tmp1
624
or $in3,$in3,$tmp2
625
# else
626
srlv $in0,$in0,$tmp0
627
sllv $tmp3,$in1,$tmp1
628
srlv $in1,$in1,$tmp0
629
or $in0,$in0,$tmp3
630
sllv $tmp3,$in2,$tmp1
631
srlv $in2,$in2,$tmp0
632
or $in1,$in1,$tmp3
633
sllv $tmp3,$in3,$tmp1
634
srlv $in3,$in3,$tmp0
635
or $in2,$in2,$tmp3
636
sllv $tmp2,$tmp2,$tmp1
637
or $in3,$in3,$tmp2
638
# endif
639
.Laligned_key:
640
#else
641
lwl $in0,0+MSB($inp)
642
lwl $in1,4+MSB($inp)
643
lwl $in2,8+MSB($inp)
644
lwl $in3,12+MSB($inp)
645
lwr $in0,0+LSB($inp)
646
lwr $in1,4+LSB($inp)
647
lwr $in2,8+LSB($inp)
648
lwr $in3,12+LSB($inp)
649
#endif
650
#ifdef MIPSEB
651
# if defined(_MIPS_ARCH_MIPS32R2)
652
wsbh $in0,$in0 # byte swap
653
wsbh $in1,$in1
654
wsbh $in2,$in2
655
wsbh $in3,$in3
656
rotr $in0,$in0,16
657
rotr $in1,$in1,16
658
rotr $in2,$in2,16
659
rotr $in3,$in3,16
660
# else
661
srl $tmp0,$in0,24 # byte swap
662
srl $tmp1,$in0,8
663
andi $tmp2,$in0,0xFF00
664
sll $in0,$in0,24
665
andi $tmp1,0xFF00
666
sll $tmp2,$tmp2,8
667
or $in0,$tmp0
668
srl $tmp0,$in1,24
669
or $tmp1,$tmp2
670
srl $tmp2,$in1,8
671
or $in0,$tmp1
672
andi $tmp1,$in1,0xFF00
673
sll $in1,$in1,24
674
andi $tmp2,0xFF00
675
sll $tmp1,$tmp1,8
676
or $in1,$tmp0
677
srl $tmp0,$in2,24
678
or $tmp2,$tmp1
679
srl $tmp1,$in2,8
680
or $in1,$tmp2
681
andi $tmp2,$in2,0xFF00
682
sll $in2,$in2,24
683
andi $tmp1,0xFF00
684
sll $tmp2,$tmp2,8
685
or $in2,$tmp0
686
srl $tmp0,$in3,24
687
or $tmp1,$tmp2
688
srl $tmp2,$in3,8
689
or $in2,$tmp1
690
andi $tmp1,$in3,0xFF00
691
sll $in3,$in3,24
692
andi $tmp2,0xFF00
693
sll $tmp1,$tmp1,8
694
or $in3,$tmp0
695
or $tmp2,$tmp1
696
or $in3,$tmp2
697
# endif
698
#endif
699
lui $tmp0,0x0fff
700
ori $tmp0,0xffff # 0x0fffffff
701
and $in0,$in0,$tmp0
702
subu $tmp0,3 # 0x0ffffffc
703
and $in1,$in1,$tmp0
704
and $in2,$in2,$tmp0
705
and $in3,$in3,$tmp0
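# (Added note) Same clamp as the 64-bit path, word by word: r0 is
# masked with 0x0fffffff and r1-r3 with 0x0ffffffc, i.e. the standard
# Poly1305 requirement that the top four bits of every key word and the
# low two bits of the upper three words be zero.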
706
707
sw $in0,20($ctx)
708
sw $in1,24($ctx)
709
sw $in2,28($ctx)
710
sw $in3,32($ctx)
711
712
srl $tmp1,$in1,2
713
srl $tmp2,$in2,2
714
srl $tmp3,$in3,2
715
addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
716
addu $in2,$in2,$tmp2
717
addu $in3,$in3,$tmp3
718
sw $in1,36($ctx)
719
sw $in2,40($ctx)
720
sw $in3,44($ctx)
721
.Lno_key:
722
li $v0,0
723
jr $ra
724
.end poly1305_init
725
___
726
{
727
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
728
729
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
730
($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
731
my ($d0,$d1,$d2,$d3) =
732
($a4,$a5,$a6,$a7);
733
my $shr = $t2; # used on R6
734
my $one = $t2; # used on R2
735
736
$code.=<<___;
737
.globl poly1305_blocks
738
.align 5
739
.ent poly1305_blocks
740
poly1305_blocks:
741
.frame $sp,16*4,$ra
742
.mask $SAVED_REGS_MASK,-4
743
.set noreorder
744
subu $sp, $sp,4*12
745
sw $s11,4*11($sp)
746
sw $s10,4*10($sp)
747
sw $s9, 4*9($sp)
748
sw $s8, 4*8($sp)
749
sw $s7, 4*7($sp)
750
sw $s6, 4*6($sp)
751
sw $s5, 4*5($sp)
752
sw $s4, 4*4($sp)
753
___
754
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
755
sw $s3, 4*3($sp)
756
sw $s2, 4*2($sp)
757
sw $s1, 4*1($sp)
758
sw $s0, 4*0($sp)
759
___
760
$code.=<<___;
761
.set reorder
762
763
srl $len,4 # number of complete blocks
764
li $one,1
765
beqz $len,.Labort
766
767
#if defined(_MIPS_ARCH_MIPS32R6)
768
andi $shr,$inp,3
769
subu $inp,$inp,$shr # align $inp
770
sll $shr,$shr,3 # byte to bit offset
771
#endif
772
773
lw $h0,0($ctx) # load hash value
774
lw $h1,4($ctx)
775
lw $h2,8($ctx)
776
lw $h3,12($ctx)
777
lw $h4,16($ctx)
778
779
lw $r0,20($ctx) # load key
780
lw $r1,24($ctx)
781
lw $r2,28($ctx)
782
lw $r3,32($ctx)
783
lw $rs1,36($ctx)
784
lw $rs2,40($ctx)
785
lw $rs3,44($ctx)
786
787
sll $len,4
788
addu $len,$len,$inp # end of buffer
789
b .Loop
790
791
.align 4
792
.Loop:
793
#if defined(_MIPS_ARCH_MIPS32R6)
794
lw $d0,0($inp) # load input
795
lw $d1,4($inp)
796
lw $d2,8($inp)
797
lw $d3,12($inp)
798
beqz $shr,.Laligned_inp
799
800
lw $t0,16($inp)
801
subu $t1,$zero,$shr
802
# ifdef MIPSEB
803
sllv $d0,$d0,$shr
804
srlv $at,$d1,$t1
805
sllv $d1,$d1,$shr
806
or $d0,$d0,$at
807
srlv $at,$d2,$t1
808
sllv $d2,$d2,$shr
809
or $d1,$d1,$at
810
srlv $at,$d3,$t1
811
sllv $d3,$d3,$shr
812
or $d2,$d2,$at
813
srlv $t0,$t0,$t1
814
or $d3,$d3,$t0
815
# else
816
srlv $d0,$d0,$shr
817
sllv $at,$d1,$t1
818
srlv $d1,$d1,$shr
819
or $d0,$d0,$at
820
sllv $at,$d2,$t1
821
srlv $d2,$d2,$shr
822
or $d1,$d1,$at
823
sllv $at,$d3,$t1
824
srlv $d3,$d3,$shr
825
or $d2,$d2,$at
826
sllv $t0,$t0,$t1
827
or $d3,$d3,$t0
828
# endif
829
.Laligned_inp:
830
#else
831
lwl $d0,0+MSB($inp) # load input
832
lwl $d1,4+MSB($inp)
833
lwl $d2,8+MSB($inp)
834
lwl $d3,12+MSB($inp)
835
lwr $d0,0+LSB($inp)
836
lwr $d1,4+LSB($inp)
837
lwr $d2,8+LSB($inp)
838
lwr $d3,12+LSB($inp)
839
#endif
840
#ifdef MIPSEB
841
# if defined(_MIPS_ARCH_MIPS32R2)
842
wsbh $d0,$d0 # byte swap
843
wsbh $d1,$d1
844
wsbh $d2,$d2
845
wsbh $d3,$d3
846
rotr $d0,$d0,16
847
rotr $d1,$d1,16
848
rotr $d2,$d2,16
849
rotr $d3,$d3,16
850
# else
851
srl $at,$d0,24 # byte swap
852
srl $t0,$d0,8
853
andi $t1,$d0,0xFF00
854
sll $d0,$d0,24
855
andi $t0,0xFF00
856
sll $t1,$t1,8
857
or $d0,$at
858
srl $at,$d1,24
859
or $t0,$t1
860
srl $t1,$d1,8
861
or $d0,$t0
862
andi $t0,$d1,0xFF00
863
sll $d1,$d1,24
864
andi $t1,0xFF00
865
sll $t0,$t0,8
866
or $d1,$at
867
srl $at,$d2,24
868
or $t1,$t0
869
srl $t0,$d2,8
870
or $d1,$t1
871
andi $t1,$d2,0xFF00
872
sll $d2,$d2,24
873
andi $t0,0xFF00
874
sll $t1,$t1,8
875
or $d2,$at
876
srl $at,$d3,24
877
or $t0,$t1
878
srl $t1,$d3,8
879
or $d2,$t0
880
andi $t0,$d3,0xFF00
881
sll $d3,$d3,24
882
andi $t1,0xFF00
883
sll $t0,$t0,8
884
or $d3,$at
885
or $t1,$t0
886
or $d3,$t1
887
# endif
888
#endif
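# (Added note) Same modulo-scheduled trick as the 64-bit loop: the
# excess bits of h4 from the previous iteration are multiplied by 5 and
# folded in while the new block is accumulated, instead of at the loop
# tail.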
889
srl $t0,$h4,2 # modulo-scheduled reduction
890
andi $h4,$h4,3
891
sll $at,$t0,2
892
893
addu $d0,$d0,$h0 # accumulate input
894
addu $t0,$t0,$at
895
sltu $h0,$d0,$h0
896
addu $d0,$d0,$t0 # ... and residue
897
sltu $at,$d0,$t0
898
899
addu $d1,$d1,$h1
900
addu $h0,$h0,$at # carry
901
sltu $h1,$d1,$h1
902
addu $d1,$d1,$h0
903
sltu $h0,$d1,$h0
904
905
addu $d2,$d2,$h2
906
addu $h1,$h1,$h0 # carry
907
sltu $h2,$d2,$h2
908
addu $d2,$d2,$h1
909
sltu $h1,$d2,$h1
910
911
addu $d3,$d3,$h3
912
addu $h2,$h2,$h1 # carry
913
sltu $h3,$d3,$h3
914
addu $d3,$d3,$h2
915
916
#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
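# (Added note) On MIPS32R2 each 32-bit result limb is evaluated as one
# column of partial products accumulated directly in the HI:LO pair:
# multu starts the column and maddu adds the remaining products, e.g.
# limb 0 is d0*r0 + d1*s3 + d2*s2 + d3*s1. The high word of a finished
# column is read back with mfhi and folded into the next column by
# multiply-accumulating it with the constant 1 held in the "one"
# register (the "hi*1" steps), so no explicit carry instructions are
# needed.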
917
multu $r0,$d0 # d0*r0
918
sltu $h2,$d3,$h2
919
maddu $rs3,$d1 # d1*s3
920
addu $h3,$h3,$h2 # carry
921
maddu $rs2,$d2 # d2*s2
922
addu $h4,$h4,$padbit
923
maddu $rs1,$d3 # d3*s1
924
addu $h4,$h4,$h3
925
mfhi $at
926
mflo $h0
927
928
multu $r1,$d0 # d0*r1
929
maddu $r0,$d1 # d1*r0
930
maddu $rs3,$d2 # d2*s3
931
maddu $rs2,$d3 # d3*s2
932
maddu $rs1,$h4 # h4*s1
933
maddu $at,$one # hi*1
934
mfhi $at
935
mflo $h1
936
937
multu $r2,$d0 # d0*r2
938
maddu $r1,$d1 # d1*r1
939
maddu $r0,$d2 # d2*r0
940
maddu $rs3,$d3 # d3*s3
941
maddu $rs2,$h4 # h4*s2
942
maddu $at,$one # hi*1
943
mfhi $at
944
mflo $h2
945
946
mul $t0,$r0,$h4 # h4*r0
947
948
multu $r3,$d0 # d0*r3
949
maddu $r2,$d1 # d1*r2
950
maddu $r1,$d2 # d2*r1
951
maddu $r0,$d3 # d3*r0
952
maddu $rs3,$h4 # h4*s3
953
maddu $at,$one # hi*1
954
mfhi $at
955
mflo $h3
956
957
addiu $inp,$inp,16
958
959
addu $h4,$t0,$at
960
#else
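# (Added note) Generic path: the same column sums as above, but with
# every 32x32 product brought out through the mflo/mfhi macros and the
# carries propagated explicitly with sltu/addu. The padbit argument has
# already been consumed, so its register (a3) is pressed into service
# as an extra temporary and reloaded with 1 before looping.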
961
multu ($r0,$d0) # d0*r0
962
mflo ($h0,$r0,$d0)
963
mfhi ($h1,$r0,$d0)
964
965
sltu $h2,$d3,$h2
966
addu $h3,$h3,$h2 # carry
967
968
multu ($rs3,$d1) # d1*s3
969
mflo ($at,$rs3,$d1)
970
mfhi ($t0,$rs3,$d1)
971
972
addu $h4,$h4,$padbit
973
addiu $inp,$inp,16
974
addu $h4,$h4,$h3
975
976
multu ($rs2,$d2) # d2*s2
977
mflo ($a3,$rs2,$d2)
978
mfhi ($t1,$rs2,$d2)
979
addu $h0,$h0,$at
980
addu $h1,$h1,$t0
981
multu ($rs1,$d3) # d3*s1
982
sltu $at,$h0,$at
983
addu $h1,$h1,$at
984
985
mflo ($at,$rs1,$d3)
986
mfhi ($t0,$rs1,$d3)
987
addu $h0,$h0,$a3
988
addu $h1,$h1,$t1
989
multu ($r1,$d0) # d0*r1
990
sltu $a3,$h0,$a3
991
addu $h1,$h1,$a3
992
993
994
mflo ($a3,$r1,$d0)
995
mfhi ($h2,$r1,$d0)
996
addu $h0,$h0,$at
997
addu $h1,$h1,$t0
998
multu ($r0,$d1) # d1*r0
999
sltu $at,$h0,$at
1000
addu $h1,$h1,$at
1001
1002
mflo ($at,$r0,$d1)
1003
mfhi ($t0,$r0,$d1)
1004
addu $h1,$h1,$a3
1005
sltu $a3,$h1,$a3
1006
multu ($rs3,$d2) # d2*s3
1007
addu $h2,$h2,$a3
1008
1009
mflo ($a3,$rs3,$d2)
1010
mfhi ($t1,$rs3,$d2)
1011
addu $h1,$h1,$at
1012
addu $h2,$h2,$t0
1013
multu ($rs2,$d3) # d3*s2
1014
sltu $at,$h1,$at
1015
addu $h2,$h2,$at
1016
1017
mflo ($at,$rs2,$d3)
1018
mfhi ($t0,$rs2,$d3)
1019
addu $h1,$h1,$a3
1020
addu $h2,$h2,$t1
1021
multu ($rs1,$h4) # h4*s1
1022
sltu $a3,$h1,$a3
1023
addu $h2,$h2,$a3
1024
1025
mflo ($a3,$rs1,$h4)
1026
addu $h1,$h1,$at
1027
addu $h2,$h2,$t0
1028
multu ($r2,$d0) # d0*r2
1029
sltu $at,$h1,$at
1030
addu $h2,$h2,$at
1031
1032
1033
mflo ($at,$r2,$d0)
1034
mfhi ($h3,$r2,$d0)
1035
addu $h1,$h1,$a3
1036
sltu $a3,$h1,$a3
1037
multu ($r1,$d1) # d1*r1
1038
addu $h2,$h2,$a3
1039
1040
mflo ($a3,$r1,$d1)
1041
mfhi ($t1,$r1,$d1)
1042
addu $h2,$h2,$at
1043
sltu $at,$h2,$at
1044
multu ($r0,$d2) # d2*r0
1045
addu $h3,$h3,$at
1046
1047
mflo ($at,$r0,$d2)
1048
mfhi ($t0,$r0,$d2)
1049
addu $h2,$h2,$a3
1050
addu $h3,$h3,$t1
1051
multu ($rs3,$d3) # d3*s3
1052
sltu $a3,$h2,$a3
1053
addu $h3,$h3,$a3
1054
1055
mflo ($a3,$rs3,$d3)
1056
mfhi ($t1,$rs3,$d3)
1057
addu $h2,$h2,$at
1058
addu $h3,$h3,$t0
1059
multu ($rs2,$h4) # h4*s2
1060
sltu $at,$h2,$at
1061
addu $h3,$h3,$at
1062
1063
mflo ($at,$rs2,$h4)
1064
addu $h2,$h2,$a3
1065
addu $h3,$h3,$t1
1066
multu ($r3,$d0) # d0*r3
1067
sltu $a3,$h2,$a3
1068
addu $h3,$h3,$a3
1069
1070
1071
mflo ($a3,$r3,$d0)
1072
mfhi ($t1,$r3,$d0)
1073
addu $h2,$h2,$at
1074
sltu $at,$h2,$at
1075
multu ($r2,$d1) # d1*r2
1076
addu $h3,$h3,$at
1077
1078
mflo ($at,$r2,$d1)
1079
mfhi ($t0,$r2,$d1)
1080
addu $h3,$h3,$a3
1081
sltu $a3,$h3,$a3
1082
multu ($r0,$d3) # d3*r0
1083
addu $t1,$t1,$a3
1084
1085
mflo ($a3,$r0,$d3)
1086
mfhi ($d3,$r0,$d3)
1087
addu $h3,$h3,$at
1088
addu $t1,$t1,$t0
1089
multu ($r1,$d2) # d2*r1
1090
sltu $at,$h3,$at
1091
addu $t1,$t1,$at
1092
1093
mflo ($at,$r1,$d2)
1094
mfhi ($t0,$r1,$d2)
1095
addu $h3,$h3,$a3
1096
addu $t1,$t1,$d3
1097
multu ($rs3,$h4) # h4*s3
1098
sltu $a3,$h3,$a3
1099
addu $t1,$t1,$a3
1100
1101
mflo ($a3,$rs3,$h4)
1102
addu $h3,$h3,$at
1103
addu $t1,$t1,$t0
1104
multu ($r0,$h4) # h4*r0
1105
sltu $at,$h3,$at
1106
addu $t1,$t1,$at
1107
1108
1109
mflo ($h4,$r0,$h4)
1110
addu $h3,$h3,$a3
1111
sltu $a3,$h3,$a3
1112
addu $t1,$t1,$a3
1113
addu $h4,$h4,$t1
1114
1115
li $padbit,1 # if we loop, padbit is 1
1116
#endif
1117
bne $inp,$len,.Loop
1118
1119
sw $h0,0($ctx) # store hash value
1120
sw $h1,4($ctx)
1121
sw $h2,8($ctx)
1122
sw $h3,12($ctx)
1123
sw $h4,16($ctx)
1124
1125
.set noreorder
1126
.Labort:
1127
lw $s11,4*11($sp)
1128
lw $s10,4*10($sp)
1129
lw $s9, 4*9($sp)
1130
lw $s8, 4*8($sp)
1131
lw $s7, 4*7($sp)
1132
lw $s6, 4*6($sp)
1133
lw $s5, 4*5($sp)
1134
lw $s4, 4*4($sp)
1135
___
1136
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
1137
lw $s3, 4*3($sp)
1138
lw $s2, 4*2($sp)
1139
lw $s1, 4*1($sp)
1140
lw $s0, 4*0($sp)
1141
___
1142
$code.=<<___;
1143
jr $ra
1144
addu $sp,$sp,4*12
1145
.end poly1305_blocks
1146
___
1147
}
1148
{
1149
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1150
1151
$code.=<<___;
1152
.align 5
1153
.globl poly1305_emit
1154
.ent poly1305_emit
1155
poly1305_emit:
1156
.frame $sp,0,$ra
1157
.set reorder
1158
1159
lw $tmp4,16($ctx)
1160
lw $tmp0,0($ctx)
1161
lw $tmp1,4($ctx)
1162
lw $tmp2,8($ctx)
1163
lw $tmp3,12($ctx)
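# (Added note) All five hash words have now been loaded, so the
# register that held the context pointer is free and is reused below as
# a scratch/carry/mask register. The sequence mirrors the 64-bit
# poly1305_emit: fold everything of h4 above its low two bits back in
# (times 5), add 5 to compare against the modulus, turn the carry into
# an all-ones or all-zero mask, select the reduced value without a
# branch, then add the 128-bit nonce.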
1164
1165
li $in0,-4 # final reduction
1166
srl $ctx,$tmp4,2
1167
and $in0,$in0,$tmp4
1168
andi $tmp4,$tmp4,3
1169
addu $ctx,$ctx,$in0
1170
1171
addu $tmp0,$tmp0,$ctx
1172
sltu $ctx,$tmp0,$ctx
1173
addiu $in0,$tmp0,5 # compare to modulus
1174
addu $tmp1,$tmp1,$ctx
1175
sltiu $in1,$in0,5
1176
sltu $ctx,$tmp1,$ctx
1177
addu $in1,$in1,$tmp1
1178
addu $tmp2,$tmp2,$ctx
1179
sltu $in2,$in1,$tmp1
1180
sltu $ctx,$tmp2,$ctx
1181
addu $in2,$in2,$tmp2
1182
addu $tmp3,$tmp3,$ctx
1183
sltu $in3,$in2,$tmp2
1184
sltu $ctx,$tmp3,$ctx
1185
addu $in3,$in3,$tmp3
1186
addu $tmp4,$tmp4,$ctx
1187
sltu $ctx,$in3,$tmp3
1188
addu $ctx,$tmp4
1189
1190
srl $ctx,2 # see if it carried/borrowed
1191
subu $ctx,$zero,$ctx
1192
1193
xor $in0,$tmp0
1194
xor $in1,$tmp1
1195
xor $in2,$tmp2
1196
xor $in3,$tmp3
1197
and $in0,$ctx
1198
and $in1,$ctx
1199
and $in2,$ctx
1200
and $in3,$ctx
1201
xor $in0,$tmp0
1202
xor $in1,$tmp1
1203
xor $in2,$tmp2
1204
xor $in3,$tmp3
1205
1206
lw $tmp0,0($nonce) # load nonce
1207
lw $tmp1,4($nonce)
1208
lw $tmp2,8($nonce)
1209
lw $tmp3,12($nonce)
1210
1211
addu $in0,$tmp0 # accumulate nonce
1212
sltu $ctx,$in0,$tmp0
1213
1214
addu $in1,$tmp1
1215
sltu $tmp1,$in1,$tmp1
1216
addu $in1,$ctx
1217
sltu $ctx,$in1,$ctx
1218
addu $ctx,$tmp1
1219
1220
addu $in2,$tmp2
1221
sltu $tmp2,$in2,$tmp2
1222
addu $in2,$ctx
1223
sltu $ctx,$in2,$ctx
1224
addu $ctx,$tmp2
1225
1226
addu $in3,$tmp3
1227
addu $in3,$ctx
1228
1229
srl $tmp0,$in0,8 # write mac value
1230
srl $tmp1,$in0,16
1231
srl $tmp2,$in0,24
1232
sb $in0, 0($mac)
1233
sb $tmp0,1($mac)
1234
srl $tmp0,$in1,8
1235
sb $tmp1,2($mac)
1236
srl $tmp1,$in1,16
1237
sb $tmp2,3($mac)
1238
srl $tmp2,$in1,24
1239
sb $in1, 4($mac)
1240
sb $tmp0,5($mac)
1241
srl $tmp0,$in2,8
1242
sb $tmp1,6($mac)
1243
srl $tmp1,$in2,16
1244
sb $tmp2,7($mac)
1245
srl $tmp2,$in2,24
1246
sb $in2, 8($mac)
1247
sb $tmp0,9($mac)
1248
srl $tmp0,$in3,8
1249
sb $tmp1,10($mac)
1250
srl $tmp1,$in3,16
1251
sb $tmp2,11($mac)
1252
srl $tmp2,$in3,24
1253
sb $in3, 12($mac)
1254
sb $tmp0,13($mac)
1255
sb $tmp1,14($mac)
1256
sb $tmp2,15($mac)
1257
1258
jr $ra
1259
.end poly1305_emit
1260
.rdata
1261
.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1262
.align 2
1263
___
1264
}
1265
}}}
1266
1267
$output=pop and open STDOUT,">$output";
1268
print $code;
1269
close STDOUT;