GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/poly1305.h
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
 */

#include <asm/cpu_device_id.h>
#include <asm/fpu/api.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/sizes.h>

struct poly1305_arch_internal {
	union {
		struct {
			u32 h[5];
			u32 is_base2_26;
		};
		u64 hs[3];
	};
	u64 r[2];
	u64 pad;
	struct { u32 r2, r1, r4, r3; } rn[9];
};
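
/*
 * Editorial note, inferred from convert_to_base2_64() below rather than
 * stated in the original file: the union overlays two encodings of the
 * 130-bit accumulator. In base 2^26,
 * h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104; in base 2^64,
 * h = hs[0] + hs[1]*2^64 + hs[2]*2^128. The is_base2_26 flag records which
 * encoding currently holds the live value.
 */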

/*
 * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
 * the unfortunate situation of using AVX and then having to go back to scalar
 * -- because the user is silly and has called the update function from two
 * separate contexts -- then we need to convert back to the original base before
 * proceeding. It is possible to reason that the initial reduction below is
 * sufficient given the implementation invariants. However, for an avoidance of
 * doubt and because this is not performance critical, we do the full reduction
 * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
 */
static void convert_to_base2_64(void *ctx)
{
	struct poly1305_arch_internal *state = ctx;
	u32 cy;

	if (!state->is_base2_26)
		return;

	cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
	cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
	cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
	cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
	state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
	state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
	state->hs[2] = state->h[4] >> 24;
	/* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */
#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
	cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
	state->hs[2] &= 3;
	state->hs[0] += cy;
	state->hs[1] += (cy = ULT(state->hs[0], cy));
	state->hs[2] += ULT(state->hs[1], cy);
#undef ULT
	state->is_base2_26 = 0;
}
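
/*
 * Illustrative note on the ULT() macro above (an editorial sketch, not part
 * of the original file): for u64 operands it matches the obvious reference,
 *
 *	static inline u64 ult_reference(u64 a, u64 b)
 *	{
 *		return a < b;	// 1 if a < b, else 0
 *	}
 *
 * because the high bit of a ^ ((a ^ b) | ((a - b) ^ b)) equals the borrow
 * out of the subtraction a - b, which is set exactly when a < b; shifting it
 * down yields 0 or 1 with no data-dependent branch. Above it ripples the
 * carry from cy through hs[0] and hs[1] in constant time, where cy holds the
 * bits at 2^130 and up folded back in multiplied by 5, using the identity
 * 2^130 == 5 (mod 2^130 - 5).
 */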

asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state,
				     const u8 raw_key[POLY1305_BLOCK_SIZE]);
asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
				       const u8 *inp,
				       const size_t len, const u32 padbit);
asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
				     u8 mac[POLY1305_DIGEST_SIZE],
				     const u32 nonce[4]);
asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
				  u8 mac[POLY1305_DIGEST_SIZE],
				  const u32 nonce[4]);
asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
				    const u8 *inp, const size_t len,
				    const u32 padbit);
asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
				     const u8 *inp, const size_t len,
				     const u32 padbit);
asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
				       const u8 *inp,
				       const size_t len, const u32 padbit);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
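
/*
 * Background note (standard jump-label behavior, an editorial addition):
 * each static key compiles to a no-op branch that poly1305_mod_init_arch()
 * below patches once at boot after probing CPU features, so the dispatch in
 * poly1305_blocks() and poly1305_emit() costs no runtime conditional on the
 * hot path.
 */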

static void poly1305_block_init(struct poly1305_block_state *state,
				const u8 raw_key[POLY1305_BLOCK_SIZE])
{
	poly1305_init_x86_64(state, raw_key);
}

static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp,
			    unsigned int len, u32 padbit)
{
	struct poly1305_arch_internal *ctx =
		container_of(&state->h.h, struct poly1305_arch_internal, h);

	/* SIMD disables preemption, so relax after processing each page. */
	BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
		     SZ_4K % POLY1305_BLOCK_SIZE);

	/*
	 * The AVX implementations have significant setup overhead (e.g. key
	 * power computation, kernel FPU enabling) which makes them slower for
	 * short messages. Fall back to the scalar implementation for messages
	 * shorter than 288 bytes, unless the AVX-specific key setup has already
	 * been performed (indicated by ctx->is_base2_26).
	 */
	if (!static_branch_likely(&poly1305_use_avx) ||
	    (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) ||
	    unlikely(!irq_fpu_usable())) {
		convert_to_base2_64(ctx);
		poly1305_blocks_x86_64(ctx, inp, len, padbit);
		return;
	}

	do {
		const unsigned int bytes = min(len, SZ_4K);

		kernel_fpu_begin();
		if (static_branch_likely(&poly1305_use_avx512))
			poly1305_blocks_avx512(ctx, inp, bytes, padbit);
		else if (static_branch_likely(&poly1305_use_avx2))
			poly1305_blocks_avx2(ctx, inp, bytes, padbit);
		else
			poly1305_blocks_avx(ctx, inp, bytes, padbit);
		kernel_fpu_end();

		len -= bytes;
		inp += bytes;
	} while (len);
}

static void poly1305_emit(const struct poly1305_state *ctx,
			  u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
{
	if (!static_branch_likely(&poly1305_use_avx))
		poly1305_emit_x86_64(ctx, mac, nonce);
	else
		poly1305_emit_avx(ctx, mac, nonce);
}
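
/*
 * Illustrative sketch (hypothetical caller, an editorial addition; in the
 * kernel the generic lib/crypto Poly1305 code drives these hooks): a
 * one-shot MAC over a whole number of 16-byte blocks flows roughly as
 * follows, assuming state->h is the struct poly1305_state referenced by the
 * container_of() above:
 *
 *	struct poly1305_block_state st;
 *	u32 nonce[4];	// second half of the 32-byte one-time key
 *
 *	poly1305_block_init(&st, raw_key);	// load and clamp r
 *	poly1305_blocks(&st, msg, len, 1);	// padbit=1 for full blocks
 *	poly1305_emit(&st.h, mac, nonce);	// add nonce, write 16-byte tag
 *
 * A trailing partial block is padded with a 0x01 byte and then zeros and
 * processed with padbit=0, which is why the block functions take padbit
 * explicitly.
 */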

#define poly1305_mod_init_arch poly1305_mod_init_arch
static void poly1305_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_AVX) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
		static_branch_enable(&poly1305_use_avx);
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
		static_branch_enable(&poly1305_use_avx2);
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
	    boot_cpu_has(X86_FEATURE_AVX512F) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
			      XFEATURE_MASK_AVX512, NULL) &&
	    /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
	    boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
		static_branch_enable(&poly1305_use_avx512);
}