GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/core.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 * Jay Schulist <[email protected]>
 * Alexei Starovoitov <[email protected]>
 * Daniel Borkmann <[email protected]>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <uapi/linux/btf.h>
#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>
#include <crypto/sha2.h>

#include <asm/barrier.h>
#include <linux/unaligned.h>

/* Registers */
#define BPF_R0 regs[BPF_REG_0]
#define BPF_R1 regs[BPF_REG_1]
#define BPF_R2 regs[BPF_REG_2]
#define BPF_R3 regs[BPF_REG_3]
#define BPF_R4 regs[BPF_REG_4]
#define BPF_R5 regs[BPF_REG_5]
#define BPF_R6 regs[BPF_REG_6]
#define BPF_R7 regs[BPF_REG_7]
#define BPF_R8 regs[BPF_REG_8]
#define BPF_R9 regs[BPF_REG_9]
#define BPF_R10 regs[BPF_REG_10]

/* Named registers */
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
#define FP regs[BPF_REG_FP]
#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
#define OFF insn->off
#define IMM insn->imm

struct bpf_mem_alloc bpf_global_ma;
bool bpf_global_ma_set;

/* No hurry in this branch
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
	u8 *ptr = NULL;

	if (k >= SKF_NET_OFF) {
		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
	} else if (k >= SKF_LL_OFF) {
		if (unlikely(!skb_mac_header_was_set(skb)))
			return NULL;
		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
	}
	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
		return ptr;

	return NULL;
}

/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */
enum page_size_enum {
	__PAGE_SIZE = PAGE_SIZE
};

struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
	struct bpf_prog_aux *aux;
	struct bpf_prog *fp;

	size = round_up(size, __PAGE_SIZE);
	fp = __vmalloc(size, gfp_flags);
	if (fp == NULL)
		return NULL;

	aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
	if (aux == NULL) {
		vfree(fp);
		return NULL;
	}
	fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
	if (!fp->active) {
		vfree(fp);
		kfree(aux);
		return NULL;
	}

	fp->pages = size / PAGE_SIZE;
	fp->aux = aux;
	fp->aux->main_prog_aux = aux;
	fp->aux->prog = fp;
	fp->jit_requested = ebpf_jit_enabled();
	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
#ifdef CONFIG_CGROUP_BPF
	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif

	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
#ifdef CONFIG_FINEIBT
	INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
	mutex_init(&fp->aux->used_maps_mutex);
	mutex_init(&fp->aux->ext_mutex);
	mutex_init(&fp->aux->dst_mutex);

#ifdef CONFIG_BPF_SYSCALL
	bpf_prog_stream_init(fp);
#endif

	return fp;
}

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
	struct bpf_prog *prog;
	int cpu;

	prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
	if (!prog)
		return NULL;

	prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
	if (!prog->stats) {
		free_percpu(prog->active);
		kfree(prog->aux);
		vfree(prog);
		return NULL;
	}

	for_each_possible_cpu(cpu) {
		struct bpf_prog_stats *pstats;

		pstats = per_cpu_ptr(prog->stats, cpu);
		u64_stats_init(&pstats->syncp);
	}
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
174
175
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
176
{
177
if (!prog->aux->nr_linfo || !prog->jit_requested)
178
return 0;
179
180
prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
181
sizeof(*prog->aux->jited_linfo),
182
bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
183
if (!prog->aux->jited_linfo)
184
return -ENOMEM;
185
186
return 0;
187
}
188
189
void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
190
{
191
if (prog->aux->jited_linfo &&
192
(!prog->jited || !prog->aux->jited_linfo[0])) {
193
kvfree(prog->aux->jited_linfo);
194
prog->aux->jited_linfo = NULL;
195
}
196
197
kfree(prog->aux->kfunc_tab);
198
prog->aux->kfunc_tab = NULL;
199
}
200
201
/* The jit engine is responsible to provide an array
202
* for insn_off to the jited_off mapping (insn_to_jit_off).
203
*
204
* The idx to this array is the insn_off. Hence, the insn_off
205
* here is relative to the prog itself instead of the main prog.
206
* This array has one entry for each xlated bpf insn.
207
*
208
* jited_off is the byte off to the end of the jited insn.
209
*
210
* Hence, with
211
* insn_start:
212
* The first bpf insn off of the prog. The insn off
213
* here is relative to the main prog.
214
* e.g. if prog is a subprog, insn_start > 0
215
* linfo_idx:
216
* The prog's idx to prog->aux->linfo and jited_linfo
217
*
218
* jited_linfo[linfo_idx] = prog->bpf_func
219
*
220
* For i > linfo_idx,
221
*
222
* jited_linfo[i] = prog->bpf_func +
223
* insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
224
*/
225
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
226
const u32 *insn_to_jit_off)
227
{
228
u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
229
const struct bpf_line_info *linfo;
230
void **jited_linfo;
231
232
if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
233
/* Userspace did not provide linfo */
234
return;
235
236
linfo_idx = prog->aux->linfo_idx;
237
linfo = &prog->aux->linfo[linfo_idx];
238
insn_start = linfo[0].insn_off;
239
insn_end = insn_start + prog->len;
240
241
jited_linfo = &prog->aux->jited_linfo[linfo_idx];
242
jited_linfo[0] = prog->bpf_func;
243
244
nr_linfo = prog->aux->nr_linfo - linfo_idx;
245
246
for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
247
/* The verifier ensures that linfo[i].insn_off is
248
* strictly increasing
249
*/
250
jited_linfo[i] = prog->bpf_func +
251
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
252
}
253
254
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
255
gfp_t gfp_extra_flags)
256
{
257
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
258
struct bpf_prog *fp;
259
u32 pages;
260
261
size = round_up(size, PAGE_SIZE);
262
pages = size / PAGE_SIZE;
263
if (pages <= fp_old->pages)
264
return fp_old;
265
266
fp = __vmalloc(size, gfp_flags);
267
if (fp) {
268
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
269
fp->pages = pages;
270
fp->aux->prog = fp;
271
272
/* We keep fp->aux from fp_old around in the new
273
* reallocated structure.
274
*/
275
fp_old->aux = NULL;
276
fp_old->stats = NULL;
277
fp_old->active = NULL;
278
__bpf_prog_free(fp_old);
279
}
280
281
return fp;
282
}
283
284
void __bpf_prog_free(struct bpf_prog *fp)
285
{
286
if (fp->aux) {
287
mutex_destroy(&fp->aux->used_maps_mutex);
288
mutex_destroy(&fp->aux->dst_mutex);
289
kfree(fp->aux->poke_tab);
290
kfree(fp->aux);
291
}
292
free_percpu(fp->stats);
293
free_percpu(fp->active);
294
vfree(fp);
295
}
296
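/* Compute the program tag: a SHA-256 digest over the instruction image with
 * map fd/value immediates zeroed out, since those are not stable across
 * program loads from user space.
 */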
297
int bpf_prog_calc_tag(struct bpf_prog *fp)
298
{
299
size_t size = bpf_prog_insn_size(fp);
300
struct bpf_insn *dst;
301
bool was_ld_map;
302
u32 i;
303
304
dst = vmalloc(size);
305
if (!dst)
306
return -ENOMEM;
307
308
/* We need to take out the map fd for the digest calculation
309
* since they are unstable from user space side.
310
*/
311
for (i = 0, was_ld_map = false; i < fp->len; i++) {
312
dst[i] = fp->insnsi[i];
313
if (!was_ld_map &&
314
dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
315
(dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
316
dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
317
was_ld_map = true;
318
dst[i].imm = 0;
319
} else if (was_ld_map &&
320
dst[i].code == 0 &&
321
dst[i].dst_reg == 0 &&
322
dst[i].src_reg == 0 &&
323
dst[i].off == 0) {
324
was_ld_map = false;
325
dst[i].imm = 0;
326
} else {
327
was_ld_map = false;
328
}
329
}
330
sha256((u8 *)dst, size, fp->digest);
331
vfree(dst);
332
return 0;
333
}
334
335
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
336
s32 end_new, s32 curr, const bool probe_pass)
337
{
338
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
339
s32 delta = end_new - end_old;
340
s64 imm = insn->imm;
341
342
if (curr < pos && curr + imm + 1 >= end_old)
343
imm += delta;
344
else if (curr >= end_new && curr + imm + 1 < end_new)
345
imm -= delta;
346
if (imm < imm_min || imm > imm_max)
347
return -ERANGE;
348
if (!probe_pass)
349
insn->imm = imm;
350
return 0;
351
}
352
353
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
354
s32 end_new, s32 curr, const bool probe_pass)
355
{
356
s64 off_min, off_max, off;
357
s32 delta = end_new - end_old;
358
359
if (insn->code == (BPF_JMP32 | BPF_JA)) {
360
off = insn->imm;
361
off_min = S32_MIN;
362
off_max = S32_MAX;
363
} else {
364
off = insn->off;
365
off_min = S16_MIN;
366
off_max = S16_MAX;
367
}
368
369
if (curr < pos && curr + off + 1 >= end_old)
370
off += delta;
371
else if (curr >= end_new && curr + off + 1 < end_new)
372
off -= delta;
373
if (off < off_min || off > off_max)
374
return -ERANGE;
375
if (!probe_pass) {
376
if (insn->code == (BPF_JMP32 | BPF_JA))
377
insn->imm = off;
378
else
379
insn->off = off;
380
}
381
return 0;
382
}
383
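/* Fix up branch targets after the patched region [pos, end_old) has been
 * replaced by [pos, end_new): pseudo-call/ld_imm64 immediates and jump
 * offsets that cross the patched region are shifted by the size delta.
 * With probe_pass set, only check for target overflow without writing.
 */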
384
static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
385
s32 end_new, const bool probe_pass)
386
{
387
u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
388
struct bpf_insn *insn = prog->insnsi;
389
int ret = 0;
390
391
for (i = 0; i < insn_cnt; i++, insn++) {
392
u8 code;
393
394
/* In the probing pass we still operate on the original,
395
* unpatched image in order to check overflows before we
396
* do any other adjustments. Therefore skip the patchlet.
397
*/
398
if (probe_pass && i == pos) {
399
i = end_new;
400
insn = prog->insnsi + end_old;
401
}
402
if (bpf_pseudo_func(insn)) {
403
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
404
end_new, i, probe_pass);
405
if (ret)
406
return ret;
407
continue;
408
}
409
code = insn->code;
410
if ((BPF_CLASS(code) != BPF_JMP &&
411
BPF_CLASS(code) != BPF_JMP32) ||
412
BPF_OP(code) == BPF_EXIT)
413
continue;
414
/* Adjust offset of jmps if we cross patch boundaries. */
415
if (BPF_OP(code) == BPF_CALL) {
416
if (insn->src_reg != BPF_PSEUDO_CALL)
417
continue;
418
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
419
end_new, i, probe_pass);
420
} else {
421
ret = bpf_adj_delta_to_off(insn, pos, end_old,
422
end_new, i, probe_pass);
423
}
424
if (ret)
425
break;
426
}
427
428
return ret;
429
}
430
431
static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
432
{
433
struct bpf_line_info *linfo;
434
u32 i, nr_linfo;
435
436
nr_linfo = prog->aux->nr_linfo;
437
if (!nr_linfo || !delta)
438
return;
439
440
linfo = prog->aux->linfo;
441
442
for (i = 0; i < nr_linfo; i++)
443
if (off < linfo[i].insn_off)
444
break;
445
446
/* Push all off < linfo[i].insn_off by delta */
447
for (; i < nr_linfo; i++)
448
linfo[i].insn_off += delta;
449
}
450
451
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
452
const struct bpf_insn *patch, u32 len)
453
{
454
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
455
const u32 cnt_max = S16_MAX;
456
struct bpf_prog *prog_adj;
457
int err;
458
459
/* Since our patchlet doesn't expand the image, we're done. */
460
if (insn_delta == 0) {
461
memcpy(prog->insnsi + off, patch, sizeof(*patch));
462
return prog;
463
}
464
465
insn_adj_cnt = prog->len + insn_delta;
466
467
/* Reject anything that would potentially let the insn->off
468
* target overflow when we have excessive program expansions.
469
* We need to probe here before we do any reallocation where
470
* we afterwards may not fail anymore.
471
*/
472
if (insn_adj_cnt > cnt_max &&
473
(err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
474
return ERR_PTR(err);
475
476
/* Several new instructions need to be inserted. Make room
477
* for them. Likely, there's no need for a new allocation as
478
* last page could have large enough tailroom.
479
*/
480
prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
481
GFP_USER);
482
if (!prog_adj)
483
return ERR_PTR(-ENOMEM);
484
485
prog_adj->len = insn_adj_cnt;
486
487
/* Patching happens in 3 steps:
488
*
489
* 1) Move over tail of insnsi from next instruction onwards,
490
* so we can patch the single target insn with one or more
491
* new ones (patching is always from 1 to n insns, n > 0).
492
* 2) Inject new instructions at the target location.
493
* 3) Adjust branch offsets if necessary.
494
*/
495
insn_rest = insn_adj_cnt - off - len;
496
497
memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
498
sizeof(*patch) * insn_rest);
499
memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
500
501
/* We are guaranteed to not fail at this point, otherwise
502
* the ship has sailed to reverse to the original state. An
503
* overflow cannot happen at this point.
504
*/
505
BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
506
507
bpf_adj_linfo(prog_adj, off, insn_delta);
508
509
return prog_adj;
510
}
511
512
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
513
{
514
int err;
515
516
/* Branch offsets can't overflow when program is shrinking, no need
517
* to call bpf_adj_branches(..., true) here
518
*/
519
memmove(prog->insnsi + off, prog->insnsi + off + cnt,
520
sizeof(struct bpf_insn) * (prog->len - off - cnt));
521
prog->len -= cnt;
522
523
err = bpf_adj_branches(prog, off, off + cnt, off, false);
524
WARN_ON_ONCE(err);
525
return err;
526
}
527
528
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
529
{
530
int i;
531
532
for (i = 0; i < fp->aux->real_func_cnt; i++)
533
bpf_prog_kallsyms_del(fp->aux->func[i]);
534
}
535
536
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
537
{
538
bpf_prog_kallsyms_del_subprogs(fp);
539
bpf_prog_kallsyms_del(fp);
540
}
541
542
#ifdef CONFIG_BPF_JIT
543
/* All BPF JIT sysctl knobs here. */
544
int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
545
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
546
int bpf_jit_harden __read_mostly;
547
long bpf_jit_limit __read_mostly;
548
long bpf_jit_limit_max __read_mostly;
549
550
static void
551
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
552
{
553
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
554
555
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
556
prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
557
}
558
559
static void
560
bpf_prog_ksym_set_name(struct bpf_prog *prog)
561
{
562
char *sym = prog->aux->ksym.name;
563
const char *end = sym + KSYM_NAME_LEN;
564
const struct btf_type *type;
565
const char *func_name;
566
567
BUILD_BUG_ON(sizeof("bpf_prog_") +
568
sizeof(prog->tag) * 2 +
569
/* name has been null terminated.
570
* We should need +1 for the '_' preceding
571
* the name. However, the null character
572
* is double counted between the name and the
573
* sizeof("bpf_prog_") above, so we omit
574
* the +1 here.
575
*/
576
sizeof(prog->aux->name) > KSYM_NAME_LEN);
577
578
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
579
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
580
581
/* prog->aux->name will be ignored if full btf name is available */
582
if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
583
type = btf_type_by_id(prog->aux->btf,
584
prog->aux->func_info[prog->aux->func_idx].type_id);
585
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
586
snprintf(sym, (size_t)(end - sym), "_%s", func_name);
587
return;
588
}
589
590
if (prog->aux->name[0])
591
snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
592
else
593
*sym = 0;
594
}
595
596
static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
597
{
598
return container_of(n, struct bpf_ksym, tnode)->start;
599
}
600
601
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
602
struct latch_tree_node *b)
603
{
604
return bpf_get_ksym_start(a) < bpf_get_ksym_start(b);
605
}
606
607
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
608
{
609
unsigned long val = (unsigned long)key;
610
const struct bpf_ksym *ksym;
611
612
ksym = container_of(n, struct bpf_ksym, tnode);
613
614
if (val < ksym->start)
615
return -1;
616
/* Ensure that we detect return addresses as part of the program, when
617
* the final instruction is a call for a program part of the stack
618
* trace. Therefore, do val > ksym->end instead of val >= ksym->end.
619
*/
620
if (val > ksym->end)
621
return 1;
622
623
return 0;
624
}
625
626
static const struct latch_tree_ops bpf_tree_ops = {
627
.less = bpf_tree_less,
628
.comp = bpf_tree_comp,
629
};
630
631
static DEFINE_SPINLOCK(bpf_lock);
632
static LIST_HEAD(bpf_kallsyms);
633
static struct latch_tree_root bpf_tree __cacheline_aligned;
634
635
void bpf_ksym_add(struct bpf_ksym *ksym)
636
{
637
spin_lock_bh(&bpf_lock);
638
WARN_ON_ONCE(!list_empty(&ksym->lnode));
639
list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
640
latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
641
spin_unlock_bh(&bpf_lock);
642
}
643
644
static void __bpf_ksym_del(struct bpf_ksym *ksym)
645
{
646
if (list_empty(&ksym->lnode))
647
return;
648
649
latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
650
list_del_rcu(&ksym->lnode);
651
}
652
653
void bpf_ksym_del(struct bpf_ksym *ksym)
654
{
655
spin_lock_bh(&bpf_lock);
656
__bpf_ksym_del(ksym);
657
spin_unlock_bh(&bpf_lock);
658
}
659
660
static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
661
{
662
return fp->jited && !bpf_prog_was_classic(fp);
663
}
664
665
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
666
{
667
if (!bpf_prog_kallsyms_candidate(fp) ||
668
!bpf_token_capable(fp->aux->token, CAP_BPF))
669
return;
670
671
bpf_prog_ksym_set_addr(fp);
672
bpf_prog_ksym_set_name(fp);
673
fp->aux->ksym.prog = true;
674
675
bpf_ksym_add(&fp->aux->ksym);
676
677
#ifdef CONFIG_FINEIBT
678
/*
679
* When FineIBT, code in the __cfi_foo() symbols can get executed
680
* and hence unwinder needs help.
681
*/
682
if (cfi_mode != CFI_FINEIBT)
683
return;
684
685
snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
686
"__cfi_%s", fp->aux->ksym.name);
687
688
fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
689
fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
690
691
bpf_ksym_add(&fp->aux->ksym_prefix);
692
#endif
693
}
694
695
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
696
{
697
if (!bpf_prog_kallsyms_candidate(fp))
698
return;
699
700
bpf_ksym_del(&fp->aux->ksym);
701
#ifdef CONFIG_FINEIBT
702
if (cfi_mode != CFI_FINEIBT)
703
return;
704
bpf_ksym_del(&fp->aux->ksym_prefix);
705
#endif
706
}
707
708
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
709
{
710
struct latch_tree_node *n;
711
712
n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
713
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
714
}
715
716
int __bpf_address_lookup(unsigned long addr, unsigned long *size,
717
unsigned long *off, char *sym)
718
{
719
struct bpf_ksym *ksym;
720
int ret = 0;
721
722
rcu_read_lock();
723
ksym = bpf_ksym_find(addr);
724
if (ksym) {
725
unsigned long symbol_start = ksym->start;
726
unsigned long symbol_end = ksym->end;
727
728
ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);
729
730
if (size)
731
*size = symbol_end - symbol_start;
732
if (off)
733
*off = addr - symbol_start;
734
}
735
rcu_read_unlock();
736
737
return ret;
738
}
739
740
bool is_bpf_text_address(unsigned long addr)
741
{
742
bool ret;
743
744
rcu_read_lock();
745
ret = bpf_ksym_find(addr) != NULL;
746
rcu_read_unlock();
747
748
return ret;
749
}
750
751
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
752
{
753
struct bpf_ksym *ksym;
754
755
WARN_ON_ONCE(!rcu_read_lock_held());
756
ksym = bpf_ksym_find(addr);
757
758
return ksym && ksym->prog ?
759
container_of(ksym, struct bpf_prog_aux, ksym)->prog :
760
NULL;
761
}
762
763
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
764
{
765
const struct exception_table_entry *e = NULL;
766
struct bpf_prog *prog;
767
768
rcu_read_lock();
769
prog = bpf_prog_ksym_find(addr);
770
if (!prog)
771
goto out;
772
if (!prog->aux->num_exentries)
773
goto out;
774
775
e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
776
out:
777
rcu_read_unlock();
778
return e;
779
}
780
781
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
782
char *sym)
783
{
784
struct bpf_ksym *ksym;
785
unsigned int it = 0;
786
int ret = -ERANGE;
787
788
if (!bpf_jit_kallsyms_enabled())
789
return ret;
790
791
rcu_read_lock();
792
list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
793
if (it++ != symnum)
794
continue;
795
796
strscpy(sym, ksym->name, KSYM_NAME_LEN);
797
798
*value = ksym->start;
799
*type = BPF_SYM_ELF_TYPE;
800
801
ret = 0;
802
break;
803
}
804
rcu_read_unlock();
805
806
return ret;
807
}
808
809
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
810
struct bpf_jit_poke_descriptor *poke)
811
{
812
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
813
static const u32 poke_tab_max = 1024;
814
u32 slot = prog->aux->size_poke_tab;
815
u32 size = slot + 1;
816
817
if (size > poke_tab_max)
818
return -ENOSPC;
819
if (poke->tailcall_target || poke->tailcall_target_stable ||
820
poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
821
return -EINVAL;
822
823
switch (poke->reason) {
824
case BPF_POKE_REASON_TAIL_CALL:
825
if (!poke->tail_call.map)
826
return -EINVAL;
827
break;
828
default:
829
return -EINVAL;
830
}
831
832
tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
833
if (!tab)
834
return -ENOMEM;
835
836
memcpy(&tab[slot], poke, sizeof(*poke));
837
prog->aux->size_poke_tab = size;
838
prog->aux->poke_tab = tab;
839
840
return slot;
841
}
842
843
/*
844
* BPF program pack allocator.
845
*
846
* Most BPF programs are pretty small. Allocating a hole page for each
847
* program is sometime a waste. Many small bpf program also adds pressure
848
* to instruction TLB. To solve this issue, we introduce a BPF program pack
849
* allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
850
* to host BPF programs.
851
*/
852
#define BPF_PROG_CHUNK_SHIFT 6
853
#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
854
#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
855
856
struct bpf_prog_pack {
857
struct list_head list;
858
void *ptr;
859
unsigned long bitmap[];
860
};
861
862
void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
863
{
864
memset(area, 0, size);
865
}
866
867
#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
868
869
static DEFINE_MUTEX(pack_mutex);
870
static LIST_HEAD(pack_list);
871
872
/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
873
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
874
*/
875
#ifdef PMD_SIZE
876
/* PMD_SIZE is really big for some archs. It doesn't make sense to
877
* reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
878
* 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
879
* greater than or equal to 2MB.
880
*/
881
#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
882
#else
883
#define BPF_PROG_PACK_SIZE PAGE_SIZE
884
#endif
885
886
#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
887
888
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
889
{
890
struct bpf_prog_pack *pack;
891
int err;
892
893
pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
894
GFP_KERNEL);
895
if (!pack)
896
return NULL;
897
pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
898
if (!pack->ptr)
899
goto out;
900
bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
901
bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
902
903
set_vm_flush_reset_perms(pack->ptr);
904
err = set_memory_rox((unsigned long)pack->ptr,
905
BPF_PROG_PACK_SIZE / PAGE_SIZE);
906
if (err)
907
goto out;
908
list_add_tail(&pack->list, &pack_list);
909
return pack;
910
911
out:
912
bpf_jit_free_exec(pack->ptr);
913
kfree(pack);
914
return NULL;
915
}
916
917
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
918
{
919
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
920
struct bpf_prog_pack *pack;
921
unsigned long pos;
922
void *ptr = NULL;
923
924
mutex_lock(&pack_mutex);
925
if (size > BPF_PROG_PACK_SIZE) {
926
size = round_up(size, PAGE_SIZE);
927
ptr = bpf_jit_alloc_exec(size);
928
if (ptr) {
929
int err;
930
931
bpf_fill_ill_insns(ptr, size);
932
set_vm_flush_reset_perms(ptr);
933
err = set_memory_rox((unsigned long)ptr,
934
size / PAGE_SIZE);
935
if (err) {
936
bpf_jit_free_exec(ptr);
937
ptr = NULL;
938
}
939
}
940
goto out;
941
}
942
list_for_each_entry(pack, &pack_list, list) {
943
pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
944
nbits, 0);
945
if (pos < BPF_PROG_CHUNK_COUNT)
946
goto found_free_area;
947
}
948
949
pack = alloc_new_pack(bpf_fill_ill_insns);
950
if (!pack)
951
goto out;
952
953
pos = 0;
954
955
found_free_area:
956
bitmap_set(pack->bitmap, pos, nbits);
957
ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
958
959
out:
960
mutex_unlock(&pack_mutex);
961
return ptr;
962
}
963
964
void bpf_prog_pack_free(void *ptr, u32 size)
965
{
966
struct bpf_prog_pack *pack = NULL, *tmp;
967
unsigned int nbits;
968
unsigned long pos;
969
970
mutex_lock(&pack_mutex);
971
if (size > BPF_PROG_PACK_SIZE) {
972
bpf_jit_free_exec(ptr);
973
goto out;
974
}
975
976
list_for_each_entry(tmp, &pack_list, list) {
977
if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
978
pack = tmp;
979
break;
980
}
981
}
982
983
if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
984
goto out;
985
986
nbits = BPF_PROG_SIZE_TO_NBITS(size);
987
pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
988
989
WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
990
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
991
992
bitmap_clear(pack->bitmap, pos, nbits);
993
if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
994
BPF_PROG_CHUNK_COUNT, 0) == 0) {
995
list_del(&pack->list);
996
bpf_jit_free_exec(pack->ptr);
997
kfree(pack);
998
}
999
out:
1000
mutex_unlock(&pack_mutex);
1001
}
1002
1003
static atomic_long_t bpf_jit_current;
1004
1005
/* Can be overridden by an arch's JIT compiler if it has a custom,
1006
* dedicated BPF backend memory area, or if neither of the two
1007
* below apply.
1008
*/
1009
u64 __weak bpf_jit_alloc_exec_limit(void)
1010
{
1011
#if defined(MODULES_VADDR)
1012
return MODULES_END - MODULES_VADDR;
1013
#else
1014
return VMALLOC_END - VMALLOC_START;
1015
#endif
1016
}
1017
1018
static int __init bpf_jit_charge_init(void)
1019
{
1020
/* Only used as heuristic here to derive limit. */
1021
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
1022
bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
1023
PAGE_SIZE), LONG_MAX);
1024
return 0;
1025
}
1026
pure_initcall(bpf_jit_charge_init);
1027
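/* Charge @size bytes of JIT image memory against the global bpf_jit_limit.
 * Callers without bpf_capable() get -EPERM once the limit is exceeded and
 * the charge is rolled back; privileged callers may exceed the limit.
 */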
1028
int bpf_jit_charge_modmem(u32 size)
1029
{
1030
if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
1031
if (!bpf_capable()) {
1032
atomic_long_sub(size, &bpf_jit_current);
1033
return -EPERM;
1034
}
1035
}
1036
1037
return 0;
1038
}
1039
1040
void bpf_jit_uncharge_modmem(u32 size)
1041
{
1042
atomic_long_sub(size, &bpf_jit_current);
1043
}
1044
1045
void *__weak bpf_jit_alloc_exec(unsigned long size)
1046
{
1047
return execmem_alloc(EXECMEM_BPF, size);
1048
}
1049
1050
void __weak bpf_jit_free_exec(void *addr)
1051
{
1052
execmem_free(addr);
1053
}
1054
1055
struct bpf_binary_header *
1056
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
1057
unsigned int alignment,
1058
bpf_jit_fill_hole_t bpf_fill_ill_insns)
1059
{
1060
struct bpf_binary_header *hdr;
1061
u32 size, hole, start;
1062
1063
WARN_ON_ONCE(!is_power_of_2(alignment) ||
1064
alignment > BPF_IMAGE_ALIGNMENT);
1065
1066
/* Most of BPF filters are really small, but if some of them
1067
* fill a page, allow at least 128 extra bytes to insert a
1068
* random section of illegal instructions.
1069
*/
1070
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
1071
1072
if (bpf_jit_charge_modmem(size))
1073
return NULL;
1074
hdr = bpf_jit_alloc_exec(size);
1075
if (!hdr) {
1076
bpf_jit_uncharge_modmem(size);
1077
return NULL;
1078
}
1079
1080
/* Fill space with illegal/arch-dep instructions. */
1081
bpf_fill_ill_insns(hdr, size);
1082
1083
hdr->size = size;
1084
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
1085
PAGE_SIZE - sizeof(*hdr));
1086
start = get_random_u32_below(hole) & ~(alignment - 1);
1087
1088
/* Leave a random number of instructions before BPF code. */
1089
*image_ptr = &hdr->image[start];
1090
1091
return hdr;
1092
}
1093
1094
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
1095
{
1096
u32 size = hdr->size;
1097
1098
bpf_jit_free_exec(hdr);
1099
bpf_jit_uncharge_modmem(size);
1100
}
1101
1102
/* Allocate jit binary from bpf_prog_pack allocator.
1103
* Since the allocated memory is RO+X, the JIT engine cannot write directly
1104
* to the memory. To solve this problem, a RW buffer is also allocated at
1105
* as the same time. The JIT engine should calculate offsets based on the
1106
* RO memory address, but write JITed program to the RW buffer. Once the
1107
* JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
1108
* the JITed program to the RO memory.
1109
*/
1110
struct bpf_binary_header *
1111
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
1112
unsigned int alignment,
1113
struct bpf_binary_header **rw_header,
1114
u8 **rw_image,
1115
bpf_jit_fill_hole_t bpf_fill_ill_insns)
1116
{
1117
struct bpf_binary_header *ro_header;
1118
u32 size, hole, start;
1119
1120
WARN_ON_ONCE(!is_power_of_2(alignment) ||
1121
alignment > BPF_IMAGE_ALIGNMENT);
1122
1123
/* add 16 bytes for a random section of illegal instructions */
1124
size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
1125
1126
if (bpf_jit_charge_modmem(size))
1127
return NULL;
1128
ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
1129
if (!ro_header) {
1130
bpf_jit_uncharge_modmem(size);
1131
return NULL;
1132
}
1133
1134
*rw_header = kvmalloc(size, GFP_KERNEL);
1135
if (!*rw_header) {
1136
bpf_prog_pack_free(ro_header, size);
1137
bpf_jit_uncharge_modmem(size);
1138
return NULL;
1139
}
1140
1141
/* Fill space with illegal/arch-dep instructions. */
1142
bpf_fill_ill_insns(*rw_header, size);
1143
(*rw_header)->size = size;
1144
1145
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
1146
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
1147
start = get_random_u32_below(hole) & ~(alignment - 1);
1148
1149
*image_ptr = &ro_header->image[start];
1150
*rw_image = &(*rw_header)->image[start];
1151
1152
return ro_header;
1153
}
1154
1155
/* Copy JITed text from rw_header to its final location, the ro_header. */
1156
int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
1157
struct bpf_binary_header *rw_header)
1158
{
1159
void *ptr;
1160
1161
ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
1162
1163
kvfree(rw_header);
1164
1165
if (IS_ERR(ptr)) {
1166
bpf_prog_pack_free(ro_header, ro_header->size);
1167
return PTR_ERR(ptr);
1168
}
1169
return 0;
1170
}
1171
1172
/* bpf_jit_binary_pack_free is called in two different scenarios:
1173
* 1) when the program is freed after;
1174
* 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
1175
* For case 2), we need to free both the RO memory and the RW buffer.
1176
*
1177
* bpf_jit_binary_pack_free requires proper ro_header->size. However,
1178
* bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
1179
* must be set with either bpf_jit_binary_pack_finalize (normal path) or
1180
* bpf_arch_text_copy (when jit fails).
1181
*/
1182
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
1183
struct bpf_binary_header *rw_header)
1184
{
1185
u32 size = ro_header->size;
1186
1187
bpf_prog_pack_free(ro_header, size);
1188
kvfree(rw_header);
1189
bpf_jit_uncharge_modmem(size);
1190
}
1191
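/* Recover the pack-allocator header of a JITed program by rounding its
 * bpf_func address down to the enclosing BPF_PROG_CHUNK_SIZE boundary.
 */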
1192
struct bpf_binary_header *
1193
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
1194
{
1195
unsigned long real_start = (unsigned long)fp->bpf_func;
1196
unsigned long addr;
1197
1198
addr = real_start & BPF_PROG_CHUNK_MASK;
1199
return (void *)addr;
1200
}
1201
1202
static inline struct bpf_binary_header *
1203
bpf_jit_binary_hdr(const struct bpf_prog *fp)
1204
{
1205
unsigned long real_start = (unsigned long)fp->bpf_func;
1206
unsigned long addr;
1207
1208
addr = real_start & PAGE_MASK;
1209
return (void *)addr;
1210
}
1211
1212
/* This symbol is only overridden by archs that have different
1213
* requirements than the usual eBPF JITs, f.e. when they only
1214
* implement cBPF JIT, do not set images read-only, etc.
1215
*/
1216
void __weak bpf_jit_free(struct bpf_prog *fp)
1217
{
1218
if (fp->jited) {
1219
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
1220
1221
bpf_jit_binary_free(hdr);
1222
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
1223
}
1224
1225
bpf_prog_unlock_free(fp);
1226
}
1227
1228
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
1229
const struct bpf_insn *insn, bool extra_pass,
1230
u64 *func_addr, bool *func_addr_fixed)
1231
{
1232
s16 off = insn->off;
1233
s32 imm = insn->imm;
1234
u8 *addr;
1235
int err;
1236
1237
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
1238
if (!*func_addr_fixed) {
1239
/* Place-holder address till the last pass has collected
1240
* all addresses for JITed subprograms in which case we
1241
* can pick them up from prog->aux.
1242
*/
1243
if (!extra_pass)
1244
addr = NULL;
1245
else if (prog->aux->func &&
1246
off >= 0 && off < prog->aux->real_func_cnt)
1247
addr = (u8 *)prog->aux->func[off]->bpf_func;
1248
else
1249
return -EINVAL;
1250
} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
1251
bpf_jit_supports_far_kfunc_call()) {
1252
err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
1253
if (err)
1254
return err;
1255
} else {
1256
/* Address of a BPF helper call. Since part of the core
1257
* kernel, it's always at a fixed location. __bpf_call_base
1258
* and the helper with imm relative to it are both in core
1259
* kernel.
1260
*/
1261
addr = (u8 *)__bpf_call_base + imm;
1262
}
1263
1264
*func_addr = (unsigned long)addr;
1265
return 0;
1266
}
1267
1268
const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
1269
{
1270
if (prog->aux->ksym.prog)
1271
return prog->aux->ksym.name;
1272
return prog->aux->name;
1273
}
1274
1275
static int bpf_jit_blind_insn(const struct bpf_insn *from,
1276
const struct bpf_insn *aux,
1277
struct bpf_insn *to_buff,
1278
bool emit_zext)
1279
{
1280
struct bpf_insn *to = to_buff;
1281
u32 imm_rnd = get_random_u32();
1282
s16 off;
1283
1284
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
1285
BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
1286
1287
/* Constraints on AX register:
1288
*
1289
* AX register is inaccessible from user space. It is mapped in
1290
* all JITs, and used here for constant blinding rewrites. It is
1291
* typically "stateless" meaning its contents are only valid within
1292
* the executed instruction, but not across several instructions.
1293
* There are a few exceptions however which are further detailed
1294
* below.
1295
*
1296
* Constant blinding is only used by JITs, not in the interpreter.
1297
* The interpreter uses AX in some occasions as a local temporary
1298
* register e.g. in DIV or MOD instructions.
1299
*
1300
* In restricted circumstances, the verifier can also use the AX
1301
* register for rewrites as long as they do not interfere with
1302
* the above cases!
1303
*/
1304
if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
1305
goto out;
1306
1307
if (from->imm == 0 &&
1308
(from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
1309
from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
1310
*to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
1311
goto out;
1312
}
1313
1314
switch (from->code) {
1315
case BPF_ALU | BPF_ADD | BPF_K:
1316
case BPF_ALU | BPF_SUB | BPF_K:
1317
case BPF_ALU | BPF_AND | BPF_K:
1318
case BPF_ALU | BPF_OR | BPF_K:
1319
case BPF_ALU | BPF_XOR | BPF_K:
1320
case BPF_ALU | BPF_MUL | BPF_K:
1321
case BPF_ALU | BPF_MOV | BPF_K:
1322
case BPF_ALU | BPF_DIV | BPF_K:
1323
case BPF_ALU | BPF_MOD | BPF_K:
1324
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1325
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1326
*to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
1327
break;
1328
1329
case BPF_ALU64 | BPF_ADD | BPF_K:
1330
case BPF_ALU64 | BPF_SUB | BPF_K:
1331
case BPF_ALU64 | BPF_AND | BPF_K:
1332
case BPF_ALU64 | BPF_OR | BPF_K:
1333
case BPF_ALU64 | BPF_XOR | BPF_K:
1334
case BPF_ALU64 | BPF_MUL | BPF_K:
1335
case BPF_ALU64 | BPF_MOV | BPF_K:
1336
case BPF_ALU64 | BPF_DIV | BPF_K:
1337
case BPF_ALU64 | BPF_MOD | BPF_K:
1338
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1339
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1340
*to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
1341
break;
1342
1343
case BPF_JMP | BPF_JEQ | BPF_K:
1344
case BPF_JMP | BPF_JNE | BPF_K:
1345
case BPF_JMP | BPF_JGT | BPF_K:
1346
case BPF_JMP | BPF_JLT | BPF_K:
1347
case BPF_JMP | BPF_JGE | BPF_K:
1348
case BPF_JMP | BPF_JLE | BPF_K:
1349
case BPF_JMP | BPF_JSGT | BPF_K:
1350
case BPF_JMP | BPF_JSLT | BPF_K:
1351
case BPF_JMP | BPF_JSGE | BPF_K:
1352
case BPF_JMP | BPF_JSLE | BPF_K:
1353
case BPF_JMP | BPF_JSET | BPF_K:
1354
/* Accommodate for extra offset in case of a backjump. */
1355
off = from->off;
1356
if (off < 0)
1357
off -= 2;
1358
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1359
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1360
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
1361
break;
1362
1363
case BPF_JMP32 | BPF_JEQ | BPF_K:
1364
case BPF_JMP32 | BPF_JNE | BPF_K:
1365
case BPF_JMP32 | BPF_JGT | BPF_K:
1366
case BPF_JMP32 | BPF_JLT | BPF_K:
1367
case BPF_JMP32 | BPF_JGE | BPF_K:
1368
case BPF_JMP32 | BPF_JLE | BPF_K:
1369
case BPF_JMP32 | BPF_JSGT | BPF_K:
1370
case BPF_JMP32 | BPF_JSLT | BPF_K:
1371
case BPF_JMP32 | BPF_JSGE | BPF_K:
1372
case BPF_JMP32 | BPF_JSLE | BPF_K:
1373
case BPF_JMP32 | BPF_JSET | BPF_K:
1374
/* Accommodate for extra offset in case of a backjump. */
1375
off = from->off;
1376
if (off < 0)
1377
off -= 2;
1378
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1379
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1380
*to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
1381
off);
1382
break;
1383
1384
case BPF_LD | BPF_IMM | BPF_DW:
1385
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
1386
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1387
*to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
1388
*to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
1389
break;
1390
case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
1391
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
1392
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1393
if (emit_zext)
1394
*to++ = BPF_ZEXT_REG(BPF_REG_AX);
1395
*to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
1396
break;
1397
1398
case BPF_ST | BPF_MEM | BPF_DW:
1399
case BPF_ST | BPF_MEM | BPF_W:
1400
case BPF_ST | BPF_MEM | BPF_H:
1401
case BPF_ST | BPF_MEM | BPF_B:
1402
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1403
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1404
*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
1405
break;
1406
}
1407
out:
1408
return to - to_buff;
1409
}
1410
1411
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
1412
gfp_t gfp_extra_flags)
1413
{
1414
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
1415
struct bpf_prog *fp;
1416
1417
fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
1418
if (fp != NULL) {
1419
/* aux->prog still points to the fp_other one, so
1420
* when promoting the clone to the real program,
1421
* this still needs to be adapted.
1422
*/
1423
memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
1424
}
1425
1426
return fp;
1427
}
1428
1429
static void bpf_prog_clone_free(struct bpf_prog *fp)
1430
{
1431
/* aux was stolen by the other clone, so we cannot free
1432
* it from this path! It will be freed eventually by the
1433
* other program on release.
1434
*
1435
* At this point, we don't need a deferred release since
1436
* clone is guaranteed to not be locked.
1437
*/
1438
fp->aux = NULL;
1439
fp->stats = NULL;
1440
fp->active = NULL;
1441
__bpf_prog_free(fp);
1442
}
1443
1444
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
1445
{
1446
/* We have to repoint aux->prog to self, as we don't
1447
* know whether fp here is the clone or the original.
1448
*/
1449
fp->aux->prog = fp;
1450
bpf_prog_clone_free(fp_other);
1451
}
1452
1453
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
1454
{
1455
struct bpf_insn insn_buff[16], aux[2];
1456
struct bpf_prog *clone, *tmp;
1457
int insn_delta, insn_cnt;
1458
struct bpf_insn *insn;
1459
int i, rewritten;
1460
1461
if (!prog->blinding_requested || prog->blinded)
1462
return prog;
1463
1464
clone = bpf_prog_clone_create(prog, GFP_USER);
1465
if (!clone)
1466
return ERR_PTR(-ENOMEM);
1467
1468
insn_cnt = clone->len;
1469
insn = clone->insnsi;
1470
1471
for (i = 0; i < insn_cnt; i++, insn++) {
1472
if (bpf_pseudo_func(insn)) {
1473
/* ld_imm64 with an address of bpf subprog is not
1474
* a user controlled constant. Don't randomize it,
1475
* since it will conflict with jit_subprogs() logic.
1476
*/
1477
insn++;
1478
i++;
1479
continue;
1480
}
1481
1482
/* We temporarily need to hold the original ld64 insn
1483
* so that we can still access the first part in the
1484
* second blinding run.
1485
*/
1486
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
1487
insn[1].code == 0)
1488
memcpy(aux, insn, sizeof(aux));
1489
1490
rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
1491
clone->aux->verifier_zext);
1492
if (!rewritten)
1493
continue;
1494
1495
tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
1496
if (IS_ERR(tmp)) {
1497
/* Patching may have repointed aux->prog during
1498
* realloc from the original one, so we need to
1499
* fix it up here on error.
1500
*/
1501
bpf_jit_prog_release_other(prog, clone);
1502
return tmp;
1503
}
1504
1505
clone = tmp;
1506
insn_delta = rewritten - 1;
1507
1508
/* Walk new program and skip insns we just inserted. */
1509
insn = clone->insnsi + i + insn_delta;
1510
insn_cnt += insn_delta;
1511
i += insn_delta;
1512
}
1513
1514
clone->blinded = 1;
1515
return clone;
1516
}
1517
#endif /* CONFIG_BPF_JIT */
1518
1519
/* Base function for offset calculation. Needs to go into .text section,
1520
* therefore keeping it non-static as well; will also be used by JITs
1521
* anyway later on, so do not let the compiler omit it. This also needs
1522
* to go into kallsyms for correlation from e.g. bpftool, so naming
1523
* must not change.
1524
*/
1525
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1526
{
1527
return 0;
1528
}
1529
EXPORT_SYMBOL_GPL(__bpf_call_base);
1530
1531
/* All UAPI available opcodes. */
1532
#define BPF_INSN_MAP(INSN_2, INSN_3) \
1533
/* 32 bit ALU operations. */ \
1534
/* Register based. */ \
1535
INSN_3(ALU, ADD, X), \
1536
INSN_3(ALU, SUB, X), \
1537
INSN_3(ALU, AND, X), \
1538
INSN_3(ALU, OR, X), \
1539
INSN_3(ALU, LSH, X), \
1540
INSN_3(ALU, RSH, X), \
1541
INSN_3(ALU, XOR, X), \
1542
INSN_3(ALU, MUL, X), \
1543
INSN_3(ALU, MOV, X), \
1544
INSN_3(ALU, ARSH, X), \
1545
INSN_3(ALU, DIV, X), \
1546
INSN_3(ALU, MOD, X), \
1547
INSN_2(ALU, NEG), \
1548
INSN_3(ALU, END, TO_BE), \
1549
INSN_3(ALU, END, TO_LE), \
1550
/* Immediate based. */ \
1551
INSN_3(ALU, ADD, K), \
1552
INSN_3(ALU, SUB, K), \
1553
INSN_3(ALU, AND, K), \
1554
INSN_3(ALU, OR, K), \
1555
INSN_3(ALU, LSH, K), \
1556
INSN_3(ALU, RSH, K), \
1557
INSN_3(ALU, XOR, K), \
1558
INSN_3(ALU, MUL, K), \
1559
INSN_3(ALU, MOV, K), \
1560
INSN_3(ALU, ARSH, K), \
1561
INSN_3(ALU, DIV, K), \
1562
INSN_3(ALU, MOD, K), \
1563
/* 64 bit ALU operations. */ \
1564
/* Register based. */ \
1565
INSN_3(ALU64, ADD, X), \
1566
INSN_3(ALU64, SUB, X), \
1567
INSN_3(ALU64, AND, X), \
1568
INSN_3(ALU64, OR, X), \
1569
INSN_3(ALU64, LSH, X), \
1570
INSN_3(ALU64, RSH, X), \
1571
INSN_3(ALU64, XOR, X), \
1572
INSN_3(ALU64, MUL, X), \
1573
INSN_3(ALU64, MOV, X), \
1574
INSN_3(ALU64, ARSH, X), \
1575
INSN_3(ALU64, DIV, X), \
1576
INSN_3(ALU64, MOD, X), \
1577
INSN_2(ALU64, NEG), \
1578
INSN_3(ALU64, END, TO_LE), \
1579
/* Immediate based. */ \
1580
INSN_3(ALU64, ADD, K), \
1581
INSN_3(ALU64, SUB, K), \
1582
INSN_3(ALU64, AND, K), \
1583
INSN_3(ALU64, OR, K), \
1584
INSN_3(ALU64, LSH, K), \
1585
INSN_3(ALU64, RSH, K), \
1586
INSN_3(ALU64, XOR, K), \
1587
INSN_3(ALU64, MUL, K), \
1588
INSN_3(ALU64, MOV, K), \
1589
INSN_3(ALU64, ARSH, K), \
1590
INSN_3(ALU64, DIV, K), \
1591
INSN_3(ALU64, MOD, K), \
1592
/* Call instruction. */ \
1593
INSN_2(JMP, CALL), \
1594
/* Exit instruction. */ \
1595
INSN_2(JMP, EXIT), \
1596
/* 32-bit Jump instructions. */ \
1597
/* Register based. */ \
1598
INSN_3(JMP32, JEQ, X), \
1599
INSN_3(JMP32, JNE, X), \
1600
INSN_3(JMP32, JGT, X), \
1601
INSN_3(JMP32, JLT, X), \
1602
INSN_3(JMP32, JGE, X), \
1603
INSN_3(JMP32, JLE, X), \
1604
INSN_3(JMP32, JSGT, X), \
1605
INSN_3(JMP32, JSLT, X), \
1606
INSN_3(JMP32, JSGE, X), \
1607
INSN_3(JMP32, JSLE, X), \
1608
INSN_3(JMP32, JSET, X), \
1609
/* Immediate based. */ \
1610
INSN_3(JMP32, JEQ, K), \
1611
INSN_3(JMP32, JNE, K), \
1612
INSN_3(JMP32, JGT, K), \
1613
INSN_3(JMP32, JLT, K), \
1614
INSN_3(JMP32, JGE, K), \
1615
INSN_3(JMP32, JLE, K), \
1616
INSN_3(JMP32, JSGT, K), \
1617
INSN_3(JMP32, JSLT, K), \
1618
INSN_3(JMP32, JSGE, K), \
1619
INSN_3(JMP32, JSLE, K), \
1620
INSN_3(JMP32, JSET, K), \
1621
/* Jump instructions. */ \
1622
/* Register based. */ \
1623
INSN_3(JMP, JEQ, X), \
1624
INSN_3(JMP, JNE, X), \
1625
INSN_3(JMP, JGT, X), \
1626
INSN_3(JMP, JLT, X), \
1627
INSN_3(JMP, JGE, X), \
1628
INSN_3(JMP, JLE, X), \
1629
INSN_3(JMP, JSGT, X), \
1630
INSN_3(JMP, JSLT, X), \
1631
INSN_3(JMP, JSGE, X), \
1632
INSN_3(JMP, JSLE, X), \
1633
INSN_3(JMP, JSET, X), \
1634
/* Immediate based. */ \
1635
INSN_3(JMP, JEQ, K), \
1636
INSN_3(JMP, JNE, K), \
1637
INSN_3(JMP, JGT, K), \
1638
INSN_3(JMP, JLT, K), \
1639
INSN_3(JMP, JGE, K), \
1640
INSN_3(JMP, JLE, K), \
1641
INSN_3(JMP, JSGT, K), \
1642
INSN_3(JMP, JSLT, K), \
1643
INSN_3(JMP, JSGE, K), \
1644
INSN_3(JMP, JSLE, K), \
1645
INSN_3(JMP, JSET, K), \
1646
INSN_2(JMP, JA), \
1647
INSN_2(JMP32, JA), \
1648
/* Atomic operations. */ \
1649
INSN_3(STX, ATOMIC, B), \
1650
INSN_3(STX, ATOMIC, H), \
1651
INSN_3(STX, ATOMIC, W), \
1652
INSN_3(STX, ATOMIC, DW), \
1653
/* Store instructions. */ \
1654
/* Register based. */ \
1655
INSN_3(STX, MEM, B), \
1656
INSN_3(STX, MEM, H), \
1657
INSN_3(STX, MEM, W), \
1658
INSN_3(STX, MEM, DW), \
1659
/* Immediate based. */ \
1660
INSN_3(ST, MEM, B), \
1661
INSN_3(ST, MEM, H), \
1662
INSN_3(ST, MEM, W), \
1663
INSN_3(ST, MEM, DW), \
1664
/* Load instructions. */ \
1665
/* Register based. */ \
1666
INSN_3(LDX, MEM, B), \
1667
INSN_3(LDX, MEM, H), \
1668
INSN_3(LDX, MEM, W), \
1669
INSN_3(LDX, MEM, DW), \
1670
INSN_3(LDX, MEMSX, B), \
1671
INSN_3(LDX, MEMSX, H), \
1672
INSN_3(LDX, MEMSX, W), \
1673
/* Immediate based. */ \
1674
INSN_3(LD, IMM, DW)
1675
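/* Return true if @code is an opcode exposed via UAPI: either part of the
 * instruction map above or one of the additionally listed opcodes that are
 * rewritten before execution.
 */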
1676
bool bpf_opcode_in_insntable(u8 code)
1677
{
1678
#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true
1679
#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
1680
static const bool public_insntable[256] = {
1681
[0 ... 255] = false,
1682
/* Now overwrite non-defaults ... */
1683
BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
1684
/* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
1685
[BPF_LD | BPF_ABS | BPF_B] = true,
1686
[BPF_LD | BPF_ABS | BPF_H] = true,
1687
[BPF_LD | BPF_ABS | BPF_W] = true,
1688
[BPF_LD | BPF_IND | BPF_B] = true,
1689
[BPF_LD | BPF_IND | BPF_H] = true,
1690
[BPF_LD | BPF_IND | BPF_W] = true,
1691
[BPF_JMP | BPF_JCOND] = true,
1692
};
1693
#undef BPF_INSN_3_TBL
1694
#undef BPF_INSN_2_TBL
1695
return public_insntable[code];
1696
}
1697
1698
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1699
/**
1700
* ___bpf_prog_run - run eBPF program on a given context
1701
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
1702
* @insn: is the array of eBPF instructions
1703
*
1704
* Decode and execute eBPF instructions.
1705
*
1706
* Return: whatever value is in %BPF_R0 at program exit
1707
*/
1708
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
1709
{
1710
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
1711
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
1712
static const void * const jumptable[256] __annotate_jump_table = {
1713
[0 ... 255] = &&default_label,
1714
/* Now overwrite non-defaults ... */
1715
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
1716
/* Non-UAPI available opcodes. */
1717
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
1718
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
1719
[BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
1720
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
1721
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
1722
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
1723
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
1724
[BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
1725
[BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
1726
[BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
1727
};
1728
#undef BPF_INSN_3_LBL
1729
#undef BPF_INSN_2_LBL
1730
u32 tail_call_cnt = 0;
1731
1732
#define CONT ({ insn++; goto select_insn; })
1733
#define CONT_JMP ({ insn++; goto select_insn; })
1734
1735
select_insn:
1736
goto *jumptable[insn->code];
1737
1738
/* Explicitly mask the register-based shift amounts with 63 or 31
1739
* to avoid undefined behavior. Normally this won't affect the
1740
* generated code, for example, in case of native 64 bit archs such
1741
* as x86-64 or arm64, the compiler is optimizing the AND away for
1742
* the interpreter. In case of JITs, each of the JIT backends compiles
1743
* the BPF shift operations to machine instructions which produce
1744
* implementation-defined results in such a case; the resulting
1745
* contents of the register may be arbitrary, but program behaviour
1746
* as a whole remains defined. In other words, in case of JIT backends,
1747
* the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
1748
*/
1749
/* ALU (shifts) */
1750
#define SHT(OPCODE, OP) \
1751
ALU64_##OPCODE##_X: \
1752
DST = DST OP (SRC & 63); \
1753
CONT; \
1754
ALU_##OPCODE##_X: \
1755
DST = (u32) DST OP ((u32) SRC & 31); \
1756
CONT; \
1757
ALU64_##OPCODE##_K: \
1758
DST = DST OP IMM; \
1759
CONT; \
1760
ALU_##OPCODE##_K: \
1761
DST = (u32) DST OP (u32) IMM; \
1762
CONT;
1763
/* ALU (rest) */
1764
#define ALU(OPCODE, OP) \
1765
ALU64_##OPCODE##_X: \
1766
DST = DST OP SRC; \
1767
CONT; \
1768
ALU_##OPCODE##_X: \
1769
DST = (u32) DST OP (u32) SRC; \
1770
CONT; \
1771
ALU64_##OPCODE##_K: \
1772
DST = DST OP IMM; \
1773
CONT; \
1774
ALU_##OPCODE##_K: \
1775
DST = (u32) DST OP (u32) IMM; \
1776
CONT;
1777
ALU(ADD, +)
1778
ALU(SUB, -)
1779
ALU(AND, &)
1780
ALU(OR, |)
1781
ALU(XOR, ^)
1782
ALU(MUL, *)
1783
SHT(LSH, <<)
1784
SHT(RSH, >>)
1785
#undef SHT
1786
#undef ALU
1787
ALU_NEG:
1788
DST = (u32) -DST;
1789
CONT;
1790
ALU64_NEG:
1791
DST = -DST;
1792
CONT;
1793
ALU_MOV_X:
1794
switch (OFF) {
1795
case 0:
1796
DST = (u32) SRC;
1797
break;
1798
case 8:
1799
DST = (u32)(s8) SRC;
1800
break;
1801
case 16:
1802
DST = (u32)(s16) SRC;
1803
break;
1804
}
1805
CONT;
1806
ALU_MOV_K:
1807
DST = (u32) IMM;
1808
CONT;
1809
ALU64_MOV_X:
1810
switch (OFF) {
1811
case 0:
1812
DST = SRC;
1813
break;
1814
case 8:
1815
DST = (s8) SRC;
1816
break;
1817
case 16:
1818
DST = (s16) SRC;
1819
break;
1820
case 32:
1821
DST = (s32) SRC;
1822
break;
1823
}
1824
CONT;
1825
ALU64_MOV_K:
1826
DST = IMM;
1827
CONT;
1828
LD_IMM_DW:
1829
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
1830
insn++;
1831
CONT;
1832
ALU_ARSH_X:
1833
DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
1834
CONT;
1835
ALU_ARSH_K:
1836
DST = (u64) (u32) (((s32) DST) >> IMM);
1837
CONT;
1838
ALU64_ARSH_X:
1839
(*(s64 *) &DST) >>= (SRC & 63);
1840
CONT;
1841
ALU64_ARSH_K:
1842
(*(s64 *) &DST) >>= IMM;
1843
CONT;
1844
ALU64_MOD_X:
1845
switch (OFF) {
1846
case 0:
1847
div64_u64_rem(DST, SRC, &AX);
1848
DST = AX;
1849
break;
1850
case 1:
1851
AX = div64_s64(DST, SRC);
1852
DST = DST - AX * SRC;
1853
break;
1854
}
1855
CONT;
1856
ALU_MOD_X:
1857
switch (OFF) {
1858
case 0:
1859
AX = (u32) DST;
1860
DST = do_div(AX, (u32) SRC);
1861
break;
1862
case 1:
1863
AX = abs((s32)DST);
1864
AX = do_div(AX, abs((s32)SRC));
1865
if ((s32)DST < 0)
1866
DST = (u32)-AX;
1867
else
1868
DST = (u32)AX;
1869
break;
1870
}
1871
CONT;
1872
ALU64_MOD_K:
1873
switch (OFF) {
1874
case 0:
1875
div64_u64_rem(DST, IMM, &AX);
1876
DST = AX;
1877
break;
1878
case 1:
1879
AX = div64_s64(DST, IMM);
1880
DST = DST - AX * IMM;
1881
break;
1882
}
1883
CONT;
1884
ALU_MOD_K:
1885
switch (OFF) {
1886
case 0:
1887
AX = (u32) DST;
1888
DST = do_div(AX, (u32) IMM);
1889
break;
1890
case 1:
1891
AX = abs((s32)DST);
1892
AX = do_div(AX, abs((s32)IMM));
1893
if ((s32)DST < 0)
1894
DST = (u32)-AX;
1895
else
1896
DST = (u32)AX;
1897
break;
1898
}
1899
CONT;
1900
ALU64_DIV_X:
1901
switch (OFF) {
1902
case 0:
1903
DST = div64_u64(DST, SRC);
1904
break;
1905
case 1:
1906
DST = div64_s64(DST, SRC);
1907
break;
1908
}
1909
CONT;
1910
ALU_DIV_X:
1911
switch (OFF) {
1912
case 0:
1913
AX = (u32) DST;
1914
do_div(AX, (u32) SRC);
1915
DST = (u32) AX;
1916
break;
1917
case 1:
1918
AX = abs((s32)DST);
1919
do_div(AX, abs((s32)SRC));
1920
if (((s32)DST < 0) == ((s32)SRC < 0))
1921
DST = (u32)AX;
1922
else
1923
DST = (u32)-AX;
1924
break;
1925
}
1926
CONT;
1927
ALU64_DIV_K:
1928
switch (OFF) {
1929
case 0:
1930
DST = div64_u64(DST, IMM);
1931
break;
1932
case 1:
1933
DST = div64_s64(DST, IMM);
1934
break;
1935
}
1936
CONT;
1937
ALU_DIV_K:
1938
switch (OFF) {
1939
case 0:
1940
AX = (u32) DST;
1941
do_div(AX, (u32) IMM);
1942
DST = (u32) AX;
1943
break;
1944
case 1:
1945
AX = abs((s32)DST);
1946
do_div(AX, abs((s32)IMM));
1947
if (((s32)DST < 0) == ((s32)IMM < 0))
1948
DST = (u32)AX;
1949
else
1950
DST = (u32)-AX;
1951
break;
1952
}
1953
CONT;
1954
ALU_END_TO_BE:
1955
switch (IMM) {
1956
case 16:
1957
DST = (__force u16) cpu_to_be16(DST);
1958
break;
1959
case 32:
1960
DST = (__force u32) cpu_to_be32(DST);
1961
break;
1962
case 64:
1963
DST = (__force u64) cpu_to_be64(DST);
1964
break;
1965
}
1966
CONT;
1967
ALU_END_TO_LE:
1968
switch (IMM) {
1969
case 16:
1970
DST = (__force u16) cpu_to_le16(DST);
1971
break;
1972
case 32:
1973
DST = (__force u32) cpu_to_le32(DST);
1974
break;
1975
case 64:
1976
DST = (__force u64) cpu_to_le64(DST);
1977
break;
1978
}
1979
CONT;
1980
ALU64_END_TO_LE:
1981
switch (IMM) {
1982
case 16:
1983
DST = (__force u16) __swab16(DST);
1984
break;
1985
case 32:
1986
DST = (__force u32) __swab32(DST);
1987
break;
1988
case 64:
1989
DST = (__force u64) __swab64(DST);
1990
break;
1991
}
1992
CONT;
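/* Note: BPF_END under BPF_ALU converts between host and the requested
* endianness, so the cpu_to_be*()/cpu_to_le*() calls above may be
* no-ops depending on the host. BPF_END under BPF_ALU64 is the
* unconditional byte swap and therefore always goes through __swab*().
*/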
1993
1994
/* CALL */
1995
JMP_CALL:
1996
/* Function call scratches BPF_R1-BPF_R5 registers,
1997
* preserves BPF_R6-BPF_R9, and stores return value
1998
* into BPF_R0.
1999
*/
2000
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
2001
BPF_R4, BPF_R5);
2002
CONT;
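/* Note: for helper calls, insn->imm is not an absolute address but a
* 32-bit delta from __bpf_call_base, which the verifier fixes up at
* load time; adding the two reconstructs the helper's entry point.
*/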
2003
2004
JMP_CALL_ARGS:
2005
BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
2006
BPF_R3, BPF_R4,
2007
BPF_R5,
2008
insn + insn->off + 1);
2009
CONT;
2010
2011
JMP_TAIL_CALL: {
2012
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
2013
struct bpf_array *array = container_of(map, struct bpf_array, map);
2014
struct bpf_prog *prog;
2015
u32 index = BPF_R3;
2016
2017
if (unlikely(index >= array->map.max_entries))
2018
goto out;
2019
2020
if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
2021
goto out;
2022
2023
tail_call_cnt++;
2024
2025
prog = READ_ONCE(array->ptrs[index]);
2026
if (!prog)
2027
goto out;
2028
2029
/* ARG1 at this point is guaranteed to point to CTX from
2030
* the verifier side due to the fact that the tail call is
2031
* handled like a helper, that is, bpf_tail_call_proto,
2032
* where arg1_type is ARG_PTR_TO_CTX.
2033
*/
2034
insn = prog->insnsi;
2035
goto select_insn;
2036
out:
2037
CONT;
2038
}
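/* Note: a successful tail call never returns here; it restarts
* execution at the target program's first instruction while reusing
* the current stack frame. tail_call_cnt bounds the length of such
* chains at MAX_TAIL_CALL_CNT.
*/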
2039
JMP_JA:
2040
insn += insn->off;
2041
CONT;
2042
JMP32_JA:
2043
insn += insn->imm;
2044
CONT;
2045
JMP_EXIT:
2046
return BPF_R0;
2047
/* JMP */
2048
#define COND_JMP(SIGN, OPCODE, CMP_OP) \
2049
JMP_##OPCODE##_X: \
2050
if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
2051
insn += insn->off; \
2052
CONT_JMP; \
2053
} \
2054
CONT; \
2055
JMP32_##OPCODE##_X: \
2056
if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
2057
insn += insn->off; \
2058
CONT_JMP; \
2059
} \
2060
CONT; \
2061
JMP_##OPCODE##_K: \
2062
if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
2063
insn += insn->off; \
2064
CONT_JMP; \
2065
} \
2066
CONT; \
2067
JMP32_##OPCODE##_K: \
2068
if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
2069
insn += insn->off; \
2070
CONT_JMP; \
2071
} \
2072
CONT;
2073
COND_JMP(u, JEQ, ==)
2074
COND_JMP(u, JNE, !=)
2075
COND_JMP(u, JGT, >)
2076
COND_JMP(u, JLT, <)
2077
COND_JMP(u, JGE, >=)
2078
COND_JMP(u, JLE, <=)
2079
COND_JMP(u, JSET, &)
2080
COND_JMP(s, JSGT, >)
2081
COND_JMP(s, JSLT, <)
2082
COND_JMP(s, JSGE, >=)
2083
COND_JMP(s, JSLE, <=)
2084
#undef COND_JMP
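/* For reference, each COND_JMP(SIGN, OPCODE, CMP_OP) instantiation
* above emits four handlers, e.g. COND_JMP(u, JEQ, ==) expands into
* JMP_JEQ_X, JMP32_JEQ_X, JMP_JEQ_K and JMP32_JEQ_K, comparing the full
* 64-bit or low 32-bit register values as unsigned (or as signed for
* the 's' instantiations).
*/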
2085
/* ST, STX and LDX */
2086
ST_NOSPEC:
2087
/* Speculation barrier for mitigating Speculative Store Bypass,
2088
* Bounds-Check Bypass and Type Confusion. In case of arm64, we
2089
* rely on the firmware mitigation as controlled via the ssbd
2090
* kernel parameter. Whenever the mitigation is enabled, it
2091
* works for all of the kernel code with no need to provide any
2092
* additional instructions here. In case of x86, we use 'lfence'
2093
* insn for mitigation. We reuse preexisting logic from Spectre
2094
* v1 mitigation that happens to produce the required code on
2095
* x86 for v4 as well.
2096
*/
2097
barrier_nospec();
2098
CONT;
2099
#define LDST(SIZEOP, SIZE) \
2100
STX_MEM_##SIZEOP: \
2101
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
2102
CONT; \
2103
ST_MEM_##SIZEOP: \
2104
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
2105
CONT; \
2106
LDX_MEM_##SIZEOP: \
2107
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
2108
CONT; \
2109
LDX_PROBE_MEM_##SIZEOP: \
2110
bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
2111
(const void *)(long) (SRC + insn->off)); \
2112
DST = *((SIZE *)&DST); \
2113
CONT;
2114
2115
LDST(B, u8)
2116
LDST(H, u16)
2117
LDST(W, u32)
2118
LDST(DW, u64)
2119
#undef LDST
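/* Note: the LDX_PROBE_MEM_* handlers above (and LDX_PROBE_MEMSX_*
* below) go through bpf_probe_read_kernel_common() instead of a plain
* dereference, so a faulting kernel address zeroes the destination
* rather than oopsing.
*/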
2120
2121
#define LDSX(SIZEOP, SIZE) \
2122
LDX_MEMSX_##SIZEOP: \
2123
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
2124
CONT; \
2125
LDX_PROBE_MEMSX_##SIZEOP: \
2126
bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
2127
(const void *)(long) (SRC + insn->off)); \
2128
DST = *((SIZE *)&DST); \
2129
CONT;
2130
2131
LDSX(B, s8)
2132
LDSX(H, s16)
2133
LDSX(W, s32)
2134
#undef LDSX
2135
2136
#define ATOMIC_ALU_OP(BOP, KOP) \
2137
case BOP: \
2138
if (BPF_SIZE(insn->code) == BPF_W) \
2139
atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
2140
(DST + insn->off)); \
2141
else if (BPF_SIZE(insn->code) == BPF_DW) \
2142
atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
2143
(DST + insn->off)); \
2144
else \
2145
goto default_label; \
2146
break; \
2147
case BOP | BPF_FETCH: \
2148
if (BPF_SIZE(insn->code) == BPF_W) \
2149
SRC = (u32) atomic_fetch_##KOP( \
2150
(u32) SRC, \
2151
(atomic_t *)(unsigned long) (DST + insn->off)); \
2152
else if (BPF_SIZE(insn->code) == BPF_DW) \
2153
SRC = (u64) atomic64_fetch_##KOP( \
2154
(u64) SRC, \
2155
(atomic64_t *)(unsigned long) (DST + insn->off)); \
2156
else \
2157
goto default_label; \
2158
break;
2159
2160
STX_ATOMIC_DW:
2161
STX_ATOMIC_W:
2162
STX_ATOMIC_H:
2163
STX_ATOMIC_B:
2164
switch (IMM) {
2165
/* Atomic read-modify-write instructions support only W and DW
2166
* size modifiers.
2167
*/
2168
ATOMIC_ALU_OP(BPF_ADD, add)
2169
ATOMIC_ALU_OP(BPF_AND, and)
2170
ATOMIC_ALU_OP(BPF_OR, or)
2171
ATOMIC_ALU_OP(BPF_XOR, xor)
2172
#undef ATOMIC_ALU_OP
2173
2174
case BPF_XCHG:
2175
if (BPF_SIZE(insn->code) == BPF_W)
2176
SRC = (u32) atomic_xchg(
2177
(atomic_t *)(unsigned long) (DST + insn->off),
2178
(u32) SRC);
2179
else if (BPF_SIZE(insn->code) == BPF_DW)
2180
SRC = (u64) atomic64_xchg(
2181
(atomic64_t *)(unsigned long) (DST + insn->off),
2182
(u64) SRC);
2183
else
2184
goto default_label;
2185
break;
2186
case BPF_CMPXCHG:
2187
if (BPF_SIZE(insn->code) == BPF_W)
2188
BPF_R0 = (u32) atomic_cmpxchg(
2189
(atomic_t *)(unsigned long) (DST + insn->off),
2190
(u32) BPF_R0, (u32) SRC);
2191
else if (BPF_SIZE(insn->code) == BPF_DW)
2192
BPF_R0 = (u64) atomic64_cmpxchg(
2193
(atomic64_t *)(unsigned long) (DST + insn->off),
2194
(u64) BPF_R0, (u64) SRC);
2195
else
2196
goto default_label;
2197
break;
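/* Note: mirroring the kernel's atomic API, BPF_XCHG returns the old
* memory value in the source register, while BPF_CMPXCHG compares
* against BPF_R0 and leaves the old value in BPF_R0, so a program can
* tell whether the exchange took place.
*/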
2198
/* Atomic load and store instructions support all size
2199
* modifiers.
2200
*/
2201
case BPF_LOAD_ACQ:
2202
switch (BPF_SIZE(insn->code)) {
2203
#define LOAD_ACQUIRE(SIZEOP, SIZE) \
2204
case BPF_##SIZEOP: \
2205
DST = (SIZE)smp_load_acquire( \
2206
(SIZE *)(unsigned long)(SRC + insn->off)); \
2207
break;
2208
LOAD_ACQUIRE(B, u8)
2209
LOAD_ACQUIRE(H, u16)
2210
LOAD_ACQUIRE(W, u32)
2211
#ifdef CONFIG_64BIT
2212
LOAD_ACQUIRE(DW, u64)
2213
#endif
2214
#undef LOAD_ACQUIRE
2215
default:
2216
goto default_label;
2217
}
2218
break;
2219
case BPF_STORE_REL:
2220
switch (BPF_SIZE(insn->code)) {
2221
#define STORE_RELEASE(SIZEOP, SIZE) \
2222
case BPF_##SIZEOP: \
2223
smp_store_release( \
2224
(SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \
2225
break;
2226
STORE_RELEASE(B, u8)
2227
STORE_RELEASE(H, u16)
2228
STORE_RELEASE(W, u32)
2229
#ifdef CONFIG_64BIT
2230
STORE_RELEASE(DW, u64)
2231
#endif
2232
#undef STORE_RELEASE
2233
default:
2234
goto default_label;
2235
}
2236
break;
2237
2238
default:
2239
goto default_label;
2240
}
2241
CONT;
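/* Note: BPF_LOAD_ACQ/BPF_STORE_REL map directly onto
* smp_load_acquire()/smp_store_release() and, unlike the
* read-modify-write ops, accept all size modifiers; the DW forms are
* only built on 64-bit hosts, per the CONFIG_64BIT guards above.
*/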
2242
2243
default_label:
2244
/* If we ever reach this, we have a bug somewhere. Die hard here
2245
* instead of just returning 0; we could be somewhere in a subprog,
2246
* so execution could continue otherwise which we do /not/ want.
2247
*
2248
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
2249
*/
2250
pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
2251
insn->code, insn->imm);
2252
BUG_ON(1);
2253
return 0;
2254
}
2255
2256
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
2257
#define DEFINE_BPF_PROG_RUN(stack_size) \
2258
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
2259
{ \
2260
u64 stack[stack_size / sizeof(u64)]; \
2261
u64 regs[MAX_BPF_EXT_REG] = {}; \
2262
\
2263
kmsan_unpoison_memory(stack, sizeof(stack)); \
2264
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
2265
ARG1 = (u64) (unsigned long) ctx; \
2266
return ___bpf_prog_run(regs, insn); \
2267
}
2268
2269
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
2270
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
2271
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
2272
const struct bpf_insn *insn) \
2273
{ \
2274
u64 stack[stack_size / sizeof(u64)]; \
2275
u64 regs[MAX_BPF_EXT_REG]; \
2276
\
2277
kmsan_unpoison_memory(stack, sizeof(stack)); \
2278
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
2279
BPF_R1 = r1; \
2280
BPF_R2 = r2; \
2281
BPF_R3 = r3; \
2282
BPF_R4 = r4; \
2283
BPF_R5 = r5; \
2284
return ___bpf_prog_run(regs, insn); \
2285
}
2286
2287
#define EVAL1(FN, X) FN(X)
2288
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
2289
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
2290
#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
2291
#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
2292
#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
2293
2294
EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
2295
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
2296
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
2297
2298
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
2299
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
2300
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
2301
2302
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
2303
2304
static unsigned int (*interpreters[])(const void *ctx,
2305
const struct bpf_insn *insn) = {
2306
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
2307
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
2308
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
2309
};
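/* The interpreter is specialized per stack size in 32-byte steps from
* 32 up to 512 bytes; bpf_prog_select_interpreter() below picks the
* matching entry via round_up(stack_depth, 32) / 32 - 1, so a program
* only gets as much interpreter stack as it actually needs.
*/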
2310
#undef PROG_NAME_LIST
2311
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
2312
static __maybe_unused
2313
u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
2314
const struct bpf_insn *insn) = {
2315
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
2316
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
2317
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
2318
};
2319
#undef PROG_NAME_LIST
2320
2321
#ifdef CONFIG_BPF_SYSCALL
2322
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
2323
{
2324
stack_depth = max_t(u32, stack_depth, 1);
2325
insn->off = (s16) insn->imm;
2326
insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
2327
__bpf_call_base_args;
2328
insn->code = BPF_JMP | BPF_CALL_ARGS;
2329
}
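/* Note: bpf_patch_call_args() converts a BPF-to-BPF pseudo call for
* interpreter-only execution: the original jump distance is stashed in
* insn->off and insn->imm becomes the offset of the matching
* interpreters_args[] entry relative to __bpf_call_base_args, which the
* JMP_CALL_ARGS handler above undoes at run time.
*/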
2330
#endif
2331
#endif
2332
2333
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
2334
const struct bpf_insn *insn)
2335
{
2336
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
2337
* is not working properly, so warn about it!
2338
*/
2339
WARN_ON_ONCE(1);
2340
return 0;
2341
}
2342
2343
static bool __bpf_prog_map_compatible(struct bpf_map *map,
2344
const struct bpf_prog *fp)
2345
{
2346
enum bpf_prog_type prog_type = resolve_prog_type(fp);
2347
struct bpf_prog_aux *aux = fp->aux;
2348
enum bpf_cgroup_storage_type i;
2349
bool ret = false;
2350
u64 cookie;
2351
2352
if (fp->kprobe_override)
2353
return ret;
2354
2355
spin_lock(&map->owner_lock);
2356
/* There's no owner yet where we could check for compatibility. */
2357
if (!map->owner) {
2358
map->owner = bpf_map_owner_alloc(map);
2359
if (!map->owner)
2360
goto err;
2361
map->owner->type = prog_type;
2362
map->owner->jited = fp->jited;
2363
map->owner->xdp_has_frags = aux->xdp_has_frags;
2364
map->owner->expected_attach_type = fp->expected_attach_type;
2365
map->owner->attach_func_proto = aux->attach_func_proto;
2366
for_each_cgroup_storage_type(i) {
2367
map->owner->storage_cookie[i] =
2368
aux->cgroup_storage[i] ?
2369
aux->cgroup_storage[i]->cookie : 0;
2370
}
2371
ret = true;
2372
} else {
2373
ret = map->owner->type == prog_type &&
2374
map->owner->jited == fp->jited &&
2375
map->owner->xdp_has_frags == aux->xdp_has_frags;
2376
if (ret &&
2377
map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
2378
map->owner->expected_attach_type != fp->expected_attach_type)
2379
ret = false;
2380
for_each_cgroup_storage_type(i) {
2381
if (!ret)
2382
break;
2383
cookie = aux->cgroup_storage[i] ?
2384
aux->cgroup_storage[i]->cookie : 0;
2385
ret = map->owner->storage_cookie[i] == cookie ||
2386
!cookie;
2387
}
2388
if (ret &&
2389
map->owner->attach_func_proto != aux->attach_func_proto) {
2390
switch (prog_type) {
2391
case BPF_PROG_TYPE_TRACING:
2392
case BPF_PROG_TYPE_LSM:
2393
case BPF_PROG_TYPE_EXT:
2394
case BPF_PROG_TYPE_STRUCT_OPS:
2395
ret = false;
2396
break;
2397
default:
2398
break;
2399
}
2400
}
2401
}
2402
err:
2403
spin_unlock(&map->owner_lock);
2404
return ret;
2405
}
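/* Note: the first program inserted into a prog-holding map becomes the
* map's "owner" and records its type, JITed-ness, expected attach type
* and friends; every later insertion (and every tail-call user of the
* map, via bpf_check_tail_call() below) must match those properties, so
* incompatible programs can never end up in the same map.
*/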
2406
2407
bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
2408
{
2409
/* XDP programs inserted into maps are not guaranteed to run on
2410
* a particular netdev (and can run outside driver context entirely
2411
* in the case of devmap and cpumap). Until device checks
2412
* are implemented, prohibit adding dev-bound programs to program maps.
2413
*/
2414
if (bpf_prog_is_dev_bound(fp->aux))
2415
return false;
2416
2417
return __bpf_prog_map_compatible(map, fp);
2418
}
2419
2420
static int bpf_check_tail_call(const struct bpf_prog *fp)
2421
{
2422
struct bpf_prog_aux *aux = fp->aux;
2423
int i, ret = 0;
2424
2425
mutex_lock(&aux->used_maps_mutex);
2426
for (i = 0; i < aux->used_map_cnt; i++) {
2427
struct bpf_map *map = aux->used_maps[i];
2428
2429
if (!map_type_contains_progs(map))
2430
continue;
2431
2432
if (!__bpf_prog_map_compatible(map, fp)) {
2433
ret = -EINVAL;
2434
goto out;
2435
}
2436
}
2437
2438
out:
2439
mutex_unlock(&aux->used_maps_mutex);
2440
return ret;
2441
}
2442
2443
static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
2444
{
2445
bool select_interpreter = false;
2446
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
2447
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
2448
u32 idx = (round_up(stack_depth, 32) / 32) - 1;
2449
2450
/* may_goto may cause stack size > 512, leading to idx out-of-bounds.
2451
* Such programs cannot run in the interpreter and have to be JITed anyway,
2452
* so falling back to __bpf_prog_ret0_warn for an out-of-range idx is fine.
2453
*/
2454
if (idx < ARRAY_SIZE(interpreters)) {
2455
fp->bpf_func = interpreters[idx];
2456
select_interpreter = true;
2457
} else {
2458
fp->bpf_func = __bpf_prog_ret0_warn;
2459
}
2460
#else
2461
fp->bpf_func = __bpf_prog_ret0_warn;
2462
#endif
2463
return select_interpreter;
2464
}
2465
2466
/**
2467
* bpf_prog_select_runtime - select exec runtime for BPF program
2468
* @fp: bpf_prog populated with BPF program
2469
* @err: pointer to error variable
2470
*
2471
* Try to JIT eBPF program, if JIT is not available, use interpreter.
2472
* The BPF program will be executed via bpf_prog_run() function.
2473
*
2474
* Return: the &fp argument along with &err set to 0 for success or
2475
* a negative errno code on failure
2476
*/
2477
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
2478
{
2479
/* In case of BPF to BPF calls, verifier did all the prep
2480
* work with regard to JITing, etc.
2481
*/
2482
bool jit_needed = false;
2483
2484
if (fp->bpf_func)
2485
goto finalize;
2486
2487
if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
2488
bpf_prog_has_kfunc_call(fp))
2489
jit_needed = true;
2490
2491
if (!bpf_prog_select_interpreter(fp))
2492
jit_needed = true;
2493
2494
/* eBPF JITs can rewrite the program in case constant
2495
* blinding is active. However, in case of error during
2496
* blinding, bpf_int_jit_compile() must always return a
2497
* valid program, which in this case would simply not
2498
* be JITed, but falls back to the interpreter.
2499
*/
2500
if (!bpf_prog_is_offloaded(fp->aux)) {
2501
*err = bpf_prog_alloc_jited_linfo(fp);
2502
if (*err)
2503
return fp;
2504
2505
fp = bpf_int_jit_compile(fp);
2506
bpf_prog_jit_attempt_done(fp);
2507
if (!fp->jited && jit_needed) {
2508
*err = -ENOTSUPP;
2509
return fp;
2510
}
2511
} else {
2512
*err = bpf_prog_offload_compile(fp);
2513
if (*err)
2514
return fp;
2515
}
2516
2517
finalize:
2518
*err = bpf_prog_lock_ro(fp);
2519
if (*err)
2520
return fp;
2521
2522
/* The tail call compatibility check can only be done at
2523
* this late stage as we need to determine, if we deal
2524
* with JITed or non JITed program concatenations and not
2525
* all eBPF JITs might immediately support all features.
2526
*/
2527
*err = bpf_check_tail_call(fp);
2528
2529
return fp;
2530
}
2531
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
2532
2533
static unsigned int __bpf_prog_ret1(const void *ctx,
2534
const struct bpf_insn *insn)
2535
{
2536
return 1;
2537
}
2538
2539
static struct bpf_prog_dummy {
2540
struct bpf_prog prog;
2541
} dummy_bpf_prog = {
2542
.prog = {
2543
.bpf_func = __bpf_prog_ret1,
2544
},
2545
};
2546
2547
struct bpf_empty_prog_array bpf_empty_prog_array = {
2548
.null_prog = NULL,
2549
};
2550
EXPORT_SYMBOL(bpf_empty_prog_array);
2551
2552
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
2553
{
2554
struct bpf_prog_array *p;
2555
2556
if (prog_cnt)
2557
p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
2558
else
2559
p = &bpf_empty_prog_array.hdr;
2560
2561
return p;
2562
}
2563
2564
void bpf_prog_array_free(struct bpf_prog_array *progs)
2565
{
2566
if (!progs || progs == &bpf_empty_prog_array.hdr)
2567
return;
2568
kfree_rcu(progs, rcu);
2569
}
2570
2571
static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
2572
{
2573
struct bpf_prog_array *progs;
2574
2575
/* If RCU Tasks Trace grace period implies RCU grace period, there is
2576
* no need to call kfree_rcu(), just call kfree() directly.
2577
*/
2578
progs = container_of(rcu, struct bpf_prog_array, rcu);
2579
if (rcu_trace_implies_rcu_gp())
2580
kfree(progs);
2581
else
2582
kfree_rcu(progs, rcu);
2583
}
2584
2585
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
2586
{
2587
if (!progs || progs == &bpf_empty_prog_array.hdr)
2588
return;
2589
call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
2590
}
2591
2592
int bpf_prog_array_length(struct bpf_prog_array *array)
2593
{
2594
struct bpf_prog_array_item *item;
2595
u32 cnt = 0;
2596
2597
for (item = array->items; item->prog; item++)
2598
if (item->prog != &dummy_bpf_prog.prog)
2599
cnt++;
2600
return cnt;
2601
}
2602
2603
bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
2604
{
2605
struct bpf_prog_array_item *item;
2606
2607
for (item = array->items; item->prog; item++)
2608
if (item->prog != &dummy_bpf_prog.prog)
2609
return false;
2610
return true;
2611
}
2612
2613
static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
2614
u32 *prog_ids,
2615
u32 request_cnt)
2616
{
2617
struct bpf_prog_array_item *item;
2618
int i = 0;
2619
2620
for (item = array->items; item->prog; item++) {
2621
if (item->prog == &dummy_bpf_prog.prog)
2622
continue;
2623
prog_ids[i] = item->prog->aux->id;
2624
if (++i == request_cnt) {
2625
item++;
2626
break;
2627
}
2628
}
2629
2630
return !!(item->prog);
2631
}
2632
2633
int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
2634
__u32 __user *prog_ids, u32 cnt)
2635
{
2636
unsigned long err = 0;
2637
bool nospc;
2638
u32 *ids;
2639
2640
/* users of this function are doing:
2641
* cnt = bpf_prog_array_length();
2642
* if (cnt > 0)
2643
* bpf_prog_array_copy_to_user(..., cnt);
2644
* so below kcalloc doesn't need extra cnt > 0 check.
2645
*/
2646
ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
2647
if (!ids)
2648
return -ENOMEM;
2649
nospc = bpf_prog_array_copy_core(array, ids, cnt);
2650
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
2651
kfree(ids);
2652
if (err)
2653
return -EFAULT;
2654
if (nospc)
2655
return -ENOSPC;
2656
return 0;
2657
}
2658
2659
void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
2660
struct bpf_prog *old_prog)
2661
{
2662
struct bpf_prog_array_item *item;
2663
2664
for (item = array->items; item->prog; item++)
2665
if (item->prog == old_prog) {
2666
WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
2667
break;
2668
}
2669
}
2670
2671
/**
2672
* bpf_prog_array_delete_safe_at() - Replaces the program at the given
2673
* index into the program array with
2674
* a dummy no-op program.
2675
* @array: a bpf_prog_array
2676
* @index: the index of the program to replace
2677
*
2678
* Skips over dummy programs, by not counting them, when calculating
2679
* the position of the program to replace.
2680
*
2681
* Return:
2682
* * 0 - Success
2683
* * -EINVAL - Invalid index value. Must be a non-negative integer.
2684
* * -ENOENT - Index out of range
2685
*/
2686
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
2687
{
2688
return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
2689
}
2690
2691
/**
2692
* bpf_prog_array_update_at() - Updates the program at the given index
2693
* into the program array.
2694
* @array: a bpf_prog_array
2695
* @index: the index of the program to update
2696
* @prog: the program to insert into the array
2697
*
2698
* Skips over dummy programs, by not counting them, when calculating
2699
* the position of the program to update.
2700
*
2701
* Return:
2702
* * 0 - Success
2703
* * -EINVAL - Invalid index value. Must be a non-negative integer.
2704
* * -ENOENT - Index out of range
2705
*/
2706
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
2707
struct bpf_prog *prog)
2708
{
2709
struct bpf_prog_array_item *item;
2710
2711
if (unlikely(index < 0))
2712
return -EINVAL;
2713
2714
for (item = array->items; item->prog; item++) {
2715
if (item->prog == &dummy_bpf_prog.prog)
2716
continue;
2717
if (!index) {
2718
WRITE_ONCE(item->prog, prog);
2719
return 0;
2720
}
2721
index--;
2722
}
2723
return -ENOENT;
2724
}
2725
2726
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
2727
struct bpf_prog *exclude_prog,
2728
struct bpf_prog *include_prog,
2729
u64 bpf_cookie,
2730
struct bpf_prog_array **new_array)
2731
{
2732
int new_prog_cnt, carry_prog_cnt = 0;
2733
struct bpf_prog_array_item *existing, *new;
2734
struct bpf_prog_array *array;
2735
bool found_exclude = false;
2736
2737
/* Figure out how many existing progs we need to carry over to
2738
* the new array.
2739
*/
2740
if (old_array) {
2741
existing = old_array->items;
2742
for (; existing->prog; existing++) {
2743
if (existing->prog == exclude_prog) {
2744
found_exclude = true;
2745
continue;
2746
}
2747
if (existing->prog != &dummy_bpf_prog.prog)
2748
carry_prog_cnt++;
2749
if (existing->prog == include_prog)
2750
return -EEXIST;
2751
}
2752
}
2753
2754
if (exclude_prog && !found_exclude)
2755
return -ENOENT;
2756
2757
/* How many progs (not NULL) will be in the new array? */
2758
new_prog_cnt = carry_prog_cnt;
2759
if (include_prog)
2760
new_prog_cnt += 1;
2761
2762
/* Do we have any prog (not NULL) in the new array? */
2763
if (!new_prog_cnt) {
2764
*new_array = NULL;
2765
return 0;
2766
}
2767
2768
/* +1 as the end of prog_array is marked with NULL */
2769
array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
2770
if (!array)
2771
return -ENOMEM;
2772
new = array->items;
2773
2774
/* Fill in the new prog array */
2775
if (carry_prog_cnt) {
2776
existing = old_array->items;
2777
for (; existing->prog; existing++) {
2778
if (existing->prog == exclude_prog ||
2779
existing->prog == &dummy_bpf_prog.prog)
2780
continue;
2781
2782
new->prog = existing->prog;
2783
new->bpf_cookie = existing->bpf_cookie;
2784
new++;
2785
}
2786
}
2787
if (include_prog) {
2788
new->prog = include_prog;
2789
new->bpf_cookie = bpf_cookie;
2790
new++;
2791
}
2792
new->prog = NULL;
2793
*new_array = array;
2794
return 0;
2795
}
2796
2797
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
2798
u32 *prog_ids, u32 request_cnt,
2799
u32 *prog_cnt)
2800
{
2801
u32 cnt = 0;
2802
2803
if (array)
2804
cnt = bpf_prog_array_length(array);
2805
2806
*prog_cnt = cnt;
2807
2808
/* return early if user requested only program count or nothing to copy */
2809
if (!request_cnt || !cnt)
2810
return 0;
2811
2812
/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
2813
return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
2814
: 0;
2815
}
2816
2817
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
2818
struct bpf_map **used_maps, u32 len)
2819
{
2820
struct bpf_map *map;
2821
bool sleepable;
2822
u32 i;
2823
2824
sleepable = aux->prog->sleepable;
2825
for (i = 0; i < len; i++) {
2826
map = used_maps[i];
2827
if (map->ops->map_poke_untrack)
2828
map->ops->map_poke_untrack(map, aux);
2829
if (sleepable)
2830
atomic64_dec(&map->sleepable_refcnt);
2831
bpf_map_put(map);
2832
}
2833
}
2834
2835
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
2836
{
2837
__bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
2838
kfree(aux->used_maps);
2839
}
2840
2841
void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
2842
{
2843
#ifdef CONFIG_BPF_SYSCALL
2844
struct btf_mod_pair *btf_mod;
2845
u32 i;
2846
2847
for (i = 0; i < len; i++) {
2848
btf_mod = &used_btfs[i];
2849
if (btf_mod->module)
2850
module_put(btf_mod->module);
2851
btf_put(btf_mod->btf);
2852
}
2853
#endif
2854
}
2855
2856
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
2857
{
2858
__bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
2859
kfree(aux->used_btfs);
2860
}
2861
2862
static void bpf_prog_free_deferred(struct work_struct *work)
2863
{
2864
struct bpf_prog_aux *aux;
2865
int i;
2866
2867
aux = container_of(work, struct bpf_prog_aux, work);
2868
#ifdef CONFIG_BPF_SYSCALL
2869
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
2870
bpf_prog_stream_free(aux->prog);
2871
#endif
2872
#ifdef CONFIG_CGROUP_BPF
2873
if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
2874
bpf_cgroup_atype_put(aux->cgroup_atype);
2875
#endif
2876
bpf_free_used_maps(aux);
2877
bpf_free_used_btfs(aux);
2878
if (bpf_prog_is_dev_bound(aux))
2879
bpf_prog_dev_bound_destroy(aux->prog);
2880
#ifdef CONFIG_PERF_EVENTS
2881
if (aux->prog->has_callchain_buf)
2882
put_callchain_buffers();
2883
#endif
2884
if (aux->dst_trampoline)
2885
bpf_trampoline_put(aux->dst_trampoline);
2886
for (i = 0; i < aux->real_func_cnt; i++) {
2887
/* We can just unlink the subprog poke descriptor table as
2888
* it was originally linked to the main program and is also
2889
* released along with it.
2890
*/
2891
aux->func[i]->aux->poke_tab = NULL;
2892
bpf_jit_free(aux->func[i]);
2893
}
2894
if (aux->real_func_cnt) {
2895
kfree(aux->func);
2896
bpf_prog_unlock_free(aux->prog);
2897
} else {
2898
bpf_jit_free(aux->prog);
2899
}
2900
}
2901
2902
void bpf_prog_free(struct bpf_prog *fp)
2903
{
2904
struct bpf_prog_aux *aux = fp->aux;
2905
2906
if (aux->dst_prog)
2907
bpf_prog_put(aux->dst_prog);
2908
bpf_token_put(aux->token);
2909
INIT_WORK(&aux->work, bpf_prog_free_deferred);
2910
schedule_work(&aux->work);
2911
}
2912
EXPORT_SYMBOL_GPL(bpf_prog_free);
2913
2914
/* RNG for unprivileged user space with separated state from prandom_u32(). */
2915
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
2916
2917
void bpf_user_rnd_init_once(void)
2918
{
2919
prandom_init_once(&bpf_user_rnd_state);
2920
}
2921
2922
BPF_CALL_0(bpf_user_rnd_u32)
2923
{
2924
/* Should someone ever have the rather unwise idea to use some
2925
* of the registers passed into this function, then note that
2926
* this function is called from native eBPF and classic-to-eBPF
2927
* transformations. Register assignments from both sides are
2928
* different, e.g. classic always sets fn(ctx, A, X) here.
2929
*/
2930
struct rnd_state *state;
2931
u32 res;
2932
2933
state = &get_cpu_var(bpf_user_rnd_state);
2934
res = prandom_u32_state(state);
2935
put_cpu_var(bpf_user_rnd_state);
2936
2937
return res;
2938
}
2939
2940
BPF_CALL_0(bpf_get_raw_cpu_id)
2941
{
2942
return raw_smp_processor_id();
2943
}
2944
2945
/* Weak definitions of helper functions in case we don't have bpf syscall. */
2946
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
2947
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
2948
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
2949
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
2950
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
2951
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
2952
const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
2953
const struct bpf_func_proto bpf_spin_lock_proto __weak;
2954
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
2955
const struct bpf_func_proto bpf_jiffies64_proto __weak;
2956
2957
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
2958
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
2959
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
2960
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
2961
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
2962
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
2963
const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
2964
2965
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
2966
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
2967
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
2968
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
2969
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
2970
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
2971
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
2972
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
2973
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
2974
const struct bpf_func_proto bpf_set_retval_proto __weak;
2975
const struct bpf_func_proto bpf_get_retval_proto __weak;
2976
2977
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
2978
{
2979
return NULL;
2980
}
2981
2982
const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
2983
{
2984
return NULL;
2985
}
2986
2987
const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
2988
{
2989
return NULL;
2990
}
2991
2992
u64 __weak
2993
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
2994
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
2995
{
2996
return -ENOTSUPP;
2997
}
2998
EXPORT_SYMBOL_GPL(bpf_event_output);
2999
3000
/* Always built-in helper functions. */
3001
const struct bpf_func_proto bpf_tail_call_proto = {
3002
/* func is unused for tail_call, we set it to pass the
3003
* get_helper_proto check
3004
*/
3005
.func = BPF_PTR_POISON,
3006
.gpl_only = false,
3007
.ret_type = RET_VOID,
3008
.arg1_type = ARG_PTR_TO_CTX,
3009
.arg2_type = ARG_CONST_MAP_PTR,
3010
.arg3_type = ARG_ANYTHING,
3011
};
3012
3013
/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
3014
* It is encouraged to implement bpf_int_jit_compile() instead, so that
3015
* eBPF and implicitly also cBPF can get JITed!
3016
*/
3017
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
3018
{
3019
return prog;
3020
}
3021
3022
/* Stub for JITs that support eBPF. All cBPF code gets transformed into
3023
* eBPF by the kernel and is later compiled by bpf_int_jit_compile().
3024
*/
3025
void __weak bpf_jit_compile(struct bpf_prog *prog)
3026
{
3027
}
3028
3029
bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
3030
{
3031
return false;
3032
}
3033
3034
/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
3035
* analysis code and wants explicit zero extension inserted by verifier.
3036
* Otherwise, return FALSE.
3037
*
3038
* The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
3039
* you don't override this. JITs that don't want these extra insns can detect
3040
* them using insn_is_zext.
3041
*/
3042
bool __weak bpf_jit_needs_zext(void)
3043
{
3044
return false;
3045
}
3046
3047
/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
3048
* all archs. The value returned must not change at runtime as there is
3049
* currently no support for reloading programs that were loaded without
3050
* mitigations.
3051
*/
3052
bool __weak bpf_jit_bypass_spec_v1(void)
3053
{
3054
return false;
3055
}
3056
3057
bool __weak bpf_jit_bypass_spec_v4(void)
3058
{
3059
return false;
3060
}
3061
3062
/* Return true if the JIT inlines the call to the helper corresponding to
3063
* the imm.
3064
*
3065
* The verifier will not patch the insn->imm for the call to the helper if
3066
* this returns true.
3067
*/
3068
bool __weak bpf_jit_inlines_helper_call(s32 imm)
3069
{
3070
return false;
3071
}
3072
3073
/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
3074
bool __weak bpf_jit_supports_subprog_tailcalls(void)
3075
{
3076
return false;
3077
}
3078
3079
bool __weak bpf_jit_supports_percpu_insn(void)
3080
{
3081
return false;
3082
}
3083
3084
bool __weak bpf_jit_supports_kfunc_call(void)
3085
{
3086
return false;
3087
}
3088
3089
bool __weak bpf_jit_supports_far_kfunc_call(void)
3090
{
3091
return false;
3092
}
3093
3094
bool __weak bpf_jit_supports_arena(void)
3095
{
3096
return false;
3097
}
3098
3099
bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
3100
{
3101
return false;
3102
}
3103
3104
u64 __weak bpf_arch_uaddress_limit(void)
3105
{
3106
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
3107
return TASK_SIZE;
3108
#else
3109
return 0;
3110
#endif
3111
}
3112
3113
/* Return TRUE if the JIT backend satisfies the following two conditions:
3114
* 1) JIT backend supports atomic_xchg() on pointer-sized words.
3115
* 2) Under the specific arch, the implementation of xchg() is the same
3116
* as atomic_xchg() on pointer-sized words.
3117
*/
3118
bool __weak bpf_jit_supports_ptr_xchg(void)
3119
{
3120
return false;
3121
}
3122
3123
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
3124
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
3125
*/
3126
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
3127
int len)
3128
{
3129
return -EFAULT;
3130
}
3131
3132
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
3133
void *addr1, void *addr2)
3134
{
3135
return -ENOTSUPP;
3136
}
3137
3138
void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
3139
{
3140
return ERR_PTR(-ENOTSUPP);
3141
}
3142
3143
int __weak bpf_arch_text_invalidate(void *dst, size_t len)
3144
{
3145
return -ENOTSUPP;
3146
}
3147
3148
bool __weak bpf_jit_supports_exceptions(void)
3149
{
3150
return false;
3151
}
3152
3153
bool __weak bpf_jit_supports_private_stack(void)
3154
{
3155
return false;
3156
}
3157
3158
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
3159
{
3160
}
3161
3162
bool __weak bpf_jit_supports_timed_may_goto(void)
3163
{
3164
return false;
3165
}
3166
3167
u64 __weak arch_bpf_timed_may_goto(void)
3168
{
3169
return 0;
3170
}
3171
3172
static noinline void bpf_prog_report_may_goto_violation(void)
3173
{
3174
#ifdef CONFIG_BPF_SYSCALL
3175
struct bpf_stream_stage ss;
3176
struct bpf_prog *prog;
3177
3178
prog = bpf_prog_find_from_stack();
3179
if (!prog)
3180
return;
3181
bpf_stream_stage(ss, prog, BPF_STDERR, ({
3182
bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n");
3183
bpf_stream_dump_stack(ss);
3184
}));
3185
#endif
3186
}
3187
3188
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
3189
{
3190
u64 time = ktime_get_mono_fast_ns();
3191
3192
/* Populate the timestamp for this stack frame, and refresh count. */
3193
if (!p->timestamp) {
3194
p->timestamp = time;
3195
return BPF_MAX_TIMED_LOOPS;
3196
}
3197
/* Check if we've exhausted our time slice, and zero count. */
3198
if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) {
3199
bpf_prog_report_may_goto_violation();
3200
return 0;
3201
}
3202
/* Refresh the count for the stack frame. */
3203
return BPF_MAX_TIMED_LOOPS;
3204
}
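/* Note: each may_goto-instrumented stack frame gets a fresh budget of
* BPF_MAX_TIMED_LOOPS iterations, and once roughly a quarter of a
* second (NSEC_PER_SEC / 4) has passed since the frame's first check,
* the count is forced to zero, which terminates the loop and reports
* the timeout on the program's BPF_STDERR stream.
*/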
3205
3206
/* Weak stubs used when the arena map is not built in (configs without MMU or on 32-bit). */
3207
__weak const struct bpf_map_ops arena_map_ops;
3208
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
3209
{
3210
return 0;
3211
}
3212
__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
3213
{
3214
return 0;
3215
}
3216
3217
#ifdef CONFIG_BPF_SYSCALL
3218
static int __init bpf_global_ma_init(void)
3219
{
3220
int ret;
3221
3222
ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
3223
bpf_global_ma_set = !ret;
3224
return ret;
3225
}
3226
late_initcall(bpf_global_ma_init);
3227
#endif
3228
3229
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
3230
EXPORT_SYMBOL(bpf_stats_enabled_key);
3231
3232
/* All definitions of tracepoints related to BPF. */
3233
#define CREATE_TRACE_POINTS
3234
#include <linux/bpf_trace.h>
3235
3236
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
3237
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
3238
3239
#ifdef CONFIG_BPF_SYSCALL
3240
3241
int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
3242
const char **linep, int *nump)
3243
{
3244
int idx = -1, insn_start, insn_end, len;
3245
struct bpf_line_info *linfo;
3246
void **jited_linfo;
3247
struct btf *btf;
3248
int nr_linfo;
3249
3250
btf = prog->aux->btf;
3251
linfo = prog->aux->linfo;
3252
jited_linfo = prog->aux->jited_linfo;
3253
3254
if (!btf || !linfo || !jited_linfo)
3255
return -EINVAL;
3256
len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;
3257
3258
linfo = &prog->aux->linfo[prog->aux->linfo_idx];
3259
jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
3260
3261
insn_start = linfo[0].insn_off;
3262
insn_end = insn_start + len;
3263
nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx;
3264
3265
for (int i = 0; i < nr_linfo &&
3266
linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
3267
if (jited_linfo[i] >= (void *)ip)
3268
break;
3269
idx = i;
3270
}
3271
3272
if (idx == -1)
3273
return -ENOENT;
3274
3275
/* Get base component of the file path. */
3276
*filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
3277
*filep = kbasename(*filep);
3278
/* Obtain the source line and strip its leading whitespace. */
3279
*linep = btf_name_by_offset(btf, linfo[idx].line_off);
3280
while (isspace(**linep))
3281
*linep += 1;
3282
*nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
3283
return 0;
3284
}
3285
3286
struct walk_stack_ctx {
3287
struct bpf_prog *prog;
3288
};
3289
3290
static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
3291
{
3292
struct walk_stack_ctx *ctxp = cookie;
3293
struct bpf_prog *prog;
3294
3295
/*
3296
* The RCU read lock is held to safely traverse the latch tree, but we
3297
* don't need its protection when accessing the prog, since it has an
3298
* active stack frame on the current stack trace, and won't disappear.
3299
*/
3300
rcu_read_lock();
3301
prog = bpf_prog_ksym_find(ip);
3302
rcu_read_unlock();
3303
if (!prog)
3304
return true;
3305
/* Make sure we return the main prog if we found a subprog */
3306
ctxp->prog = prog->aux->main_prog_aux->prog;
3307
return false;
3308
}
3309
3310
struct bpf_prog *bpf_prog_find_from_stack(void)
3311
{
3312
struct walk_stack_ctx ctx = {};
3313
3314
arch_bpf_stack_walk(find_from_stack_cb, &ctx);
3315
return ctx.prog;
3316
}
3317
3318
#endif
3319
3320