GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/core.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 * Jay Schulist <[email protected]>
 * Alexei Starovoitov <[email protected]>
 * Daniel Borkmann <[email protected]>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <uapi/linux/btf.h>
#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>
#include <crypto/sha2.h>

#include <asm/barrier.h>
#include <linux/unaligned.h>

/* Registers */
#define BPF_R0 regs[BPF_REG_0]
#define BPF_R1 regs[BPF_REG_1]
#define BPF_R2 regs[BPF_REG_2]
#define BPF_R3 regs[BPF_REG_3]
#define BPF_R4 regs[BPF_REG_4]
#define BPF_R5 regs[BPF_REG_5]
#define BPF_R6 regs[BPF_REG_6]
#define BPF_R7 regs[BPF_REG_7]
#define BPF_R8 regs[BPF_REG_8]
#define BPF_R9 regs[BPF_REG_9]
#define BPF_R10 regs[BPF_REG_10]

/* Named registers */
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
#define FP regs[BPF_REG_FP]
#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
#define OFF insn->off
#define IMM insn->imm

struct bpf_mem_alloc bpf_global_ma;
bool bpf_global_ma_set;

/* No hurry in this branch
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
	u8 *ptr = NULL;

	if (k >= SKF_NET_OFF) {
		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
	} else if (k >= SKF_LL_OFF) {
		if (unlikely(!skb_mac_header_was_set(skb)))
			return NULL;
		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
	}
	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
		return ptr;

	return NULL;
}

/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */
enum page_size_enum {
	__PAGE_SIZE = PAGE_SIZE
};

struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
	struct bpf_prog_aux *aux;
	struct bpf_prog *fp;

	size = round_up(size, __PAGE_SIZE);
	fp = __vmalloc(size, gfp_flags);
	if (fp == NULL)
		return NULL;

	aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
	if (aux == NULL) {
		vfree(fp);
		return NULL;
	}
	fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
	if (!fp->active) {
		vfree(fp);
		kfree(aux);
		return NULL;
	}

	fp->pages = size / PAGE_SIZE;
	fp->aux = aux;
	fp->aux->main_prog_aux = aux;
	fp->aux->prog = fp;
	fp->jit_requested = ebpf_jit_enabled();
	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
#ifdef CONFIG_CGROUP_BPF
	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif

	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
#ifdef CONFIG_FINEIBT
	INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
	mutex_init(&fp->aux->used_maps_mutex);
	mutex_init(&fp->aux->ext_mutex);
	mutex_init(&fp->aux->dst_mutex);

#ifdef CONFIG_BPF_SYSCALL
	bpf_prog_stream_init(fp);
#endif

	return fp;
}

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
	struct bpf_prog *prog;
	int cpu;

	prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
	if (!prog)
		return NULL;

	prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
	if (!prog->stats) {
		free_percpu(prog->active);
		kfree(prog->aux);
		vfree(prog);
		return NULL;
	}

	for_each_possible_cpu(cpu) {
		struct bpf_prog_stats *pstats;

		pstats = per_cpu_ptr(prog->stats, cpu);
		u64_stats_init(&pstats->syncp);
	}
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
174
175
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
176
{
177
if (!prog->aux->nr_linfo || !prog->jit_requested)
178
return 0;
179
180
prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
181
sizeof(*prog->aux->jited_linfo),
182
bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
183
if (!prog->aux->jited_linfo)
184
return -ENOMEM;
185
186
return 0;
187
}
188
189
void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
190
{
191
if (prog->aux->jited_linfo &&
192
(!prog->jited || !prog->aux->jited_linfo[0])) {
193
kvfree(prog->aux->jited_linfo);
194
prog->aux->jited_linfo = NULL;
195
}
196
197
kfree(prog->aux->kfunc_tab);
198
prog->aux->kfunc_tab = NULL;
199
}
200
201
/* The jit engine is responsible to provide an array
202
* for insn_off to the jited_off mapping (insn_to_jit_off).
203
*
204
* The idx to this array is the insn_off. Hence, the insn_off
205
* here is relative to the prog itself instead of the main prog.
206
* This array has one entry for each xlated bpf insn.
207
*
208
* jited_off is the byte off to the end of the jited insn.
209
*
210
* Hence, with
211
* insn_start:
212
* The first bpf insn off of the prog. The insn off
213
* here is relative to the main prog.
214
* e.g. if prog is a subprog, insn_start > 0
215
* linfo_idx:
216
* The prog's idx to prog->aux->linfo and jited_linfo
217
*
218
* jited_linfo[linfo_idx] = prog->bpf_func
219
*
220
* For i > linfo_idx,
221
*
222
* jited_linfo[i] = prog->bpf_func +
223
* insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
224
*/
225
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
226
const u32 *insn_to_jit_off)
227
{
228
u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
229
const struct bpf_line_info *linfo;
230
void **jited_linfo;
231
232
if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
233
/* Userspace did not provide linfo */
234
return;
235
236
linfo_idx = prog->aux->linfo_idx;
237
linfo = &prog->aux->linfo[linfo_idx];
238
insn_start = linfo[0].insn_off;
239
insn_end = insn_start + prog->len;
240
241
jited_linfo = &prog->aux->jited_linfo[linfo_idx];
242
jited_linfo[0] = prog->bpf_func;
243
244
nr_linfo = prog->aux->nr_linfo - linfo_idx;
245
246
for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
247
/* The verifier ensures that linfo[i].insn_off is
248
* strictly increasing
249
*/
250
jited_linfo[i] = prog->bpf_func +
251
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
252
}
253
254
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
255
gfp_t gfp_extra_flags)
256
{
257
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
258
struct bpf_prog *fp;
259
u32 pages;
260
261
size = round_up(size, PAGE_SIZE);
262
pages = size / PAGE_SIZE;
263
if (pages <= fp_old->pages)
264
return fp_old;
265
266
fp = __vmalloc(size, gfp_flags);
267
if (fp) {
268
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
269
fp->pages = pages;
270
fp->aux->prog = fp;
271
272
/* We keep fp->aux from fp_old around in the new
273
* reallocated structure.
274
*/
275
fp_old->aux = NULL;
276
fp_old->stats = NULL;
277
fp_old->active = NULL;
278
__bpf_prog_free(fp_old);
279
}
280
281
return fp;
282
}
283
284
void __bpf_prog_free(struct bpf_prog *fp)
285
{
286
if (fp->aux) {
287
mutex_destroy(&fp->aux->used_maps_mutex);
288
mutex_destroy(&fp->aux->dst_mutex);
289
kfree(fp->aux->poke_tab);
290
kfree(fp->aux);
291
}
292
free_percpu(fp->stats);
293
free_percpu(fp->active);
294
vfree(fp);
295
}
296
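/* Compute the program tag: a SHA-256 digest over the instruction image with
 * map fd/value immediates zeroed out, since those are not stable across
 * program loads from user space.
 */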
297
int bpf_prog_calc_tag(struct bpf_prog *fp)
298
{
299
size_t size = bpf_prog_insn_size(fp);
300
struct bpf_insn *dst;
301
bool was_ld_map;
302
u32 i;
303
304
dst = vmalloc(size);
305
if (!dst)
306
return -ENOMEM;
307
308
/* We need to take out the map fd for the digest calculation
309
* since they are unstable from user space side.
310
*/
311
for (i = 0, was_ld_map = false; i < fp->len; i++) {
312
dst[i] = fp->insnsi[i];
313
if (!was_ld_map &&
314
dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
315
(dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
316
dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
317
was_ld_map = true;
318
dst[i].imm = 0;
319
} else if (was_ld_map &&
320
dst[i].code == 0 &&
321
dst[i].dst_reg == 0 &&
322
dst[i].src_reg == 0 &&
323
dst[i].off == 0) {
324
was_ld_map = false;
325
dst[i].imm = 0;
326
} else {
327
was_ld_map = false;
328
}
329
}
330
sha256((u8 *)dst, size, fp->digest);
331
vfree(dst);
332
return 0;
333
}
334
335
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
336
s32 end_new, s32 curr, const bool probe_pass)
337
{
338
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
339
s32 delta = end_new - end_old;
340
s64 imm = insn->imm;
341
342
if (curr < pos && curr + imm + 1 >= end_old)
343
imm += delta;
344
else if (curr >= end_new && curr + imm + 1 < end_new)
345
imm -= delta;
346
if (imm < imm_min || imm > imm_max)
347
return -ERANGE;
348
if (!probe_pass)
349
insn->imm = imm;
350
return 0;
351
}
352
353
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
354
s32 end_new, s32 curr, const bool probe_pass)
355
{
356
s64 off_min, off_max, off;
357
s32 delta = end_new - end_old;
358
359
if (insn->code == (BPF_JMP32 | BPF_JA)) {
360
off = insn->imm;
361
off_min = S32_MIN;
362
off_max = S32_MAX;
363
} else {
364
off = insn->off;
365
off_min = S16_MIN;
366
off_max = S16_MAX;
367
}
368
369
if (curr < pos && curr + off + 1 >= end_old)
370
off += delta;
371
else if (curr >= end_new && curr + off + 1 < end_new)
372
off -= delta;
373
if (off < off_min || off > off_max)
374
return -ERANGE;
375
if (!probe_pass) {
376
if (insn->code == (BPF_JMP32 | BPF_JA))
377
insn->imm = off;
378
else
379
insn->off = off;
380
}
381
return 0;
382
}
383
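/* Fix up branch targets after the patched region [pos, end_old) has been
 * replaced by [pos, end_new): pseudo-call/ld_imm64 immediates and jump
 * offsets that cross the patched region are shifted by the size delta.
 * With probe_pass set, only check for target overflow without writing.
 */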
384
static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
385
s32 end_new, const bool probe_pass)
386
{
387
u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
388
struct bpf_insn *insn = prog->insnsi;
389
int ret = 0;
390
391
for (i = 0; i < insn_cnt; i++, insn++) {
392
u8 code;
393
394
/* In the probing pass we still operate on the original,
395
* unpatched image in order to check overflows before we
396
* do any other adjustments. Therefore skip the patchlet.
397
*/
398
if (probe_pass && i == pos) {
399
i = end_new;
400
insn = prog->insnsi + end_old;
401
}
402
if (bpf_pseudo_func(insn)) {
403
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
404
end_new, i, probe_pass);
405
if (ret)
406
return ret;
407
continue;
408
}
409
code = insn->code;
410
if ((BPF_CLASS(code) != BPF_JMP &&
411
BPF_CLASS(code) != BPF_JMP32) ||
412
BPF_OP(code) == BPF_EXIT)
413
continue;
414
/* Adjust offset of jmps if we cross patch boundaries. */
415
if (BPF_OP(code) == BPF_CALL) {
416
if (insn->src_reg != BPF_PSEUDO_CALL)
417
continue;
418
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
419
end_new, i, probe_pass);
420
} else {
421
ret = bpf_adj_delta_to_off(insn, pos, end_old,
422
end_new, i, probe_pass);
423
}
424
if (ret)
425
break;
426
}
427
428
return ret;
429
}
430
431
static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
432
{
433
struct bpf_line_info *linfo;
434
u32 i, nr_linfo;
435
436
nr_linfo = prog->aux->nr_linfo;
437
if (!nr_linfo || !delta)
438
return;
439
440
linfo = prog->aux->linfo;
441
442
for (i = 0; i < nr_linfo; i++)
443
if (off < linfo[i].insn_off)
444
break;
445
446
/* Push all off < linfo[i].insn_off by delta */
447
for (; i < nr_linfo; i++)
448
linfo[i].insn_off += delta;
449
}
450
451
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
452
const struct bpf_insn *patch, u32 len)
453
{
454
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
455
const u32 cnt_max = S16_MAX;
456
struct bpf_prog *prog_adj;
457
int err;
458
459
/* Since our patchlet doesn't expand the image, we're done. */
460
if (insn_delta == 0) {
461
memcpy(prog->insnsi + off, patch, sizeof(*patch));
462
return prog;
463
}
464
465
insn_adj_cnt = prog->len + insn_delta;
466
467
/* Reject anything that would potentially let the insn->off
468
* target overflow when we have excessive program expansions.
469
* We need to probe here before we do any reallocation where
470
* we afterwards may not fail anymore.
471
*/
472
if (insn_adj_cnt > cnt_max &&
473
(err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
474
return ERR_PTR(err);
475
476
/* Several new instructions need to be inserted. Make room
477
* for them. Likely, there's no need for a new allocation as
478
* last page could have large enough tailroom.
479
*/
480
prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
481
GFP_USER);
482
if (!prog_adj)
483
return ERR_PTR(-ENOMEM);
484
485
prog_adj->len = insn_adj_cnt;
486
487
/* Patching happens in 3 steps:
488
*
489
* 1) Move over tail of insnsi from next instruction onwards,
490
* so we can patch the single target insn with one or more
491
* new ones (patching is always from 1 to n insns, n > 0).
492
* 2) Inject new instructions at the target location.
493
* 3) Adjust branch offsets if necessary.
494
*/
495
insn_rest = insn_adj_cnt - off - len;
496
497
memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
498
sizeof(*patch) * insn_rest);
499
memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
500
501
/* We are guaranteed to not fail at this point, otherwise
502
* the ship has sailed to reverse to the original state. An
503
* overflow cannot happen at this point.
504
*/
505
BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
506
507
bpf_adj_linfo(prog_adj, off, insn_delta);
508
509
return prog_adj;
510
}
511
512
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
513
{
514
int err;
515
516
/* Branch offsets can't overflow when program is shrinking, no need
517
* to call bpf_adj_branches(..., true) here
518
*/
519
memmove(prog->insnsi + off, prog->insnsi + off + cnt,
520
sizeof(struct bpf_insn) * (prog->len - off - cnt));
521
prog->len -= cnt;
522
523
err = bpf_adj_branches(prog, off, off + cnt, off, false);
524
WARN_ON_ONCE(err);
525
return err;
526
}
527
528
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
529
{
530
int i;
531
532
for (i = 0; i < fp->aux->real_func_cnt; i++)
533
bpf_prog_kallsyms_del(fp->aux->func[i]);
534
}
535
536
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
537
{
538
bpf_prog_kallsyms_del_subprogs(fp);
539
bpf_prog_kallsyms_del(fp);
540
}
541
542
#ifdef CONFIG_BPF_JIT
543
/* All BPF JIT sysctl knobs here. */
544
int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
545
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
546
int bpf_jit_harden __read_mostly;
547
long bpf_jit_limit __read_mostly;
548
long bpf_jit_limit_max __read_mostly;
549
550
static void
551
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
552
{
553
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
554
555
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
556
prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
557
}
558
559
static void
560
bpf_prog_ksym_set_name(struct bpf_prog *prog)
561
{
562
char *sym = prog->aux->ksym.name;
563
const char *end = sym + KSYM_NAME_LEN;
564
const struct btf_type *type;
565
const char *func_name;
566
567
BUILD_BUG_ON(sizeof("bpf_prog_") +
568
sizeof(prog->tag) * 2 +
569
/* name has been null terminated.
570
* We should need +1 for the '_' preceding
571
* the name. However, the null character
572
* is double counted between the name and the
573
* sizeof("bpf_prog_") above, so we omit
574
* the +1 here.
575
*/
576
sizeof(prog->aux->name) > KSYM_NAME_LEN);
577
578
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
579
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
580
581
/* prog->aux->name will be ignored if full btf name is available */
582
if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
583
type = btf_type_by_id(prog->aux->btf,
584
prog->aux->func_info[prog->aux->func_idx].type_id);
585
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
586
snprintf(sym, (size_t)(end - sym), "_%s", func_name);
587
return;
588
}
589
590
if (prog->aux->name[0])
591
snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
592
else
593
*sym = 0;
594
}
595
596
static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
597
{
598
return container_of(n, struct bpf_ksym, tnode)->start;
599
}
600
601
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
602
struct latch_tree_node *b)
603
{
604
return bpf_get_ksym_start(a) < bpf_get_ksym_start(b);
605
}
606
607
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
608
{
609
unsigned long val = (unsigned long)key;
610
const struct bpf_ksym *ksym;
611
612
ksym = container_of(n, struct bpf_ksym, tnode);
613
614
if (val < ksym->start)
615
return -1;
616
/* Ensure that we detect return addresses as part of the program, when
617
* the final instruction is a call for a program part of the stack
618
* trace. Therefore, do val > ksym->end instead of val >= ksym->end.
619
*/
620
if (val > ksym->end)
621
return 1;
622
623
return 0;
624
}
625
626
static const struct latch_tree_ops bpf_tree_ops = {
627
.less = bpf_tree_less,
628
.comp = bpf_tree_comp,
629
};
630
631
static DEFINE_SPINLOCK(bpf_lock);
632
static LIST_HEAD(bpf_kallsyms);
633
static struct latch_tree_root bpf_tree __cacheline_aligned;
634
635
void bpf_ksym_add(struct bpf_ksym *ksym)
636
{
637
spin_lock_bh(&bpf_lock);
638
WARN_ON_ONCE(!list_empty(&ksym->lnode));
639
list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
640
latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
641
spin_unlock_bh(&bpf_lock);
642
}
643
644
static void __bpf_ksym_del(struct bpf_ksym *ksym)
645
{
646
if (list_empty(&ksym->lnode))
647
return;
648
649
latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
650
list_del_rcu(&ksym->lnode);
651
}
652
653
void bpf_ksym_del(struct bpf_ksym *ksym)
654
{
655
spin_lock_bh(&bpf_lock);
656
__bpf_ksym_del(ksym);
657
spin_unlock_bh(&bpf_lock);
658
}
659
660
static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
661
{
662
return fp->jited && !bpf_prog_was_classic(fp);
663
}
664
665
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
666
{
667
if (!bpf_prog_kallsyms_candidate(fp) ||
668
!bpf_token_capable(fp->aux->token, CAP_BPF))
669
return;
670
671
bpf_prog_ksym_set_addr(fp);
672
bpf_prog_ksym_set_name(fp);
673
fp->aux->ksym.prog = true;
674
675
bpf_ksym_add(&fp->aux->ksym);
676
677
#ifdef CONFIG_FINEIBT
678
/*
679
* When FineIBT, code in the __cfi_foo() symbols can get executed
680
* and hence unwinder needs help.
681
*/
682
if (cfi_mode != CFI_FINEIBT)
683
return;
684
685
snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
686
"__cfi_%s", fp->aux->ksym.name);
687
688
fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
689
fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
690
691
bpf_ksym_add(&fp->aux->ksym_prefix);
692
#endif
693
}
694
695
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
696
{
697
if (!bpf_prog_kallsyms_candidate(fp))
698
return;
699
700
bpf_ksym_del(&fp->aux->ksym);
701
#ifdef CONFIG_FINEIBT
702
if (cfi_mode != CFI_FINEIBT)
703
return;
704
bpf_ksym_del(&fp->aux->ksym_prefix);
705
#endif
706
}
707
708
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
709
{
710
struct latch_tree_node *n;
711
712
n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
713
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
714
}
715
716
int __bpf_address_lookup(unsigned long addr, unsigned long *size,
717
unsigned long *off, char *sym)
718
{
719
struct bpf_ksym *ksym;
720
int ret = 0;
721
722
rcu_read_lock();
723
ksym = bpf_ksym_find(addr);
724
if (ksym) {
725
unsigned long symbol_start = ksym->start;
726
unsigned long symbol_end = ksym->end;
727
728
ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);
729
730
if (size)
731
*size = symbol_end - symbol_start;
732
if (off)
733
*off = addr - symbol_start;
734
}
735
rcu_read_unlock();
736
737
return ret;
738
}
739
740
bool is_bpf_text_address(unsigned long addr)
741
{
742
bool ret;
743
744
rcu_read_lock();
745
ret = bpf_ksym_find(addr) != NULL;
746
rcu_read_unlock();
747
748
return ret;
749
}
750
751
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
752
{
753
struct bpf_ksym *ksym;
754
755
WARN_ON_ONCE(!rcu_read_lock_held());
756
ksym = bpf_ksym_find(addr);
757
758
return ksym && ksym->prog ?
759
container_of(ksym, struct bpf_prog_aux, ksym)->prog :
760
NULL;
761
}
762
763
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
764
{
765
const struct exception_table_entry *e = NULL;
766
struct bpf_prog *prog;
767
768
rcu_read_lock();
769
prog = bpf_prog_ksym_find(addr);
770
if (!prog)
771
goto out;
772
if (!prog->aux->num_exentries)
773
goto out;
774
775
e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
776
out:
777
rcu_read_unlock();
778
return e;
779
}
780
781
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
782
char *sym)
783
{
784
struct bpf_ksym *ksym;
785
unsigned int it = 0;
786
int ret = -ERANGE;
787
788
if (!bpf_jit_kallsyms_enabled())
789
return ret;
790
791
rcu_read_lock();
792
list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
793
if (it++ != symnum)
794
continue;
795
796
strscpy(sym, ksym->name, KSYM_NAME_LEN);
797
798
*value = ksym->start;
799
*type = BPF_SYM_ELF_TYPE;
800
801
ret = 0;
802
break;
803
}
804
rcu_read_unlock();
805
806
return ret;
807
}
808
809
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
810
struct bpf_jit_poke_descriptor *poke)
811
{
812
struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
813
static const u32 poke_tab_max = 1024;
814
u32 slot = prog->aux->size_poke_tab;
815
u32 size = slot + 1;
816
817
if (size > poke_tab_max)
818
return -ENOSPC;
819
if (poke->tailcall_target || poke->tailcall_target_stable ||
820
poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
821
return -EINVAL;
822
823
switch (poke->reason) {
824
case BPF_POKE_REASON_TAIL_CALL:
825
if (!poke->tail_call.map)
826
return -EINVAL;
827
break;
828
default:
829
return -EINVAL;
830
}
831
832
tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
833
if (!tab)
834
return -ENOMEM;
835
836
memcpy(&tab[slot], poke, sizeof(*poke));
837
prog->aux->size_poke_tab = size;
838
prog->aux->poke_tab = tab;
839
840
return slot;
841
}
842
843
/*
844
* BPF program pack allocator.
845
*
846
* Most BPF programs are pretty small. Allocating a hole page for each
847
* program is sometime a waste. Many small bpf program also adds pressure
848
* to instruction TLB. To solve this issue, we introduce a BPF program pack
849
* allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
850
* to host BPF programs.
851
*/
852
#define BPF_PROG_CHUNK_SHIFT 6
853
#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
854
#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
855
856
struct bpf_prog_pack {
857
struct list_head list;
858
void *ptr;
859
unsigned long bitmap[];
860
};
861
862
void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
863
{
864
memset(area, 0, size);
865
}
866
867
#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
868
869
static DEFINE_MUTEX(pack_mutex);
870
static LIST_HEAD(pack_list);
871
872
/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
873
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
874
*/
875
#ifdef PMD_SIZE
876
/* PMD_SIZE is really big for some archs. It doesn't make sense to
877
* reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
878
* 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
879
* greater than or equal to 2MB.
880
*/
881
#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
882
#else
883
#define BPF_PROG_PACK_SIZE PAGE_SIZE
884
#endif
885
886
#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
887
888
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
889
{
890
struct bpf_prog_pack *pack;
891
int err;
892
893
pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
894
GFP_KERNEL);
895
if (!pack)
896
return NULL;
897
pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
898
if (!pack->ptr)
899
goto out;
900
bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
901
bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
902
903
set_vm_flush_reset_perms(pack->ptr);
904
err = set_memory_rox((unsigned long)pack->ptr,
905
BPF_PROG_PACK_SIZE / PAGE_SIZE);
906
if (err)
907
goto out;
908
list_add_tail(&pack->list, &pack_list);
909
return pack;
910
911
out:
912
bpf_jit_free_exec(pack->ptr);
913
kfree(pack);
914
return NULL;
915
}
916
917
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
918
{
919
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
920
struct bpf_prog_pack *pack;
921
unsigned long pos;
922
void *ptr = NULL;
923
924
mutex_lock(&pack_mutex);
925
if (size > BPF_PROG_PACK_SIZE) {
926
size = round_up(size, PAGE_SIZE);
927
ptr = bpf_jit_alloc_exec(size);
928
if (ptr) {
929
int err;
930
931
bpf_fill_ill_insns(ptr, size);
932
set_vm_flush_reset_perms(ptr);
933
err = set_memory_rox((unsigned long)ptr,
934
size / PAGE_SIZE);
935
if (err) {
936
bpf_jit_free_exec(ptr);
937
ptr = NULL;
938
}
939
}
940
goto out;
941
}
942
list_for_each_entry(pack, &pack_list, list) {
943
pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
944
nbits, 0);
945
if (pos < BPF_PROG_CHUNK_COUNT)
946
goto found_free_area;
947
}
948
949
pack = alloc_new_pack(bpf_fill_ill_insns);
950
if (!pack)
951
goto out;
952
953
pos = 0;
954
955
found_free_area:
956
bitmap_set(pack->bitmap, pos, nbits);
957
ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
958
959
out:
960
mutex_unlock(&pack_mutex);
961
return ptr;
962
}
963
964
void bpf_prog_pack_free(void *ptr, u32 size)
965
{
966
struct bpf_prog_pack *pack = NULL, *tmp;
967
unsigned int nbits;
968
unsigned long pos;
969
970
mutex_lock(&pack_mutex);
971
if (size > BPF_PROG_PACK_SIZE) {
972
bpf_jit_free_exec(ptr);
973
goto out;
974
}
975
976
list_for_each_entry(tmp, &pack_list, list) {
977
if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
978
pack = tmp;
979
break;
980
}
981
}
982
983
if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
984
goto out;
985
986
nbits = BPF_PROG_SIZE_TO_NBITS(size);
987
pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
988
989
WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
990
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
991
992
bitmap_clear(pack->bitmap, pos, nbits);
993
if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
994
BPF_PROG_CHUNK_COUNT, 0) == 0) {
995
list_del(&pack->list);
996
bpf_jit_free_exec(pack->ptr);
997
kfree(pack);
998
}
999
out:
1000
mutex_unlock(&pack_mutex);
1001
}
1002
1003
static atomic_long_t bpf_jit_current;
1004
1005
/* Can be overridden by an arch's JIT compiler if it has a custom,
1006
* dedicated BPF backend memory area, or if neither of the two
1007
* below apply.
1008
*/
1009
u64 __weak bpf_jit_alloc_exec_limit(void)
1010
{
1011
#if defined(MODULES_VADDR)
1012
return MODULES_END - MODULES_VADDR;
1013
#else
1014
return VMALLOC_END - VMALLOC_START;
1015
#endif
1016
}
1017
1018
static int __init bpf_jit_charge_init(void)
1019
{
1020
/* Only used as heuristic here to derive limit. */
1021
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
1022
bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
1023
PAGE_SIZE), LONG_MAX);
1024
return 0;
1025
}
1026
pure_initcall(bpf_jit_charge_init);
1027
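/* Charge @size bytes of JIT image memory against the global bpf_jit_limit.
 * Callers without bpf_capable() get -EPERM once the limit is exceeded and
 * the charge is rolled back; privileged callers may exceed the limit.
 */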
1028
int bpf_jit_charge_modmem(u32 size)
1029
{
1030
if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
1031
if (!bpf_capable()) {
1032
atomic_long_sub(size, &bpf_jit_current);
1033
return -EPERM;
1034
}
1035
}
1036
1037
return 0;
1038
}
1039
1040
void bpf_jit_uncharge_modmem(u32 size)
1041
{
1042
atomic_long_sub(size, &bpf_jit_current);
1043
}
1044
1045
void *__weak bpf_jit_alloc_exec(unsigned long size)
1046
{
1047
return execmem_alloc(EXECMEM_BPF, size);
1048
}
1049
1050
void __weak bpf_jit_free_exec(void *addr)
1051
{
1052
execmem_free(addr);
1053
}
1054
1055
struct bpf_binary_header *
1056
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
1057
unsigned int alignment,
1058
bpf_jit_fill_hole_t bpf_fill_ill_insns)
1059
{
1060
struct bpf_binary_header *hdr;
1061
u32 size, hole, start;
1062
1063
WARN_ON_ONCE(!is_power_of_2(alignment) ||
1064
alignment > BPF_IMAGE_ALIGNMENT);
1065
1066
/* Most of BPF filters are really small, but if some of them
1067
* fill a page, allow at least 128 extra bytes to insert a
1068
* random section of illegal instructions.
1069
*/
1070
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
1071
1072
if (bpf_jit_charge_modmem(size))
1073
return NULL;
1074
hdr = bpf_jit_alloc_exec(size);
1075
if (!hdr) {
1076
bpf_jit_uncharge_modmem(size);
1077
return NULL;
1078
}
1079
1080
/* Fill space with illegal/arch-dep instructions. */
1081
bpf_fill_ill_insns(hdr, size);
1082
1083
hdr->size = size;
1084
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
1085
PAGE_SIZE - sizeof(*hdr));
1086
start = get_random_u32_below(hole) & ~(alignment - 1);
1087
1088
/* Leave a random number of instructions before BPF code. */
1089
*image_ptr = &hdr->image[start];
1090
1091
return hdr;
1092
}
1093
1094
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
1095
{
1096
u32 size = hdr->size;
1097
1098
bpf_jit_free_exec(hdr);
1099
bpf_jit_uncharge_modmem(size);
1100
}
1101
1102
/* Allocate jit binary from bpf_prog_pack allocator.
1103
* Since the allocated memory is RO+X, the JIT engine cannot write directly
1104
* to the memory. To solve this problem, a RW buffer is also allocated at
1105
* as the same time. The JIT engine should calculate offsets based on the
1106
* RO memory address, but write JITed program to the RW buffer. Once the
1107
* JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
1108
* the JITed program to the RO memory.
1109
*/
1110
struct bpf_binary_header *
1111
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
1112
unsigned int alignment,
1113
struct bpf_binary_header **rw_header,
1114
u8 **rw_image,
1115
bpf_jit_fill_hole_t bpf_fill_ill_insns)
1116
{
1117
struct bpf_binary_header *ro_header;
1118
u32 size, hole, start;
1119
1120
WARN_ON_ONCE(!is_power_of_2(alignment) ||
1121
alignment > BPF_IMAGE_ALIGNMENT);
1122
1123
/* add 16 bytes for a random section of illegal instructions */
1124
size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
1125
1126
if (bpf_jit_charge_modmem(size))
1127
return NULL;
1128
ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
1129
if (!ro_header) {
1130
bpf_jit_uncharge_modmem(size);
1131
return NULL;
1132
}
1133
1134
*rw_header = kvmalloc(size, GFP_KERNEL);
1135
if (!*rw_header) {
1136
bpf_prog_pack_free(ro_header, size);
1137
bpf_jit_uncharge_modmem(size);
1138
return NULL;
1139
}
1140
1141
/* Fill space with illegal/arch-dep instructions. */
1142
bpf_fill_ill_insns(*rw_header, size);
1143
(*rw_header)->size = size;
1144
1145
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
1146
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
1147
start = get_random_u32_below(hole) & ~(alignment - 1);
1148
1149
*image_ptr = &ro_header->image[start];
1150
*rw_image = &(*rw_header)->image[start];
1151
1152
return ro_header;
1153
}
1154
1155
/* Copy JITed text from rw_header to its final location, the ro_header. */
1156
int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
1157
struct bpf_binary_header *rw_header)
1158
{
1159
void *ptr;
1160
1161
ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
1162
1163
kvfree(rw_header);
1164
1165
if (IS_ERR(ptr)) {
1166
bpf_prog_pack_free(ro_header, ro_header->size);
1167
return PTR_ERR(ptr);
1168
}
1169
return 0;
1170
}
1171
1172
/* bpf_jit_binary_pack_free is called in two different scenarios:
1173
* 1) when the program is freed after;
1174
* 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
1175
* For case 2), we need to free both the RO memory and the RW buffer.
1176
*
1177
* bpf_jit_binary_pack_free requires proper ro_header->size. However,
1178
* bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
1179
* must be set with either bpf_jit_binary_pack_finalize (normal path) or
1180
* bpf_arch_text_copy (when jit fails).
1181
*/
1182
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
1183
struct bpf_binary_header *rw_header)
1184
{
1185
u32 size = ro_header->size;
1186
1187
bpf_prog_pack_free(ro_header, size);
1188
kvfree(rw_header);
1189
bpf_jit_uncharge_modmem(size);
1190
}
1191
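/* Recover the pack-allocator header of a JITed program by rounding its
 * bpf_func address down to the enclosing BPF_PROG_CHUNK_SIZE boundary.
 */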
1192
struct bpf_binary_header *
1193
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
1194
{
1195
unsigned long real_start = (unsigned long)fp->bpf_func;
1196
unsigned long addr;
1197
1198
addr = real_start & BPF_PROG_CHUNK_MASK;
1199
return (void *)addr;
1200
}
1201
1202
static inline struct bpf_binary_header *
1203
bpf_jit_binary_hdr(const struct bpf_prog *fp)
1204
{
1205
unsigned long real_start = (unsigned long)fp->bpf_func;
1206
unsigned long addr;
1207
1208
addr = real_start & PAGE_MASK;
1209
return (void *)addr;
1210
}
1211
1212
/* This symbol is only overridden by archs that have different
1213
* requirements than the usual eBPF JITs, f.e. when they only
1214
* implement cBPF JIT, do not set images read-only, etc.
1215
*/
1216
void __weak bpf_jit_free(struct bpf_prog *fp)
1217
{
1218
if (fp->jited) {
1219
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
1220
1221
bpf_jit_binary_free(hdr);
1222
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
1223
}
1224
1225
bpf_prog_unlock_free(fp);
1226
}
1227
1228
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
1229
const struct bpf_insn *insn, bool extra_pass,
1230
u64 *func_addr, bool *func_addr_fixed)
1231
{
1232
s16 off = insn->off;
1233
s32 imm = insn->imm;
1234
u8 *addr;
1235
int err;
1236
1237
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
1238
if (!*func_addr_fixed) {
1239
/* Place-holder address till the last pass has collected
1240
* all addresses for JITed subprograms in which case we
1241
* can pick them up from prog->aux.
1242
*/
1243
if (!extra_pass)
1244
addr = NULL;
1245
else if (prog->aux->func &&
1246
off >= 0 && off < prog->aux->real_func_cnt)
1247
addr = (u8 *)prog->aux->func[off]->bpf_func;
1248
else
1249
return -EINVAL;
1250
} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
1251
bpf_jit_supports_far_kfunc_call()) {
1252
err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
1253
if (err)
1254
return err;
1255
} else {
1256
/* Address of a BPF helper call. Since part of the core
1257
* kernel, it's always at a fixed location. __bpf_call_base
1258
* and the helper with imm relative to it are both in core
1259
* kernel.
1260
*/
1261
addr = (u8 *)__bpf_call_base + imm;
1262
}
1263
1264
*func_addr = (unsigned long)addr;
1265
return 0;
1266
}
1267
1268
const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
1269
{
1270
if (prog->aux->ksym.prog)
1271
return prog->aux->ksym.name;
1272
return prog->aux->name;
1273
}
1274
1275
static int bpf_jit_blind_insn(const struct bpf_insn *from,
1276
const struct bpf_insn *aux,
1277
struct bpf_insn *to_buff,
1278
bool emit_zext)
1279
{
1280
struct bpf_insn *to = to_buff;
1281
u32 imm_rnd = get_random_u32();
1282
s16 off;
1283
1284
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
1285
BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
1286
1287
/* Constraints on AX register:
1288
*
1289
* AX register is inaccessible from user space. It is mapped in
1290
* all JITs, and used here for constant blinding rewrites. It is
1291
* typically "stateless" meaning its contents are only valid within
1292
* the executed instruction, but not across several instructions.
1293
* There are a few exceptions however which are further detailed
1294
* below.
1295
*
1296
* Constant blinding is only used by JITs, not in the interpreter.
1297
* The interpreter uses AX in some occasions as a local temporary
1298
* register e.g. in DIV or MOD instructions.
1299
*
1300
* In restricted circumstances, the verifier can also use the AX
1301
* register for rewrites as long as they do not interfere with
1302
* the above cases!
1303
*/
1304
if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
1305
goto out;
1306
1307
if (from->imm == 0 &&
1308
(from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
1309
from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
1310
*to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
1311
goto out;
1312
}
1313
1314
switch (from->code) {
1315
case BPF_ALU | BPF_ADD | BPF_K:
1316
case BPF_ALU | BPF_SUB | BPF_K:
1317
case BPF_ALU | BPF_AND | BPF_K:
1318
case BPF_ALU | BPF_OR | BPF_K:
1319
case BPF_ALU | BPF_XOR | BPF_K:
1320
case BPF_ALU | BPF_MUL | BPF_K:
1321
case BPF_ALU | BPF_MOV | BPF_K:
1322
case BPF_ALU | BPF_DIV | BPF_K:
1323
case BPF_ALU | BPF_MOD | BPF_K:
1324
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1325
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1326
*to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
1327
break;
1328
1329
case BPF_ALU64 | BPF_ADD | BPF_K:
1330
case BPF_ALU64 | BPF_SUB | BPF_K:
1331
case BPF_ALU64 | BPF_AND | BPF_K:
1332
case BPF_ALU64 | BPF_OR | BPF_K:
1333
case BPF_ALU64 | BPF_XOR | BPF_K:
1334
case BPF_ALU64 | BPF_MUL | BPF_K:
1335
case BPF_ALU64 | BPF_MOV | BPF_K:
1336
case BPF_ALU64 | BPF_DIV | BPF_K:
1337
case BPF_ALU64 | BPF_MOD | BPF_K:
1338
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1339
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1340
*to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
1341
break;
1342
1343
case BPF_JMP | BPF_JEQ | BPF_K:
1344
case BPF_JMP | BPF_JNE | BPF_K:
1345
case BPF_JMP | BPF_JGT | BPF_K:
1346
case BPF_JMP | BPF_JLT | BPF_K:
1347
case BPF_JMP | BPF_JGE | BPF_K:
1348
case BPF_JMP | BPF_JLE | BPF_K:
1349
case BPF_JMP | BPF_JSGT | BPF_K:
1350
case BPF_JMP | BPF_JSLT | BPF_K:
1351
case BPF_JMP | BPF_JSGE | BPF_K:
1352
case BPF_JMP | BPF_JSLE | BPF_K:
1353
case BPF_JMP | BPF_JSET | BPF_K:
1354
/* Accommodate for extra offset in case of a backjump. */
1355
off = from->off;
1356
if (off < 0)
1357
off -= 2;
1358
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1359
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1360
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
1361
break;
1362
1363
case BPF_JMP32 | BPF_JEQ | BPF_K:
1364
case BPF_JMP32 | BPF_JNE | BPF_K:
1365
case BPF_JMP32 | BPF_JGT | BPF_K:
1366
case BPF_JMP32 | BPF_JLT | BPF_K:
1367
case BPF_JMP32 | BPF_JGE | BPF_K:
1368
case BPF_JMP32 | BPF_JLE | BPF_K:
1369
case BPF_JMP32 | BPF_JSGT | BPF_K:
1370
case BPF_JMP32 | BPF_JSLT | BPF_K:
1371
case BPF_JMP32 | BPF_JSGE | BPF_K:
1372
case BPF_JMP32 | BPF_JSLE | BPF_K:
1373
case BPF_JMP32 | BPF_JSET | BPF_K:
1374
/* Accommodate for extra offset in case of a backjump. */
1375
off = from->off;
1376
if (off < 0)
1377
off -= 2;
1378
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1379
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1380
*to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
1381
off);
1382
break;
1383
1384
case BPF_LD | BPF_IMM | BPF_DW:
1385
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
1386
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1387
*to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
1388
*to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
1389
break;
1390
case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
1391
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
1392
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1393
if (emit_zext)
1394
*to++ = BPF_ZEXT_REG(BPF_REG_AX);
1395
*to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
1396
break;
1397
1398
case BPF_ST | BPF_MEM | BPF_DW:
1399
case BPF_ST | BPF_MEM | BPF_W:
1400
case BPF_ST | BPF_MEM | BPF_H:
1401
case BPF_ST | BPF_MEM | BPF_B:
1402
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
1403
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1404
*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
1405
break;
1406
}
1407
out:
1408
return to - to_buff;
1409
}
1410
1411
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
1412
gfp_t gfp_extra_flags)
1413
{
1414
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
1415
struct bpf_prog *fp;
1416
1417
fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
1418
if (fp != NULL) {
1419
/* aux->prog still points to the fp_other one, so
1420
* when promoting the clone to the real program,
1421
* this still needs to be adapted.
1422
*/
1423
memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
1424
}
1425
1426
return fp;
1427
}
1428
1429
static void bpf_prog_clone_free(struct bpf_prog *fp)
1430
{
1431
/* aux was stolen by the other clone, so we cannot free
1432
* it from this path! It will be freed eventually by the
1433
* other program on release.
1434
*
1435
* At this point, we don't need a deferred release since
1436
* clone is guaranteed to not be locked.
1437
*/
1438
fp->aux = NULL;
1439
fp->stats = NULL;
1440
fp->active = NULL;
1441
__bpf_prog_free(fp);
1442
}
1443
1444
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
1445
{
1446
/* We have to repoint aux->prog to self, as we don't
1447
* know whether fp here is the clone or the original.
1448
*/
1449
fp->aux->prog = fp;
1450
bpf_prog_clone_free(fp_other);
1451
}
1452
1453
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
1454
{
1455
struct bpf_insn insn_buff[16], aux[2];
1456
struct bpf_prog *clone, *tmp;
1457
int insn_delta, insn_cnt;
1458
struct bpf_insn *insn;
1459
int i, rewritten;
1460
1461
if (!prog->blinding_requested || prog->blinded)
1462
return prog;
1463
1464
clone = bpf_prog_clone_create(prog, GFP_USER);
1465
if (!clone)
1466
return ERR_PTR(-ENOMEM);
1467
1468
insn_cnt = clone->len;
1469
insn = clone->insnsi;
1470
1471
for (i = 0; i < insn_cnt; i++, insn++) {
1472
if (bpf_pseudo_func(insn)) {
1473
/* ld_imm64 with an address of bpf subprog is not
1474
* a user controlled constant. Don't randomize it,
1475
* since it will conflict with jit_subprogs() logic.
1476
*/
1477
insn++;
1478
i++;
1479
continue;
1480
}
1481
1482
/* We temporarily need to hold the original ld64 insn
1483
* so that we can still access the first part in the
1484
* second blinding run.
1485
*/
1486
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
1487
insn[1].code == 0)
1488
memcpy(aux, insn, sizeof(aux));
1489
1490
rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
1491
clone->aux->verifier_zext);
1492
if (!rewritten)
1493
continue;
1494
1495
tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
1496
if (IS_ERR(tmp)) {
1497
/* Patching may have repointed aux->prog during
1498
* realloc from the original one, so we need to
1499
* fix it up here on error.
1500
*/
1501
bpf_jit_prog_release_other(prog, clone);
1502
return tmp;
1503
}
1504
1505
clone = tmp;
1506
insn_delta = rewritten - 1;
1507
1508
/* Walk new program and skip insns we just inserted. */
1509
insn = clone->insnsi + i + insn_delta;
1510
insn_cnt += insn_delta;
1511
i += insn_delta;
1512
}
1513
1514
clone->blinded = 1;
1515
return clone;
1516
}
1517
#endif /* CONFIG_BPF_JIT */
1518
1519
/* Base function for offset calculation. Needs to go into .text section,
1520
* therefore keeping it non-static as well; will also be used by JITs
1521
* anyway later on, so do not let the compiler omit it. This also needs
1522
* to go into kallsyms for correlation from e.g. bpftool, so naming
1523
* must not change.
1524
*/
1525
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1526
{
1527
return 0;
1528
}
1529
EXPORT_SYMBOL_GPL(__bpf_call_base);
1530
1531
/* All UAPI available opcodes. */
1532
#define BPF_INSN_MAP(INSN_2, INSN_3) \
1533
/* 32 bit ALU operations. */ \
1534
/* Register based. */ \
1535
INSN_3(ALU, ADD, X), \
1536
INSN_3(ALU, SUB, X), \
1537
INSN_3(ALU, AND, X), \
1538
INSN_3(ALU, OR, X), \
1539
INSN_3(ALU, LSH, X), \
1540
INSN_3(ALU, RSH, X), \
1541
INSN_3(ALU, XOR, X), \
1542
INSN_3(ALU, MUL, X), \
1543
INSN_3(ALU, MOV, X), \
1544
INSN_3(ALU, ARSH, X), \
1545
INSN_3(ALU, DIV, X), \
1546
INSN_3(ALU, MOD, X), \
1547
INSN_2(ALU, NEG), \
1548
INSN_3(ALU, END, TO_BE), \
1549
INSN_3(ALU, END, TO_LE), \
1550
/* Immediate based. */ \
1551
INSN_3(ALU, ADD, K), \
1552
INSN_3(ALU, SUB, K), \
1553
INSN_3(ALU, AND, K), \
1554
INSN_3(ALU, OR, K), \
1555
INSN_3(ALU, LSH, K), \
1556
INSN_3(ALU, RSH, K), \
1557
INSN_3(ALU, XOR, K), \
1558
INSN_3(ALU, MUL, K), \
1559
INSN_3(ALU, MOV, K), \
1560
INSN_3(ALU, ARSH, K), \
1561
INSN_3(ALU, DIV, K), \
1562
INSN_3(ALU, MOD, K), \
1563
/* 64 bit ALU operations. */ \
1564
/* Register based. */ \
1565
INSN_3(ALU64, ADD, X), \
1566
INSN_3(ALU64, SUB, X), \
1567
INSN_3(ALU64, AND, X), \
1568
INSN_3(ALU64, OR, X), \
1569
INSN_3(ALU64, LSH, X), \
1570
INSN_3(ALU64, RSH, X), \
1571
INSN_3(ALU64, XOR, X), \
1572
INSN_3(ALU64, MUL, X), \
1573
INSN_3(ALU64, MOV, X), \
1574
INSN_3(ALU64, ARSH, X), \
1575
INSN_3(ALU64, DIV, X), \
1576
INSN_3(ALU64, MOD, X), \
1577
INSN_2(ALU64, NEG), \
1578
INSN_3(ALU64, END, TO_LE), \
1579
/* Immediate based. */ \
1580
INSN_3(ALU64, ADD, K), \
1581
INSN_3(ALU64, SUB, K), \
1582
INSN_3(ALU64, AND, K), \
1583
INSN_3(ALU64, OR, K), \
1584
INSN_3(ALU64, LSH, K), \
1585
INSN_3(ALU64, RSH, K), \
1586
INSN_3(ALU64, XOR, K), \
1587
INSN_3(ALU64, MUL, K), \
1588
INSN_3(ALU64, MOV, K), \
1589
INSN_3(ALU64, ARSH, K), \
1590
INSN_3(ALU64, DIV, K), \
1591
INSN_3(ALU64, MOD, K), \
1592
/* Call instruction. */ \
1593
INSN_2(JMP, CALL), \
1594
/* Exit instruction. */ \
1595
INSN_2(JMP, EXIT), \
1596
/* 32-bit Jump instructions. */ \
1597
/* Register based. */ \
1598
INSN_3(JMP32, JEQ, X), \
1599
INSN_3(JMP32, JNE, X), \
1600
INSN_3(JMP32, JGT, X), \
1601
INSN_3(JMP32, JLT, X), \
1602
INSN_3(JMP32, JGE, X), \
1603
INSN_3(JMP32, JLE, X), \
1604
INSN_3(JMP32, JSGT, X), \
1605
INSN_3(JMP32, JSLT, X), \
1606
INSN_3(JMP32, JSGE, X), \
1607
INSN_3(JMP32, JSLE, X), \
1608
INSN_3(JMP32, JSET, X), \
1609
/* Immediate based. */ \
1610
INSN_3(JMP32, JEQ, K), \
1611
INSN_3(JMP32, JNE, K), \
1612
INSN_3(JMP32, JGT, K), \
1613
INSN_3(JMP32, JLT, K), \
1614
INSN_3(JMP32, JGE, K), \
1615
INSN_3(JMP32, JLE, K), \
1616
INSN_3(JMP32, JSGT, K), \
1617
INSN_3(JMP32, JSLT, K), \
1618
INSN_3(JMP32, JSGE, K), \
1619
INSN_3(JMP32, JSLE, K), \
1620
INSN_3(JMP32, JSET, K), \
1621
/* Jump instructions. */ \
1622
/* Register based. */ \
1623
INSN_3(JMP, JEQ, X), \
1624
INSN_3(JMP, JNE, X), \
1625
INSN_3(JMP, JGT, X), \
1626
INSN_3(JMP, JLT, X), \
1627
INSN_3(JMP, JGE, X), \
1628
INSN_3(JMP, JLE, X), \
1629
INSN_3(JMP, JSGT, X), \
1630
INSN_3(JMP, JSLT, X), \
1631
INSN_3(JMP, JSGE, X), \
1632
INSN_3(JMP, JSLE, X), \
1633
INSN_3(JMP, JSET, X), \
1634
/* Immediate based. */ \
1635
INSN_3(JMP, JEQ, K), \
1636
INSN_3(JMP, JNE, K), \
1637
INSN_3(JMP, JGT, K), \
1638
INSN_3(JMP, JLT, K), \
1639
INSN_3(JMP, JGE, K), \
1640
INSN_3(JMP, JLE, K), \
1641
INSN_3(JMP, JSGT, K), \
1642
INSN_3(JMP, JSLT, K), \
1643
INSN_3(JMP, JSGE, K), \
1644
INSN_3(JMP, JSLE, K), \
1645
INSN_3(JMP, JSET, K), \
1646
INSN_2(JMP, JA), \
1647
INSN_2(JMP32, JA), \
1648
/* Atomic operations. */ \
1649
INSN_3(STX, ATOMIC, B), \
1650
INSN_3(STX, ATOMIC, H), \
1651
INSN_3(STX, ATOMIC, W), \
1652
INSN_3(STX, ATOMIC, DW), \
1653
/* Store instructions. */ \
1654
/* Register based. */ \
1655
INSN_3(STX, MEM, B), \
1656
INSN_3(STX, MEM, H), \
1657
INSN_3(STX, MEM, W), \
1658
INSN_3(STX, MEM, DW), \
1659
/* Immediate based. */ \
1660
INSN_3(ST, MEM, B), \
1661
INSN_3(ST, MEM, H), \
1662
INSN_3(ST, MEM, W), \
1663
INSN_3(ST, MEM, DW), \
1664
/* Load instructions. */ \
1665
/* Register based. */ \
1666
INSN_3(LDX, MEM, B), \
1667
INSN_3(LDX, MEM, H), \
1668
INSN_3(LDX, MEM, W), \
1669
INSN_3(LDX, MEM, DW), \
1670
INSN_3(LDX, MEMSX, B), \
1671
INSN_3(LDX, MEMSX, H), \
1672
INSN_3(LDX, MEMSX, W), \
1673
/* Immediate based. */ \
1674
INSN_3(LD, IMM, DW)
1675
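/* Return true if @code is an opcode exposed via UAPI: either part of the
 * instruction map above or one of the additionally listed opcodes that are
 * rewritten before execution.
 */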
1676
bool bpf_opcode_in_insntable(u8 code)
1677
{
1678
#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true
1679
#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
1680
static const bool public_insntable[256] = {
1681
[0 ... 255] = false,
1682
/* Now overwrite non-defaults ... */
1683
BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
1684
/* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
1685
[BPF_LD | BPF_ABS | BPF_B] = true,
1686
[BPF_LD | BPF_ABS | BPF_H] = true,
1687
[BPF_LD | BPF_ABS | BPF_W] = true,
1688
[BPF_LD | BPF_IND | BPF_B] = true,
1689
[BPF_LD | BPF_IND | BPF_H] = true,
1690
[BPF_LD | BPF_IND | BPF_W] = true,
1691
[BPF_JMP | BPF_JCOND] = true,
1692
};
1693
#undef BPF_INSN_3_TBL
1694
#undef BPF_INSN_2_TBL
1695
return public_insntable[code];
1696
}
1697
1698
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1699
/**
1700
* ___bpf_prog_run - run eBPF program on a given context
1701
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
1702
* @insn: is the array of eBPF instructions
1703
*
1704
* Decode and execute eBPF instructions.
1705
*
1706
* Return: whatever value is in %BPF_R0 at program exit
1707
*/
1708
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
1709
{
1710
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
1711
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
1712
static const void * const jumptable[256] __annotate_jump_table = {
1713
[0 ... 255] = &&default_label,
1714
/* Now overwrite non-defaults ... */
1715
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
1716
/* Non-UAPI available opcodes. */
1717
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
1718
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
1719
[BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
1720
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
1721
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
1722
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
1723
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
1724
[BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
1725
[BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
1726
[BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
1727
};
1728
#undef BPF_INSN_3_LBL
1729
#undef BPF_INSN_2_LBL
1730
u32 tail_call_cnt = 0;
1731
1732
#define CONT ({ insn++; goto select_insn; })
1733
#define CONT_JMP ({ insn++; goto select_insn; })
1734
1735
select_insn:
1736
goto *jumptable[insn->code];
1737
1738
/* Explicitly mask the register-based shift amounts with 63 or 31
1739
* to avoid undefined behavior. Normally this won't affect the
1740
* generated code, for example, in case of native 64 bit archs such
1741
* as x86-64 or arm64, the compiler is optimizing the AND away for
1742
* the interpreter. In case of JITs, each of the JIT backends compiles
1743
* the BPF shift operations to machine instructions which produce
1744
* implementation-defined results in such a case; the resulting
1745
* contents of the register may be arbitrary, but program behaviour
1746
* as a whole remains defined. In other words, in case of JIT backends,
1747
* the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
1748
*/
1749
/* ALU (shifts) */
1750
#define SHT(OPCODE, OP) \
1751
ALU64_##OPCODE##_X: \
1752
DST = DST OP (SRC & 63); \
1753
CONT; \
1754
ALU_##OPCODE##_X: \
1755
DST = (u32) DST OP ((u32) SRC & 31); \
1756
CONT; \
1757
ALU64_##OPCODE##_K: \
1758
DST = DST OP IMM; \
1759
CONT; \
1760
ALU_##OPCODE##_K: \
1761
DST = (u32) DST OP (u32) IMM; \
1762
CONT;
1763
/* ALU (rest) */
1764
#define ALU(OPCODE, OP) \
1765
ALU64_##OPCODE##_X: \
1766
DST = DST OP SRC; \
1767
CONT; \
1768
ALU_##OPCODE##_X: \
1769
DST = (u32) DST OP (u32) SRC; \
1770
CONT; \
1771
ALU64_##OPCODE##_K: \
1772
DST = DST OP IMM; \
1773
CONT; \
1774
ALU_##OPCODE##_K: \
1775
DST = (u32) DST OP (u32) IMM; \
1776
CONT;
1777
ALU(ADD, +)
1778
ALU(SUB, -)
1779
ALU(AND, &)
1780
ALU(OR, |)
1781
ALU(XOR, ^)
1782
ALU(MUL, *)
1783
SHT(LSH, <<)
1784
SHT(RSH, >>)
1785
#undef SHT
1786
#undef ALU
1787
ALU_NEG:
1788
DST = (u32) -DST;
1789
CONT;
1790
ALU64_NEG:
1791
DST = -DST;
1792
CONT;
1793
ALU_MOV_X:
1794
switch (OFF) {
1795
case 0:
1796
DST = (u32) SRC;
1797
break;
1798
case 8:
1799
DST = (u32)(s8) SRC;
1800
break;
1801
case 16:
1802
DST = (u32)(s16) SRC;
1803
break;
1804
}
1805
CONT;
1806
ALU_MOV_K:
1807
DST = (u32) IMM;
1808
CONT;
1809
ALU64_MOV_X:
1810
switch (OFF) {
1811
case 0:
1812
DST = SRC;
1813
break;
1814
case 8:
1815
DST = (s8) SRC;
1816
break;
1817
case 16:
1818
DST = (s16) SRC;
1819
break;
1820
case 32:
1821
DST = (s32) SRC;
1822
break;
1823
}
1824
CONT;
1825
ALU64_MOV_K:
1826
DST = IMM;
1827
CONT;
1828
LD_IMM_DW:
1829
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
1830
insn++;
1831
CONT;
1832
ALU_ARSH_X:
1833
DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
1834
CONT;
1835
ALU_ARSH_K:
1836
DST = (u64) (u32) (((s32) DST) >> IMM);
1837
CONT;
1838
ALU64_ARSH_X:
1839
(*(s64 *) &DST) >>= (SRC & 63);
1840
CONT;
1841
ALU64_ARSH_K:
1842
(*(s64 *) &DST) >>= IMM;
1843
CONT;
1844
ALU64_MOD_X:
1845
switch (OFF) {
1846
case 0:
1847
div64_u64_rem(DST, SRC, &AX);
1848
DST = AX;
1849
break;
1850
case 1:
1851
AX = div64_s64(DST, SRC);
1852
DST = DST - AX * SRC;
1853
break;
1854
}
1855
CONT;
1856
ALU_MOD_X:
1857
switch (OFF) {
1858
case 0:
1859
AX = (u32) DST;
1860
DST = do_div(AX, (u32) SRC);
1861
break;
1862
case 1:
1863
AX = abs((s32)DST);
1864
AX = do_div(AX, abs((s32)SRC));
1865
if ((s32)DST < 0)
1866
DST = (u32)-AX;
1867
else
1868
DST = (u32)AX;
1869
break;
1870
}
1871
CONT;
1872
ALU64_MOD_K:
1873
switch (OFF) {
1874
case 0:
1875
div64_u64_rem(DST, IMM, &AX);
1876
DST = AX;
1877
break;
1878
case 1:
1879
AX = div64_s64(DST, IMM);
1880
DST = DST - AX * IMM;
1881
break;
1882
}
1883
CONT;
1884
ALU_MOD_K:
1885
switch (OFF) {
1886
case 0:
1887
AX = (u32) DST;
1888
DST = do_div(AX, (u32) IMM);
1889
break;
1890
case 1:
1891
AX = abs((s32)DST);
1892
AX = do_div(AX, abs((s32)IMM));
1893
if ((s32)DST < 0)
1894
DST = (u32)-AX;
1895
else
1896
DST = (u32)AX;
1897
break;
1898
}
1899
CONT;
1900
ALU64_DIV_X:
1901
switch (OFF) {
1902
case 0:
1903
DST = div64_u64(DST, SRC);
1904
break;
1905
case 1:
1906
DST = div64_s64(DST, SRC);
1907
break;
1908
}
1909
CONT;
1910
ALU_DIV_X:
1911
switch (OFF) {
1912
case 0:
1913
AX = (u32) DST;
1914
do_div(AX, (u32) SRC);
1915
DST = (u32) AX;
1916
break;
1917
case 1:
1918
AX = abs((s32)DST);
1919
do_div(AX, abs((s32)SRC));
1920
if (((s32)DST < 0) == ((s32)SRC < 0))
1921
DST = (u32)AX;
1922
else
1923
DST = (u32)-AX;
1924
break;
1925
}
1926
CONT;
1927
ALU64_DIV_K:
1928
switch (OFF) {
1929
case 0:
1930
DST = div64_u64(DST, IMM);
1931
break;
1932
case 1:
1933
DST = div64_s64(DST, IMM);
1934
break;
1935
}
1936
CONT;
1937
ALU_DIV_K:
1938
switch (OFF) {
1939
case 0:
1940
AX = (u32) DST;
1941
do_div(AX, (u32) IMM);
1942
DST = (u32) AX;
1943
break;
1944
case 1:
1945
AX = abs((s32)DST);
1946
do_div(AX, abs((s32)IMM));
1947
if (((s32)DST < 0) == ((s32)IMM < 0))
1948
DST = (u32)AX;
1949
else
1950
DST = (u32)-AX;
1951
break;
1952
}
1953
CONT;
1954
ALU_END_TO_BE:
1955
switch (IMM) {
1956
case 16:
1957
DST = (__force u16) cpu_to_be16(DST);
1958
break;
1959
case 32:
1960
DST = (__force u32) cpu_to_be32(DST);
1961
break;
1962
case 64:
1963
DST = (__force u64) cpu_to_be64(DST);
1964
break;
1965
}
1966
CONT;
1967
ALU_END_TO_LE:
1968
switch (IMM) {
1969
case 16:
1970
DST = (__force u16) cpu_to_le16(DST);
1971
break;
1972
case 32:
1973
DST = (__force u32) cpu_to_le32(DST);
1974
break;
1975
case 64:
1976
DST = (__force u64) cpu_to_le64(DST);
1977
break;
1978
}
1979
CONT;
1980
ALU64_END_TO_LE:
1981
switch (IMM) {
1982
case 16:
1983
DST = (__force u16) __swab16(DST);
1984
break;
1985
case 32:
1986
DST = (__force u32) __swab32(DST);
1987
break;
1988
case 64:
1989
DST = (__force u64) __swab64(DST);
1990
break;
1991
}
1992
CONT;
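/* Note: BPF_END under BPF_ALU converts between host and the requested
* endianness, so the cpu_to_be*()/cpu_to_le*() calls above may be
* no-ops depending on the host. BPF_END under BPF_ALU64 is the
* unconditional byte swap and therefore always goes through __swab*().
*/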
1993
1994
/* CALL */
1995
JMP_CALL:
1996
/* Function call scratches BPF_R1-BPF_R5 registers,
1997
* preserves BPF_R6-BPF_R9, and stores return value
1998
* into BPF_R0.
1999
*/
2000
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
2001
BPF_R4, BPF_R5);
2002
CONT;
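/* Note: for helper calls, insn->imm is not an absolute address but a
* 32-bit delta from __bpf_call_base, which the verifier fixes up at
* load time; adding the two reconstructs the helper's entry point.
*/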
2003
2004
JMP_CALL_ARGS:
2005
BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
2006
BPF_R3, BPF_R4,
2007
BPF_R5,
2008
insn + insn->off + 1);
2009
CONT;
2010
2011
JMP_TAIL_CALL: {
2012
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
2013
struct bpf_array *array = container_of(map, struct bpf_array, map);
2014
struct bpf_prog *prog;
2015
u32 index = BPF_R3;
2016
2017
if (unlikely(index >= array->map.max_entries))
2018
goto out;
2019
2020
if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
2021
goto out;
2022
2023
tail_call_cnt++;
2024
2025
prog = READ_ONCE(array->ptrs[index]);
2026
if (!prog)
2027
goto out;
2028
2029
/* ARG1 at this point is guaranteed to point to CTX from
2030
* the verifier side due to the fact that the tail call is
2031
* handled like a helper, that is, bpf_tail_call_proto,
2032
* where arg1_type is ARG_PTR_TO_CTX.
2033
*/
2034
insn = prog->insnsi;
2035
goto select_insn;
2036
out:
2037
CONT;
2038
}
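/* Note: a successful tail call never returns here; it restarts
* execution at the target program's first instruction while reusing
* the current stack frame. tail_call_cnt bounds the length of such
* chains at MAX_TAIL_CALL_CNT.
*/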
2039
JMP_JA:
2040
insn += insn->off;
2041
CONT;
2042
JMP32_JA:
2043
insn += insn->imm;
2044
CONT;
2045
JMP_EXIT:
2046
return BPF_R0;
2047
/* JMP */
2048
#define COND_JMP(SIGN, OPCODE, CMP_OP) \
2049
JMP_##OPCODE##_X: \
2050
if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
2051
insn += insn->off; \
2052
CONT_JMP; \
2053
} \
2054
CONT; \
2055
JMP32_##OPCODE##_X: \
2056
if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
2057
insn += insn->off; \
2058
CONT_JMP; \
2059
} \
2060
CONT; \
2061
JMP_##OPCODE##_K: \
2062
if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
2063
insn += insn->off; \
2064
CONT_JMP; \
2065
} \
2066
CONT; \
2067
JMP32_##OPCODE##_K: \
2068
if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
2069
insn += insn->off; \
2070
CONT_JMP; \
2071
} \
2072
CONT;
2073
COND_JMP(u, JEQ, ==)
2074
COND_JMP(u, JNE, !=)
2075
COND_JMP(u, JGT, >)
2076
COND_JMP(u, JLT, <)
2077
COND_JMP(u, JGE, >=)
2078
COND_JMP(u, JLE, <=)
2079
COND_JMP(u, JSET, &)
2080
COND_JMP(s, JSGT, >)
2081
COND_JMP(s, JSLT, <)
2082
COND_JMP(s, JSGE, >=)
2083
COND_JMP(s, JSLE, <=)
2084
#undef COND_JMP
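/* For reference, each COND_JMP(SIGN, OPCODE, CMP_OP) instantiation
* above emits four handlers, e.g. COND_JMP(u, JEQ, ==) expands into
* JMP_JEQ_X, JMP32_JEQ_X, JMP_JEQ_K and JMP32_JEQ_K, comparing the full
* 64-bit or low 32-bit register values as unsigned (or as signed for
* the 's' instantiations).
*/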
2085
/* ST, STX and LDX */
2086
ST_NOSPEC:
2087
/* Speculation barrier for mitigating Speculative Store Bypass,
2088
* Bounds-Check Bypass and Type Confusion. In case of arm64, we
2089
* rely on the firmware mitigation as controlled via the ssbd
2090
* kernel parameter. Whenever the mitigation is enabled, it
2091
* works for all of the kernel code with no need to provide any
2092
* additional instructions here. In case of x86, we use 'lfence'
2093
* insn for mitigation. We reuse preexisting logic from Spectre
2094
* v1 mitigation that happens to produce the required code on
2095
* x86 for v4 as well.
2096
*/
2097
barrier_nospec();
2098
CONT;
2099
#define LDST(SIZEOP, SIZE) \
2100
STX_MEM_##SIZEOP: \
2101
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
2102
CONT; \
2103
ST_MEM_##SIZEOP: \
2104
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
2105
CONT; \
2106
LDX_MEM_##SIZEOP: \
2107
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
2108
CONT; \
2109
LDX_PROBE_MEM_##SIZEOP: \
2110
bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
2111
(const void *)(long) (SRC + insn->off)); \
2112
DST = *((SIZE *)&DST); \
2113
CONT;
2114
2115
LDST(B, u8)
2116
LDST(H, u16)
2117
LDST(W, u32)
2118
LDST(DW, u64)
2119
#undef LDST
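/* Note: the LDX_PROBE_MEM_* handlers above (and LDX_PROBE_MEMSX_*
* below) go through bpf_probe_read_kernel_common() instead of a plain
* dereference, so a faulting kernel address zeroes the destination
* rather than oopsing.
*/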
2120
2121
#define LDSX(SIZEOP, SIZE) \
2122
LDX_MEMSX_##SIZEOP: \
2123
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
2124
CONT; \
2125
LDX_PROBE_MEMSX_##SIZEOP: \
2126
bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
2127
(const void *)(long) (SRC + insn->off)); \
2128
DST = *((SIZE *)&DST); \
2129
CONT;
2130
2131
LDSX(B, s8)
2132
LDSX(H, s16)
2133
LDSX(W, s32)
2134
#undef LDSX
2135
2136
#define ATOMIC_ALU_OP(BOP, KOP) \
2137
case BOP: \
2138
if (BPF_SIZE(insn->code) == BPF_W) \
2139
atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
2140
(DST + insn->off)); \
2141
else if (BPF_SIZE(insn->code) == BPF_DW) \
2142
atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
2143
(DST + insn->off)); \
2144
else \
2145
goto default_label; \
2146
break; \
2147
case BOP | BPF_FETCH: \
2148
if (BPF_SIZE(insn->code) == BPF_W) \
2149
SRC = (u32) atomic_fetch_##KOP( \
2150
(u32) SRC, \
2151
(atomic_t *)(unsigned long) (DST + insn->off)); \
2152
else if (BPF_SIZE(insn->code) == BPF_DW) \
2153
SRC = (u64) atomic64_fetch_##KOP( \
2154
(u64) SRC, \
2155
(atomic64_t *)(unsigned long) (DST + insn->off)); \
2156
else \
2157
goto default_label; \
2158
break;
2159
2160
STX_ATOMIC_DW:
2161
STX_ATOMIC_W:
2162
STX_ATOMIC_H:
2163
STX_ATOMIC_B:
2164
switch (IMM) {
2165
/* Atomic read-modify-write instructions support only W and DW
2166
* size modifiers.
2167
*/
2168
ATOMIC_ALU_OP(BPF_ADD, add)
2169
ATOMIC_ALU_OP(BPF_AND, and)
2170
ATOMIC_ALU_OP(BPF_OR, or)
2171
ATOMIC_ALU_OP(BPF_XOR, xor)
2172
#undef ATOMIC_ALU_OP
2173
2174
case BPF_XCHG:
2175
if (BPF_SIZE(insn->code) == BPF_W)
2176
SRC = (u32) atomic_xchg(
2177
(atomic_t *)(unsigned long) (DST + insn->off),
2178
(u32) SRC);
2179
else if (BPF_SIZE(insn->code) == BPF_DW)
2180
SRC = (u64) atomic64_xchg(
2181
(atomic64_t *)(unsigned long) (DST + insn->off),
2182
(u64) SRC);
2183
else
2184
goto default_label;
2185
break;
2186
case BPF_CMPXCHG:
2187
if (BPF_SIZE(insn->code) == BPF_W)
2188
BPF_R0 = (u32) atomic_cmpxchg(
2189
(atomic_t *)(unsigned long) (DST + insn->off),
2190
(u32) BPF_R0, (u32) SRC);
2191
else if (BPF_SIZE(insn->code) == BPF_DW)
2192
BPF_R0 = (u64) atomic64_cmpxchg(
2193
(atomic64_t *)(unsigned long) (DST + insn->off),
2194
(u64) BPF_R0, (u64) SRC);
2195
else
2196
goto default_label;
2197
break;
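/* Note: mirroring the kernel's atomic API, BPF_XCHG returns the old
* memory value in the source register, while BPF_CMPXCHG compares
* against BPF_R0 and leaves the old value in BPF_R0, so a program can
* tell whether the exchange took place.
*/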
2198
/* Atomic load and store instructions support all size
2199
* modifiers.
2200
*/
2201
case BPF_LOAD_ACQ:
2202
switch (BPF_SIZE(insn->code)) {
2203
#define LOAD_ACQUIRE(SIZEOP, SIZE) \
2204
case BPF_##SIZEOP: \
2205
DST = (SIZE)smp_load_acquire( \
2206
(SIZE *)(unsigned long)(SRC + insn->off)); \
2207
break;
2208
LOAD_ACQUIRE(B, u8)
2209
LOAD_ACQUIRE(H, u16)
2210
LOAD_ACQUIRE(W, u32)
2211
#ifdef CONFIG_64BIT
2212
LOAD_ACQUIRE(DW, u64)
2213
#endif
2214
#undef LOAD_ACQUIRE
2215
default:
2216
goto default_label;
2217
}
2218
break;
2219
case BPF_STORE_REL:
2220
switch (BPF_SIZE(insn->code)) {
2221
#define STORE_RELEASE(SIZEOP, SIZE) \
2222
case BPF_##SIZEOP: \
2223
smp_store_release( \
2224
(SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \
2225
break;
2226
STORE_RELEASE(B, u8)
2227
STORE_RELEASE(H, u16)
2228
STORE_RELEASE(W, u32)
2229
#ifdef CONFIG_64BIT
2230
STORE_RELEASE(DW, u64)
2231
#endif
2232
#undef STORE_RELEASE
2233
default:
2234
goto default_label;
2235
}
2236
break;
2237
2238
default:
2239
goto default_label;
2240
}
2241
CONT;
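/* Note: BPF_LOAD_ACQ/BPF_STORE_REL map directly onto
* smp_load_acquire()/smp_store_release() and, unlike the
* read-modify-write ops, accept all size modifiers; the DW forms are
* only built on 64-bit hosts, per the CONFIG_64BIT guards above.
*/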
2242
2243
default_label:
2244
/* If we ever reach this, we have a bug somewhere. Die hard here
2245
* instead of just returning 0; we could be somewhere in a subprog,
2246
* so execution could continue otherwise which we do /not/ want.
2247
*
2248
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
2249
*/
2250
pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
2251
insn->code, insn->imm);
2252
BUG_ON(1);
2253
return 0;
2254
}
2255
2256
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
2257
#define DEFINE_BPF_PROG_RUN(stack_size) \
2258
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
2259
{ \
2260
u64 stack[stack_size / sizeof(u64)]; \
2261
u64 regs[MAX_BPF_EXT_REG] = {}; \
2262
\
2263
kmsan_unpoison_memory(stack, sizeof(stack)); \
2264
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
2265
ARG1 = (u64) (unsigned long) ctx; \
2266
return ___bpf_prog_run(regs, insn); \
2267
}
2268
2269
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
2270
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
2271
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
2272
const struct bpf_insn *insn) \
2273
{ \
2274
u64 stack[stack_size / sizeof(u64)]; \
2275
u64 regs[MAX_BPF_EXT_REG]; \
2276
\
2277
kmsan_unpoison_memory(stack, sizeof(stack)); \
2278
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
2279
BPF_R1 = r1; \
2280
BPF_R2 = r2; \
2281
BPF_R3 = r3; \
2282
BPF_R4 = r4; \
2283
BPF_R5 = r5; \
2284
return ___bpf_prog_run(regs, insn); \
2285
}
2286
2287
#define EVAL1(FN, X) FN(X)
2288
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
2289
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
2290
#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
2291
#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
2292
#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
2293
2294
EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
2295
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
2296
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
2297
2298
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
2299
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
2300
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
2301
2302
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
2303
2304
static unsigned int (*interpreters[])(const void *ctx,
2305
const struct bpf_insn *insn) = {
2306
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
2307
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
2308
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
2309
};
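/* The interpreter is specialized per stack size in 32-byte steps from
* 32 up to 512 bytes; bpf_prog_select_interpreter() below picks the
* matching entry via round_up(stack_depth, 32) / 32 - 1, so a program
* only gets as much interpreter stack as it actually needs.
*/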
2310
#undef PROG_NAME_LIST
2311
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
2312
static __maybe_unused
2313
u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
2314
const struct bpf_insn *insn) = {
2315
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
2316
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
2317
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
2318
};
2319
#undef PROG_NAME_LIST
2320
2321
#ifdef CONFIG_BPF_SYSCALL
2322
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
2323
{
2324
stack_depth = max_t(u32, stack_depth, 1);
2325
insn->off = (s16) insn->imm;
2326
insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
2327
__bpf_call_base_args;
2328
insn->code = BPF_JMP | BPF_CALL_ARGS;
2329
}
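/* Note: bpf_patch_call_args() converts a BPF-to-BPF pseudo call for
* interpreter-only execution: the original jump distance is stashed in
* insn->off and insn->imm becomes the offset of the matching
* interpreters_args[] entry relative to __bpf_call_base_args, which the
* JMP_CALL_ARGS handler above undoes at run time.
*/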
2330
#endif
2331
#endif
2332
2333
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
2334
const struct bpf_insn *insn)
2335
{
2336
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
2337
* is not working properly, so warn about it!
2338
*/
2339
WARN_ON_ONCE(1);
2340
return 0;
2341
}
2342
2343
static bool __bpf_prog_map_compatible(struct bpf_map *map,
2344
const struct bpf_prog *fp)
2345
{
2346
enum bpf_prog_type prog_type = resolve_prog_type(fp);
2347
struct bpf_prog_aux *aux = fp->aux;
2348
enum bpf_cgroup_storage_type i;
2349
bool ret = false;
2350
u64 cookie;
2351
2352
if (fp->kprobe_override)
2353
return ret;
2354
2355
spin_lock(&map->owner_lock);
2356
/* There's no owner yet where we could check for compatibility. */
2357
if (!map->owner) {
2358
map->owner = bpf_map_owner_alloc(map);
2359
if (!map->owner)
2360
goto err;
2361
map->owner->type = prog_type;
2362
map->owner->jited = fp->jited;
2363
map->owner->xdp_has_frags = aux->xdp_has_frags;
2364
map->owner->expected_attach_type = fp->expected_attach_type;
2365
map->owner->attach_func_proto = aux->attach_func_proto;
2366
for_each_cgroup_storage_type(i) {
2367
map->owner->storage_cookie[i] =
2368
aux->cgroup_storage[i] ?
2369
aux->cgroup_storage[i]->cookie : 0;
2370
}
2371
ret = true;
2372
} else {
2373
ret = map->owner->type == prog_type &&
2374
map->owner->jited == fp->jited &&
2375
map->owner->xdp_has_frags == aux->xdp_has_frags;
2376
if (ret &&
2377
map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
2378
map->owner->expected_attach_type != fp->expected_attach_type)
2379
ret = false;
2380
for_each_cgroup_storage_type(i) {
2381
if (!ret)
2382
break;
2383
cookie = aux->cgroup_storage[i] ?
2384
aux->cgroup_storage[i]->cookie : 0;
2385
ret = map->owner->storage_cookie[i] == cookie ||
2386
!cookie;
2387
}
2388
if (ret &&
2389
map->owner->attach_func_proto != aux->attach_func_proto) {
2390
switch (prog_type) {
2391
case BPF_PROG_TYPE_TRACING:
2392
case BPF_PROG_TYPE_LSM:
2393
case BPF_PROG_TYPE_EXT:
2394
case BPF_PROG_TYPE_STRUCT_OPS:
2395
ret = false;
2396
break;
2397
default:
2398
break;
2399
}
2400
}
2401
}
2402
err:
2403
spin_unlock(&map->owner_lock);
2404
return ret;
2405
}
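/* Note: the first program inserted into a prog-holding map becomes the
* map's "owner" and records its type, JITed-ness, expected attach type
* and friends; every later insertion (and every tail-call user of the
* map, via bpf_check_tail_call() below) must match those properties, so
* incompatible programs can never end up in the same map.
*/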
2406
2407
bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
2408
{
2409
/* XDP programs inserted into maps are not guaranteed to run on
2410
* a particular netdev (and can run outside driver context entirely
2411
* in the case of devmap and cpumap). Until device checks
2412
* are implemented, prohibit adding dev-bound programs to program maps.
2413
*/
2414
if (bpf_prog_is_dev_bound(fp->aux))
2415
return false;
2416
2417
return __bpf_prog_map_compatible(map, fp);
2418
}
2419
2420
static int bpf_check_tail_call(const struct bpf_prog *fp)
2421
{
2422
struct bpf_prog_aux *aux = fp->aux;
2423
int i, ret = 0;
2424
2425
mutex_lock(&aux->used_maps_mutex);
2426
for (i = 0; i < aux->used_map_cnt; i++) {
2427
struct bpf_map *map = aux->used_maps[i];
2428
2429
if (!map_type_contains_progs(map))
2430
continue;
2431
2432
if (!__bpf_prog_map_compatible(map, fp)) {
2433
ret = -EINVAL;
2434
goto out;
2435
}
2436
}
2437
2438
out:
2439
mutex_unlock(&aux->used_maps_mutex);
2440
return ret;
2441
}
2442
2443
static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
2444
{
2445
bool select_interpreter = false;
2446
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
2447
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
2448
u32 idx = (round_up(stack_depth, 32) / 32) - 1;
2449
2450
/* may_goto may cause stack size > 512, leading to idx out-of-bounds.
2451
* Such programs cannot run in the interpreter and have to be JITed anyway,
2452
* so falling back to __bpf_prog_ret0_warn for an out-of-range idx is fine.
2453
*/
2454
if (idx < ARRAY_SIZE(interpreters)) {
2455
fp->bpf_func = interpreters[idx];
2456
select_interpreter = true;
2457
} else {
2458
fp->bpf_func = __bpf_prog_ret0_warn;
2459
}
2460
#else
2461
fp->bpf_func = __bpf_prog_ret0_warn;
2462
#endif
2463
return select_interpreter;
2464
}
2465
2466
/**
2467
* bpf_prog_select_runtime - select exec runtime for BPF program
2468
* @fp: bpf_prog populated with BPF program
2469
* @err: pointer to error variable
2470
*
2471
* Try to JIT eBPF program, if JIT is not available, use interpreter.
2472
* The BPF program will be executed via bpf_prog_run() function.
2473
*
2474
* Return: the &fp argument along with &err set to 0 for success or
2475
* a negative errno code on failure
2476
*/
2477
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
2478
{
2479
/* In case of BPF to BPF calls, verifier did all the prep
2480
* work with regard to JITing, etc.
2481
*/
2482
bool jit_needed = false;
2483
2484
if (fp->bpf_func)
2485
goto finalize;
2486
2487
if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
2488
bpf_prog_has_kfunc_call(fp))
2489
jit_needed = true;
2490
2491
if (!bpf_prog_select_interpreter(fp))
2492
jit_needed = true;
2493
2494
/* eBPF JITs can rewrite the program in case constant
2495
* blinding is active. However, in case of error during
2496
* blinding, bpf_int_jit_compile() must always return a
2497
* valid program, which in this case would simply not
2498
* be JITed, but falls back to the interpreter.
2499
*/
2500
if (!bpf_prog_is_offloaded(fp->aux)) {
2501
*err = bpf_prog_alloc_jited_linfo(fp);
2502
if (*err)
2503
return fp;
2504
2505
fp = bpf_int_jit_compile(fp);
2506
bpf_prog_jit_attempt_done(fp);
2507
if (!fp->jited && jit_needed) {
2508
*err = -ENOTSUPP;
2509
return fp;
2510
}
2511
} else {
2512
*err = bpf_prog_offload_compile(fp);
2513
if (*err)
2514
return fp;
2515
}
2516
2517
finalize:
2518
*err = bpf_prog_lock_ro(fp);
2519
if (*err)
2520
return fp;
2521
2522
/* The tail call compatibility check can only be done at
2523
* this late stage as we need to determine, if we deal
2524
* with JITed or non JITed program concatenations and not
2525
* all eBPF JITs might immediately support all features.
2526
*/
2527
*err = bpf_check_tail_call(fp);
2528
2529
return fp;
2530
}
2531
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
2532
2533
static unsigned int __bpf_prog_ret1(const void *ctx,
2534
const struct bpf_insn *insn)
2535
{
2536
return 1;
2537
}
2538
2539
static struct bpf_prog_dummy {
2540
struct bpf_prog prog;
2541
} dummy_bpf_prog = {
2542
.prog = {
2543
.bpf_func = __bpf_prog_ret1,
2544
},
2545
};
2546
2547
struct bpf_empty_prog_array bpf_empty_prog_array = {
2548
.null_prog = NULL,
2549
};
2550
EXPORT_SYMBOL(bpf_empty_prog_array);
2551
2552
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
2553
{
2554
struct bpf_prog_array *p;
2555
2556
if (prog_cnt)
2557
p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
2558
else
2559
p = &bpf_empty_prog_array.hdr;
2560
2561
return p;
2562
}
2563
2564
void bpf_prog_array_free(struct bpf_prog_array *progs)
2565
{
2566
if (!progs || progs == &bpf_empty_prog_array.hdr)
2567
return;
2568
kfree_rcu(progs, rcu);
2569
}
2570
2571
static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
2572
{
2573
struct bpf_prog_array *progs;
2574
2575
/* If RCU Tasks Trace grace period implies RCU grace period, there is
2576
* no need to call kfree_rcu(), just call kfree() directly.
2577
*/
2578
progs = container_of(rcu, struct bpf_prog_array, rcu);
2579
if (rcu_trace_implies_rcu_gp())
2580
kfree(progs);
2581
else
2582
kfree_rcu(progs, rcu);
2583
}
2584
2585
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
2586
{
2587
if (!progs || progs == &bpf_empty_prog_array.hdr)
2588
return;
2589
call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
2590
}
2591
2592
int bpf_prog_array_length(struct bpf_prog_array *array)
2593
{
2594
struct bpf_prog_array_item *item;
2595
u32 cnt = 0;
2596
2597
for (item = array->items; item->prog; item++)
2598
if (item->prog != &dummy_bpf_prog.prog)
2599
cnt++;
2600
return cnt;
2601
}
2602
2603
bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
2604
{
2605
struct bpf_prog_array_item *item;
2606
2607
for (item = array->items; item->prog; item++)
2608
if (item->prog != &dummy_bpf_prog.prog)
2609
return false;
2610
return true;
2611
}
2612
2613
static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
2614
u32 *prog_ids,
2615
u32 request_cnt)
2616
{
2617
struct bpf_prog_array_item *item;
2618
int i = 0;
2619
2620
for (item = array->items; item->prog; item++) {
2621
if (item->prog == &dummy_bpf_prog.prog)
2622
continue;
2623
prog_ids[i] = item->prog->aux->id;
2624
if (++i == request_cnt) {
2625
item++;
2626
break;
2627
}
2628
}
2629
2630
return !!(item->prog);
2631
}
2632
2633
int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
2634
__u32 __user *prog_ids, u32 cnt)
2635
{
2636
unsigned long err = 0;
2637
bool nospc;
2638
u32 *ids;
2639
2640
/* users of this function are doing:
2641
* cnt = bpf_prog_array_length();
2642
* if (cnt > 0)
2643
* bpf_prog_array_copy_to_user(..., cnt);
2644
* so below kcalloc doesn't need extra cnt > 0 check.
2645
*/
2646
ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
2647
if (!ids)
2648
return -ENOMEM;
2649
nospc = bpf_prog_array_copy_core(array, ids, cnt);
2650
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
2651
kfree(ids);
2652
if (err)
2653
return -EFAULT;
2654
if (nospc)
2655
return -ENOSPC;
2656
return 0;
2657
}
2658
2659
void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
2660
struct bpf_prog *old_prog)
2661
{
2662
struct bpf_prog_array_item *item;
2663
2664
for (item = array->items; item->prog; item++)
2665
if (item->prog == old_prog) {
2666
WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
2667
break;
2668
}
2669
}
2670
2671
/**
2672
* bpf_prog_array_delete_safe_at() - Replaces the program at the given
2673
* index into the program array with
2674
* a dummy no-op program.
2675
* @array: a bpf_prog_array
2676
* @index: the index of the program to replace
2677
*
2678
* Skips over dummy programs, by not counting them, when calculating
2679
* the position of the program to replace.
2680
*
2681
* Return:
2682
* * 0 - Success
2683
* * -EINVAL - Invalid index value. Must be a non-negative integer.
2684
* * -ENOENT - Index out of range
2685
*/
2686
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
2687
{
2688
return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
2689
}
2690
2691
/**
2692
* bpf_prog_array_update_at() - Updates the program at the given index
2693
* into the program array.
2694
* @array: a bpf_prog_array
2695
* @index: the index of the program to update
2696
* @prog: the program to insert into the array
2697
*
2698
* Skips over dummy programs, by not counting them, when calculating
2699
* the position of the program to update.
2700
*
2701
* Return:
2702
* * 0 - Success
2703
* * -EINVAL - Invalid index value. Must be a non-negative integer.
2704
* * -ENOENT - Index out of range
2705
*/
2706
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
2707
struct bpf_prog *prog)
2708
{
2709
struct bpf_prog_array_item *item;
2710
2711
if (unlikely(index < 0))
2712
return -EINVAL;
2713
2714
for (item = array->items; item->prog; item++) {
2715
if (item->prog == &dummy_bpf_prog.prog)
2716
continue;
2717
if (!index) {
2718
WRITE_ONCE(item->prog, prog);
2719
return 0;
2720
}
2721
index--;
2722
}
2723
return -ENOENT;
2724
}
2725
2726
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
2727
struct bpf_prog *exclude_prog,
2728
struct bpf_prog *include_prog,
2729
u64 bpf_cookie,
2730
struct bpf_prog_array **new_array)
2731
{
2732
int new_prog_cnt, carry_prog_cnt = 0;
2733
struct bpf_prog_array_item *existing, *new;
2734
struct bpf_prog_array *array;
2735
bool found_exclude = false;
2736
2737
/* Figure out how many existing progs we need to carry over to
2738
* the new array.
2739
*/
2740
if (old_array) {
2741
existing = old_array->items;
2742
for (; existing->prog; existing++) {
2743
if (existing->prog == exclude_prog) {
2744
found_exclude = true;
2745
continue;
2746
}
2747
if (existing->prog != &dummy_bpf_prog.prog)
2748
carry_prog_cnt++;
2749
if (existing->prog == include_prog)
2750
return -EEXIST;
2751
}
2752
}
2753
2754
if (exclude_prog && !found_exclude)
2755
return -ENOENT;
2756
2757
/* How many progs (not NULL) will be in the new array? */
2758
new_prog_cnt = carry_prog_cnt;
2759
if (include_prog)
2760
new_prog_cnt += 1;
2761
2762
/* Do we have any prog (not NULL) in the new array? */
2763
if (!new_prog_cnt) {
2764
*new_array = NULL;
2765
return 0;
2766
}
2767
2768
/* +1 as the end of prog_array is marked with NULL */
2769
array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
2770
if (!array)
2771
return -ENOMEM;
2772
new = array->items;
2773
2774
/* Fill in the new prog array */
2775
if (carry_prog_cnt) {
2776
existing = old_array->items;
2777
for (; existing->prog; existing++) {
2778
if (existing->prog == exclude_prog ||
2779
existing->prog == &dummy_bpf_prog.prog)
2780
continue;
2781
2782
new->prog = existing->prog;
2783
new->bpf_cookie = existing->bpf_cookie;
2784
new++;
2785
}
2786
}
2787
if (include_prog) {
2788
new->prog = include_prog;
2789
new->bpf_cookie = bpf_cookie;
2790
new++;
2791
}
2792
new->prog = NULL;
2793
*new_array = array;
2794
return 0;
2795
}
2796
2797
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
2798
u32 *prog_ids, u32 request_cnt,
2799
u32 *prog_cnt)
2800
{
2801
u32 cnt = 0;
2802
2803
if (array)
2804
cnt = bpf_prog_array_length(array);
2805
2806
*prog_cnt = cnt;
2807
2808
/* return early if user requested only program count or nothing to copy */
2809
if (!request_cnt || !cnt)
2810
return 0;
2811
2812
/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
2813
return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
2814
: 0;
2815
}
2816
2817
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
2818
struct bpf_map **used_maps, u32 len)
2819
{
2820
struct bpf_map *map;
2821
bool sleepable;
2822
u32 i;
2823
2824
sleepable = aux->prog->sleepable;
2825
for (i = 0; i < len; i++) {
2826
map = used_maps[i];
2827
if (map->ops->map_poke_untrack)
2828
map->ops->map_poke_untrack(map, aux);
2829
if (sleepable)
2830
atomic64_dec(&map->sleepable_refcnt);
2831
bpf_map_put(map);
2832
}
2833
}
2834
2835
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
2836
{
2837
__bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
2838
kfree(aux->used_maps);
2839
}
2840
2841
void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
2842
{
2843
#ifdef CONFIG_BPF_SYSCALL
2844
struct btf_mod_pair *btf_mod;
2845
u32 i;
2846
2847
for (i = 0; i < len; i++) {
2848
btf_mod = &used_btfs[i];
2849
if (btf_mod->module)
2850
module_put(btf_mod->module);
2851
btf_put(btf_mod->btf);
2852
}
2853
#endif
2854
}
2855
2856
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
2857
{
2858
__bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
2859
kfree(aux->used_btfs);
2860
}
2861
2862
static void bpf_prog_free_deferred(struct work_struct *work)
2863
{
2864
struct bpf_prog_aux *aux;
2865
int i;
2866
2867
aux = container_of(work, struct bpf_prog_aux, work);
2868
#ifdef CONFIG_BPF_SYSCALL
2869
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
2870
bpf_prog_stream_free(aux->prog);
2871
#endif
2872
#ifdef CONFIG_CGROUP_BPF
2873
if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
2874
bpf_cgroup_atype_put(aux->cgroup_atype);
2875
#endif
2876
bpf_free_used_maps(aux);
2877
bpf_free_used_btfs(aux);
2878
if (bpf_prog_is_dev_bound(aux))
2879
bpf_prog_dev_bound_destroy(aux->prog);
2880
#ifdef CONFIG_PERF_EVENTS
2881
if (aux->prog->has_callchain_buf)
2882
put_callchain_buffers();
2883
#endif
2884
if (aux->dst_trampoline)
2885
bpf_trampoline_put(aux->dst_trampoline);
2886
for (i = 0; i < aux->real_func_cnt; i++) {
2887
/* We can just unlink the subprog poke descriptor table as
2888
* it was originally linked to the main program and is also
2889
* released along with it.
2890
*/
2891
aux->func[i]->aux->poke_tab = NULL;
2892
bpf_jit_free(aux->func[i]);
2893
}
2894
if (aux->real_func_cnt) {
2895
kfree(aux->func);
2896
bpf_prog_unlock_free(aux->prog);
2897
} else {
2898
bpf_jit_free(aux->prog);
2899
}
2900
}
2901
2902
void bpf_prog_free(struct bpf_prog *fp)
2903
{
2904
struct bpf_prog_aux *aux = fp->aux;
2905
2906
if (aux->dst_prog)
2907
bpf_prog_put(aux->dst_prog);
2908
bpf_token_put(aux->token);
2909
INIT_WORK(&aux->work, bpf_prog_free_deferred);
2910
schedule_work(&aux->work);
2911
}
2912
EXPORT_SYMBOL_GPL(bpf_prog_free);
2913
2914
/* RNG for unprivileged user space with separated state from prandom_u32(). */
2915
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
2916
2917
void bpf_user_rnd_init_once(void)
2918
{
2919
prandom_init_once(&bpf_user_rnd_state);
2920
}
2921
2922
BPF_CALL_0(bpf_user_rnd_u32)
2923
{
2924
/* Should someone ever have the rather unwise idea to use some
2925
* of the registers passed into this function, then note that
2926
* this function is called from native eBPF and classic-to-eBPF
2927
* transformations. Register assignments from both sides are
2928
* different, e.g. classic always sets fn(ctx, A, X) here.
2929
*/
2930
struct rnd_state *state;
2931
u32 res;
2932
2933
state = &get_cpu_var(bpf_user_rnd_state);
2934
res = prandom_u32_state(state);
2935
put_cpu_var(bpf_user_rnd_state);
2936
2937
return res;
2938
}
2939
2940
BPF_CALL_0(bpf_get_raw_cpu_id)
2941
{
2942
return raw_smp_processor_id();
2943
}
2944
2945
/* Weak definitions of helper functions in case we don't have bpf syscall. */
2946
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
2947
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
2948
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
2949
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
2950
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
2951
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
2952
const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
2953
const struct bpf_func_proto bpf_spin_lock_proto __weak;
2954
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
2955
const struct bpf_func_proto bpf_jiffies64_proto __weak;
2956
2957
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
2958
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
2959
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
2960
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
2961
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
2962
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
2963
const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
2964
2965
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
2966
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
2967
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
2968
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
2969
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
2970
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
2971
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
2972
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
2973
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
2974
const struct bpf_func_proto bpf_set_retval_proto __weak;
2975
const struct bpf_func_proto bpf_get_retval_proto __weak;
2976
2977
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
2978
{
2979
return NULL;
2980
}
2981
2982
const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
2983
{
2984
return NULL;
2985
}
2986
2987
const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
2988
{
2989
return NULL;
2990
}
2991
2992
u64 __weak
2993
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
2994
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
2995
{
2996
return -ENOTSUPP;
2997
}
2998
EXPORT_SYMBOL_GPL(bpf_event_output);
2999
3000
/* Always built-in helper functions. */
3001
const struct bpf_func_proto bpf_tail_call_proto = {
3002
/* func is unused for tail_call, we set it to pass the
3003
* get_helper_proto check
3004
*/
3005
.func = BPF_PTR_POISON,
3006
.gpl_only = false,
3007
.ret_type = RET_VOID,
3008
.arg1_type = ARG_PTR_TO_CTX,
3009
.arg2_type = ARG_CONST_MAP_PTR,
3010
.arg3_type = ARG_ANYTHING,
3011
};
3012
3013
/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
3014
* It is encouraged to implement bpf_int_jit_compile() instead, so that
3015
* eBPF and implicitly also cBPF can get JITed!
3016
*/
3017
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
3018
{
3019
return prog;
3020
}
3021
3022
/* Stub for JITs that support eBPF. All cBPF code gets transformed into
3023
* eBPF by the kernel and is later compiled by bpf_int_jit_compile().
3024
*/
3025
void __weak bpf_jit_compile(struct bpf_prog *prog)
3026
{
3027
}
3028
3029
bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
3030
{
3031
return false;
3032
}
3033
3034
/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
3035
* analysis code and wants explicit zero extension inserted by verifier.
3036
* Otherwise, return FALSE.
3037
*
3038
* The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
3039
* you don't override this. JITs that don't want these extra insns can detect
3040
* them using insn_is_zext.
3041
*/
3042
bool __weak bpf_jit_needs_zext(void)
3043
{
3044
return false;
3045
}
3046
3047
/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
3048
* all archs. The value returned must not change at runtime as there is
3049
* currently no support for reloading programs that were loaded without
3050
* mitigations.
3051
*/
3052
bool __weak bpf_jit_bypass_spec_v1(void)
3053
{
3054
return false;
3055
}
3056
3057
bool __weak bpf_jit_bypass_spec_v4(void)
3058
{
3059
return false;
3060
}
3061
3062
/* Return true if the JIT inlines the call to the helper corresponding to
3063
* the imm.
3064
*
3065
* The verifier will not patch the insn->imm for the call to the helper if
3066
* this returns true.
3067
*/
3068
bool __weak bpf_jit_inlines_helper_call(s32 imm)
3069
{
3070
return false;
3071
}
3072
3073
/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
3074
bool __weak bpf_jit_supports_subprog_tailcalls(void)
3075
{
3076
return false;
3077
}
3078
3079
bool __weak bpf_jit_supports_percpu_insn(void)
3080
{
3081
return false;
3082
}
3083
3084
bool __weak bpf_jit_supports_kfunc_call(void)
3085
{
3086
return false;
3087
}
3088
3089
bool __weak bpf_jit_supports_far_kfunc_call(void)
3090
{
3091
return false;
3092
}
3093
3094
bool __weak bpf_jit_supports_arena(void)
3095
{
3096
return false;
3097
}
3098
3099
bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
3100
{
3101
return false;
3102
}
3103
3104
u64 __weak bpf_arch_uaddress_limit(void)
3105
{
3106
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
3107
return TASK_SIZE;
3108
#else
3109
return 0;
3110
#endif
3111
}
3112
3113
/* Return TRUE if the JIT backend satisfies the following two conditions:
3114
* 1) JIT backend supports atomic_xchg() on pointer-sized words.
3115
* 2) Under the specific arch, the implementation of xchg() is the same
3116
* as atomic_xchg() on pointer-sized words.
3117
*/
3118
bool __weak bpf_jit_supports_ptr_xchg(void)
3119
{
3120
return false;
3121
}
3122
3123
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
3124
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
3125
*/
3126
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
3127
int len)
3128
{
3129
return -EFAULT;
3130
}
3131
3132
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
3133
void *addr1, void *addr2)
3134
{
3135
return -ENOTSUPP;
3136
}
3137
3138
void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
3139
{
3140
return ERR_PTR(-ENOTSUPP);
3141
}
3142
3143
int __weak bpf_arch_text_invalidate(void *dst, size_t len)
3144
{
3145
return -ENOTSUPP;
3146
}
3147
3148
bool __weak bpf_jit_supports_exceptions(void)
3149
{
3150
return false;
3151
}
3152
3153
bool __weak bpf_jit_supports_private_stack(void)
3154
{
3155
return false;
3156
}
3157
3158
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
3159
{
3160
}
3161
3162
bool __weak bpf_jit_supports_timed_may_goto(void)
3163
{
3164
return false;
3165
}
3166
3167
u64 __weak arch_bpf_timed_may_goto(void)
3168
{
3169
return 0;
3170
}
3171
3172
static noinline void bpf_prog_report_may_goto_violation(void)
3173
{
3174
#ifdef CONFIG_BPF_SYSCALL
3175
struct bpf_stream_stage ss;
3176
struct bpf_prog *prog;
3177
3178
prog = bpf_prog_find_from_stack();
3179
if (!prog)
3180
return;
3181
bpf_stream_stage(ss, prog, BPF_STDERR, ({
3182
bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n");
3183
bpf_stream_dump_stack(ss);
3184
}));
3185
#endif
3186
}
3187
3188
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
3189
{
3190
u64 time = ktime_get_mono_fast_ns();
3191
3192
/* Populate the timestamp for this stack frame, and refresh count. */
3193
if (!p->timestamp) {
3194
p->timestamp = time;
3195
return BPF_MAX_TIMED_LOOPS;
3196
}
3197
/* Check if we've exhausted our time slice, and zero count. */
3198
if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) {
3199
bpf_prog_report_may_goto_violation();
3200
return 0;
3201
}
3202
/* Refresh the count for the stack frame. */
3203
return BPF_MAX_TIMED_LOOPS;
3204
}
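/* Note: each may_goto-instrumented stack frame gets a fresh budget of
* BPF_MAX_TIMED_LOOPS iterations, and once roughly a quarter of a
* second (NSEC_PER_SEC / 4) has passed since the frame's first check,
* the count is forced to zero, which terminates the loop and reports
* the timeout on the program's BPF_STDERR stream.
*/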
3205
3206
/* Weak stubs used when the arena map is not built in (configs without MMU or on 32-bit). */
3207
__weak const struct bpf_map_ops arena_map_ops;
3208
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
3209
{
3210
return 0;
3211
}
3212
__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
3213
{
3214
return 0;
3215
}
3216
3217
#ifdef CONFIG_BPF_SYSCALL
3218
static int __init bpf_global_ma_init(void)
3219
{
3220
int ret;
3221
3222
ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
3223
bpf_global_ma_set = !ret;
3224
return ret;
3225
}
3226
late_initcall(bpf_global_ma_init);
3227
#endif
3228
3229
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
3230
EXPORT_SYMBOL(bpf_stats_enabled_key);
3231
3232
/* All definitions of tracepoints related to BPF. */
3233
#define CREATE_TRACE_POINTS
3234
#include <linux/bpf_trace.h>
3235
3236
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
3237
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
3238
3239
#ifdef CONFIG_BPF_SYSCALL
3240
3241
int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
3242
const char **linep, int *nump)
3243
{
3244
int idx = -1, insn_start, insn_end, len;
3245
struct bpf_line_info *linfo;
3246
void **jited_linfo;
3247
struct btf *btf;
3248
int nr_linfo;
3249
3250
btf = prog->aux->btf;
3251
linfo = prog->aux->linfo;
3252
jited_linfo = prog->aux->jited_linfo;
3253
3254
if (!btf || !linfo || !jited_linfo)
3255
return -EINVAL;
3256
len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;
3257
3258
linfo = &prog->aux->linfo[prog->aux->linfo_idx];
3259
jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
3260
3261
insn_start = linfo[0].insn_off;
3262
insn_end = insn_start + len;
3263
nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx;
3264
3265
for (int i = 0; i < nr_linfo &&
3266
linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
3267
if (jited_linfo[i] >= (void *)ip)
3268
break;
3269
idx = i;
3270
}
3271
3272
if (idx == -1)
3273
return -ENOENT;
3274
3275
/* Get base component of the file path. */
3276
*filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
3277
*filep = kbasename(*filep);
3278
/* Obtain the source line and strip its leading whitespace. */
3279
*linep = btf_name_by_offset(btf, linfo[idx].line_off);
3280
while (isspace(**linep))
3281
*linep += 1;
3282
*nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
3283
return 0;
3284
}
3285
3286
struct walk_stack_ctx {
3287
struct bpf_prog *prog;
3288
};
3289
3290
static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
3291
{
3292
struct walk_stack_ctx *ctxp = cookie;
3293
struct bpf_prog *prog;
3294
3295
/*
3296
* The RCU read lock is held to safely traverse the latch tree, but we
3297
* don't need its protection when accessing the prog, since it has an
3298
* active stack frame on the current stack trace, and won't disappear.
3299
*/
3300
rcu_read_lock();
3301
prog = bpf_prog_ksym_find(ip);
3302
rcu_read_unlock();
3303
if (!prog)
3304
return true;
3305
/* Make sure we return the main prog if we found a subprog */
3306
ctxp->prog = prog->aux->main_prog_aux->prog;
3307
return false;
3308
}
3309
3310
struct bpf_prog *bpf_prog_find_from_stack(void)
3311
{
3312
struct walk_stack_ctx ctx = {};
3313
3314
arch_bpf_stack_walk(find_from_stack_cb, &ctx);
3315
return ctx.prog;
3316
}
3317
3318
#endif
3319
3320