GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/cgroup.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Functions to manage eBPF programs attached to cgroups
4
*
5
* Copyright (c) 2016 Daniel Mack
6
*/
7
8
#include <linux/kernel.h>
9
#include <linux/atomic.h>
10
#include <linux/cgroup.h>
11
#include <linux/filter.h>
12
#include <linux/slab.h>
13
#include <linux/sysctl.h>
14
#include <linux/string.h>
15
#include <linux/bpf.h>
16
#include <linux/bpf-cgroup.h>
17
#include <linux/bpf_lsm.h>
18
#include <linux/bpf_verifier.h>
19
#include <net/sock.h>
20
#include <net/bpf_sk_storage.h>
21
22
#include "../cgroup/cgroup-internal.h"
23
24
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
25
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
26
27
/*
28
* cgroup bpf destruction makes heavy use of work items and there can be a lot
29
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
30
* destruction work items don't end up filling up max_active of system_percpu_wq
31
* which may lead to deadlock.
32
*/
33
static struct workqueue_struct *cgroup_bpf_destroy_wq;
34
35
static int __init cgroup_bpf_wq_init(void)
36
{
37
cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
38
WQ_PERCPU, 1);
39
if (!cgroup_bpf_destroy_wq)
40
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
41
return 0;
42
}
43
core_initcall(cgroup_bpf_wq_init);
44
45
static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
46
unsigned long action, void *data);
47
48
static struct notifier_block cgroup_bpf_lifetime_nb = {
49
.notifier_call = cgroup_bpf_lifetime_notify,
50
};
51
52
void __init cgroup_bpf_lifetime_notifier_init(void)
53
{
54
BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
55
&cgroup_bpf_lifetime_nb));
56
}
57
58
/* __always_inline is necessary to prevent indirect call through run_prog
59
* function pointer.
60
*/
61
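/* Run every program in the effective array for @atype.  Each program's
 * lowest return bit decides success: a 0 there turns run_ctx.retval into
 * -EPERM unless a program already stored an explicit -errno via
 * bpf_set_retval().  When @ret_flags is non-NULL, the remaining high bits
 * of every program's return value are OR'ed into it (e.g. BPF_RET_SET_CN
 * on the egress path).
 */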
static __always_inline int
62
bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
63
enum cgroup_bpf_attach_type atype,
64
const void *ctx, bpf_prog_run_fn run_prog,
65
int retval, u32 *ret_flags)
66
{
67
const struct bpf_prog_array_item *item;
68
const struct bpf_prog *prog;
69
const struct bpf_prog_array *array;
70
struct bpf_run_ctx *old_run_ctx;
71
struct bpf_cg_run_ctx run_ctx;
72
u32 func_ret;
73
74
run_ctx.retval = retval;
75
rcu_read_lock_dont_migrate();
76
array = rcu_dereference(cgrp->effective[atype]);
77
item = &array->items[0];
78
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
79
while ((prog = READ_ONCE(item->prog))) {
80
run_ctx.prog_item = item;
81
func_ret = run_prog(prog, ctx);
82
if (ret_flags) {
83
*(ret_flags) |= (func_ret >> 1);
84
func_ret &= 1;
85
}
86
if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
87
run_ctx.retval = -EPERM;
88
item++;
89
}
90
bpf_reset_run_ctx(old_run_ctx);
91
rcu_read_unlock_migrate();
92
return run_ctx.retval;
93
}
94
95
unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
96
const struct bpf_insn *insn)
97
{
98
const struct bpf_prog *shim_prog;
99
struct sock *sk;
100
struct cgroup *cgrp;
101
int ret = 0;
102
u64 *args;
103
104
args = (u64 *)ctx;
105
sk = (void *)(unsigned long)args[0];
106
/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
107
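/* Open-coded equivalent of the container_of() above: recover the
 * enclosing bpf_prog from its embedded insnsi[] member.  The same
 * pattern is used by the other two LSM shim runners below.
 */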
shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
108
109
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
110
if (likely(cgrp))
111
ret = bpf_prog_run_array_cg(&cgrp->bpf,
112
shim_prog->aux->cgroup_atype,
113
ctx, bpf_prog_run, 0, NULL);
114
return ret;
115
}
116
117
unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
118
const struct bpf_insn *insn)
119
{
120
const struct bpf_prog *shim_prog;
121
struct socket *sock;
122
struct cgroup *cgrp;
123
int ret = 0;
124
u64 *args;
125
126
args = (u64 *)ctx;
127
sock = (void *)(unsigned long)args[0];
128
/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
129
shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
130
131
cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
132
if (likely(cgrp))
133
ret = bpf_prog_run_array_cg(&cgrp->bpf,
134
shim_prog->aux->cgroup_atype,
135
ctx, bpf_prog_run, 0, NULL);
136
return ret;
137
}
138
139
unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
140
const struct bpf_insn *insn)
141
{
142
const struct bpf_prog *shim_prog;
143
struct cgroup *cgrp;
144
int ret = 0;
145
146
/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
147
shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
148
149
/* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
150
cgrp = task_dfl_cgroup(current);
151
if (likely(cgrp))
152
ret = bpf_prog_run_array_cg(&cgrp->bpf,
153
shim_prog->aux->cgroup_atype,
154
ctx, bpf_prog_run, 0, NULL);
155
return ret;
156
}
157
158
#ifdef CONFIG_BPF_LSM
159
struct cgroup_lsm_atype {
160
u32 attach_btf_id;
161
int refcnt;
162
};
163
164
static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
165
166
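/* Map an attach type (plus BTF hook id for BPF_LSM_CGROUP) to a slot in
 * the per-cgroup arrays.  Non-LSM types translate directly; LSM hooks
 * share the CGROUP_LSM_START..CGROUP_LSM_END slots, reusing a slot that
 * already holds the same attach_btf_id or claiming the first free one,
 * and -E2BIG is returned when all CGROUP_LSM_NUM slots are in use.
 */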
static enum cgroup_bpf_attach_type
167
bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
168
{
169
int i;
170
171
lockdep_assert_held(&cgroup_mutex);
172
173
if (attach_type != BPF_LSM_CGROUP)
174
return to_cgroup_bpf_attach_type(attach_type);
175
176
for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
177
if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
178
return CGROUP_LSM_START + i;
179
180
for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
181
if (cgroup_lsm_atype[i].attach_btf_id == 0)
182
return CGROUP_LSM_START + i;
183
184
return -E2BIG;
185
186
}
187
188
void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
189
{
190
int i = cgroup_atype - CGROUP_LSM_START;
191
192
lockdep_assert_held(&cgroup_mutex);
193
194
WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
195
cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
196
197
cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
198
cgroup_lsm_atype[i].refcnt++;
199
}
200
201
void bpf_cgroup_atype_put(int cgroup_atype)
202
{
203
int i = cgroup_atype - CGROUP_LSM_START;
204
205
cgroup_lock();
206
if (--cgroup_lsm_atype[i].refcnt <= 0)
207
cgroup_lsm_atype[i].attach_btf_id = 0;
208
WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
209
cgroup_unlock();
210
}
211
#else
212
static enum cgroup_bpf_attach_type
213
bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
214
{
215
if (attach_type != BPF_LSM_CGROUP)
216
return to_cgroup_bpf_attach_type(attach_type);
217
return -EOPNOTSUPP;
218
}
219
#endif /* CONFIG_BPF_LSM */
220
221
static void cgroup_bpf_offline(struct cgroup *cgrp)
222
{
223
cgroup_get(cgrp);
224
percpu_ref_kill(&cgrp->bpf.refcnt);
225
}
226
227
static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
228
{
229
enum bpf_cgroup_storage_type stype;
230
231
for_each_cgroup_storage_type(stype)
232
bpf_cgroup_storage_free(storages[stype]);
233
}
234
235
static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
236
struct bpf_cgroup_storage *new_storages[],
237
enum bpf_attach_type type,
238
struct bpf_prog *prog,
239
struct cgroup *cgrp)
240
{
241
enum bpf_cgroup_storage_type stype;
242
struct bpf_cgroup_storage_key key;
243
struct bpf_map *map;
244
245
key.cgroup_inode_id = cgroup_id(cgrp);
246
key.attach_type = type;
247
248
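/* For each storage type the program uses, reuse an existing storage for
 * this (cgroup, attach type) pair if one is already linked; otherwise
 * allocate a fresh one and remember it in new_storages so it can be
 * freed if attaching fails later.
 */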
for_each_cgroup_storage_type(stype) {
249
map = prog->aux->cgroup_storage[stype];
250
if (!map)
251
continue;
252
253
storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
254
if (storages[stype])
255
continue;
256
257
storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
258
if (IS_ERR(storages[stype])) {
259
bpf_cgroup_storages_free(new_storages);
260
return -ENOMEM;
261
}
262
263
new_storages[stype] = storages[stype];
264
}
265
266
return 0;
267
}
268
269
static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
270
struct bpf_cgroup_storage *src[])
271
{
272
enum bpf_cgroup_storage_type stype;
273
274
for_each_cgroup_storage_type(stype)
275
dst[stype] = src[stype];
276
}
277
278
static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
279
struct cgroup *cgrp,
280
enum bpf_attach_type attach_type)
281
{
282
enum bpf_cgroup_storage_type stype;
283
284
for_each_cgroup_storage_type(stype)
285
bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
286
}
287
288
/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
289
* It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
290
* doesn't free link memory, which will eventually be done by bpf_link's
291
* release() callback, when its last FD is closed.
292
*/
293
static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
294
{
295
cgroup_put(link->cgroup);
296
link->cgroup = NULL;
297
}
298
299
/**
300
* cgroup_bpf_release() - put references of all bpf programs and
301
* release all cgroup bpf data
302
* @work: work structure embedded into the cgroup to modify
303
*/
304
static void cgroup_bpf_release(struct work_struct *work)
305
{
306
struct cgroup *p, *cgrp = container_of(work, struct cgroup,
307
bpf.release_work);
308
struct bpf_prog_array *old_array;
309
struct list_head *storages = &cgrp->bpf.storages;
310
struct bpf_cgroup_storage *storage, *stmp;
311
312
unsigned int atype;
313
314
cgroup_lock();
315
316
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
317
struct hlist_head *progs = &cgrp->bpf.progs[atype];
318
struct bpf_prog_list *pl;
319
struct hlist_node *pltmp;
320
321
hlist_for_each_entry_safe(pl, pltmp, progs, node) {
322
hlist_del(&pl->node);
323
if (pl->prog) {
324
if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
325
bpf_trampoline_unlink_cgroup_shim(pl->prog);
326
bpf_prog_put(pl->prog);
327
}
328
if (pl->link) {
329
if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
330
bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
331
bpf_cgroup_link_auto_detach(pl->link);
332
}
333
kfree(pl);
334
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
335
}
336
old_array = rcu_dereference_protected(
337
cgrp->bpf.effective[atype],
338
lockdep_is_held(&cgroup_mutex));
339
bpf_prog_array_free(old_array);
340
}
341
342
list_for_each_entry_safe(storage, stmp, storages, list_cg) {
343
bpf_cgroup_storage_unlink(storage);
344
bpf_cgroup_storage_free(storage);
345
}
346
347
cgroup_unlock();
348
349
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
350
cgroup_bpf_put(p);
351
352
percpu_ref_exit(&cgrp->bpf.refcnt);
353
cgroup_put(cgrp);
354
}
355
356
/**
357
* cgroup_bpf_release_fn() - callback used to schedule releasing
358
* of bpf cgroup data
359
* @ref: percpu ref counter structure
360
*/
361
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
362
{
363
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
364
365
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
366
queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
367
}
368
369
/* Get underlying bpf_prog of bpf_prog_list entry, whether it's through
370
* link or direct prog.
371
*/
372
static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
373
{
374
if (pl->prog)
375
return pl->prog;
376
if (pl->link)
377
return pl->link->link.prog;
378
return NULL;
379
}
380
381
/* count number of elements in the list.
382
* it's slow but the list cannot be long
383
*/
384
static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
385
{
386
struct bpf_prog_list *pl;
387
u32 cnt = 0;
388
389
hlist_for_each_entry(pl, head, node) {
390
if (!prog_list_prog(pl))
391
continue;
392
if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
393
(*preorder_cnt)++;
394
cnt++;
395
}
396
return cnt;
397
}
398
399
/* if parent has non-overridable prog attached,
400
* disallow attaching new programs to the descendant cgroup.
401
* if parent has overridable or multi-prog, allow attaching
402
*/
403
static bool hierarchy_allows_attach(struct cgroup *cgrp,
404
enum cgroup_bpf_attach_type atype)
405
{
406
struct cgroup *p;
407
408
p = cgroup_parent(cgrp);
409
if (!p)
410
return true;
411
do {
412
u32 flags = p->bpf.flags[atype];
413
u32 cnt;
414
415
if (flags & BPF_F_ALLOW_MULTI)
416
return true;
417
cnt = prog_list_length(&p->bpf.progs[atype], NULL);
418
WARN_ON_ONCE(cnt > 1);
419
if (cnt == 1)
420
return !!(flags & BPF_F_ALLOW_OVERRIDE);
421
p = cgroup_parent(p);
422
} while (p);
423
return true;
424
}
425
426
/* compute a chain of effective programs for a given cgroup:
427
* start from the list of programs in this cgroup and add
428
* all parent programs.
429
* Note that parent's F_ALLOW_OVERRIDE-type program is yielding
430
* to programs in this cgroup
431
*/
432
static int compute_effective_progs(struct cgroup *cgrp,
433
enum cgroup_bpf_attach_type atype,
434
struct bpf_prog_array **array)
435
{
436
struct bpf_prog_array_item *item;
437
struct bpf_prog_array *progs;
438
struct bpf_prog_list *pl;
439
struct cgroup *p = cgrp;
440
int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
441
442
/* count number of effective programs by walking parents */
443
do {
444
if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
445
cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
446
p = cgroup_parent(p);
447
} while (p);
448
449
progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
450
if (!progs)
451
return -ENOMEM;
452
453
/* populate the array with effective progs */
454
cnt = 0;
455
p = cgrp;
456
fstart = preorder_cnt;
457
bstart = preorder_cnt - 1;
458
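/* BPF_F_PREORDER programs are filled backwards from index
 * preorder_cnt - 1, while the remaining programs are appended from
 * index preorder_cnt; the swap loop below then restores attachment
 * order within each cgroup level.
 */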
do {
459
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
460
continue;
461
462
init_bstart = bstart;
463
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
464
if (!prog_list_prog(pl))
465
continue;
466
467
if (pl->flags & BPF_F_PREORDER) {
468
item = &progs->items[bstart];
469
bstart--;
470
} else {
471
item = &progs->items[fstart];
472
fstart++;
473
}
474
item->prog = prog_list_prog(pl);
475
bpf_cgroup_storages_assign(item->cgroup_storage,
476
pl->storage);
477
cnt++;
478
}
479
480
/* reverse pre-ordering progs at this cgroup level */
481
for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
482
swap(progs->items[i], progs->items[j]);
483
484
} while ((p = cgroup_parent(p)));
485
486
*array = progs;
487
return 0;
488
}
489
490
static void activate_effective_progs(struct cgroup *cgrp,
491
enum cgroup_bpf_attach_type atype,
492
struct bpf_prog_array *old_array)
493
{
494
old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
495
lockdep_is_held(&cgroup_mutex));
496
/* free prog array after grace period, since __cgroup_bpf_run_*()
497
* might still be walking the array
498
*/
499
bpf_prog_array_free(old_array);
500
}
501
502
/**
503
* cgroup_bpf_inherit() - inherit effective programs from parent
504
* @cgrp: the cgroup to modify
505
*/
506
static int cgroup_bpf_inherit(struct cgroup *cgrp)
507
{
508
/* has to use a macro instead of const int, since the compiler thinks
509
* that array below is variable length
510
*/
511
#define NR ARRAY_SIZE(cgrp->bpf.effective)
512
struct bpf_prog_array *arrays[NR] = {};
513
struct cgroup *p;
514
int ret, i;
515
516
ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
517
GFP_KERNEL);
518
if (ret)
519
return ret;
520
521
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
522
cgroup_bpf_get(p);
523
524
for (i = 0; i < NR; i++)
525
INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
526
527
INIT_LIST_HEAD(&cgrp->bpf.storages);
528
529
for (i = 0; i < NR; i++)
530
if (compute_effective_progs(cgrp, i, &arrays[i]))
531
goto cleanup;
532
533
for (i = 0; i < NR; i++)
534
activate_effective_progs(cgrp, i, arrays[i]);
535
536
return 0;
537
cleanup:
538
for (i = 0; i < NR; i++)
539
bpf_prog_array_free(arrays[i]);
540
541
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
542
cgroup_bpf_put(p);
543
544
percpu_ref_exit(&cgrp->bpf.refcnt);
545
546
return -ENOMEM;
547
}
548
549
static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
550
unsigned long action, void *data)
551
{
552
struct cgroup *cgrp = data;
553
int ret = 0;
554
555
if (cgrp->root != &cgrp_dfl_root)
556
return NOTIFY_OK;
557
558
switch (action) {
559
case CGROUP_LIFETIME_ONLINE:
560
ret = cgroup_bpf_inherit(cgrp);
561
break;
562
case CGROUP_LIFETIME_OFFLINE:
563
cgroup_bpf_offline(cgrp);
564
break;
565
}
566
567
return notifier_from_errno(ret);
568
}
569
570
static int update_effective_progs(struct cgroup *cgrp,
571
enum cgroup_bpf_attach_type atype)
572
{
573
struct cgroup_subsys_state *css;
574
int err;
575
576
/* allocate and recompute effective prog arrays */
577
css_for_each_descendant_pre(css, &cgrp->self) {
578
struct cgroup *desc = container_of(css, struct cgroup, self);
579
580
if (percpu_ref_is_zero(&desc->bpf.refcnt))
581
continue;
582
583
err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
584
if (err)
585
goto cleanup;
586
}
587
588
/* all allocations were successful. Activate all prog arrays */
589
css_for_each_descendant_pre(css, &cgrp->self) {
590
struct cgroup *desc = container_of(css, struct cgroup, self);
591
592
if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
593
if (unlikely(desc->bpf.inactive)) {
594
bpf_prog_array_free(desc->bpf.inactive);
595
desc->bpf.inactive = NULL;
596
}
597
continue;
598
}
599
600
activate_effective_progs(desc, atype, desc->bpf.inactive);
601
desc->bpf.inactive = NULL;
602
}
603
604
return 0;
605
606
cleanup:
607
/* oom while computing effective. Free all computed effective arrays
608
* since they were not activated
609
*/
610
css_for_each_descendant_pre(css, &cgrp->self) {
611
struct cgroup *desc = container_of(css, struct cgroup, self);
612
613
bpf_prog_array_free(desc->bpf.inactive);
614
desc->bpf.inactive = NULL;
615
}
616
617
return err;
618
}
619
620
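/* Upper bound on directly attached programs per (cgroup, attach type)
 * list; __cgroup_bpf_attach() rejects further attachments with -E2BIG.
 */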
#define BPF_CGROUP_MAX_PROGS 64
621
622
static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
623
struct bpf_prog *prog,
624
struct bpf_cgroup_link *link,
625
struct bpf_prog *replace_prog,
626
bool allow_multi)
627
{
628
struct bpf_prog_list *pl;
629
630
/* single-attach case */
631
if (!allow_multi) {
632
if (hlist_empty(progs))
633
return NULL;
634
return hlist_entry(progs->first, typeof(*pl), node);
635
}
636
637
hlist_for_each_entry(pl, progs, node) {
638
if (prog && pl->prog == prog && prog != replace_prog)
639
/* disallow attaching the same prog twice */
640
return ERR_PTR(-EINVAL);
641
if (link && pl->link == link)
642
/* disallow attaching the same link twice */
643
return ERR_PTR(-EINVAL);
644
}
645
646
/* direct prog multi-attach w/ replacement case */
647
if (replace_prog) {
648
hlist_for_each_entry(pl, progs, node) {
649
if (pl->prog == replace_prog)
650
/* a match found */
651
return pl;
652
}
653
/* prog to replace not found for cgroup */
654
return ERR_PTR(-ENOENT);
655
}
656
657
return NULL;
658
}
659
660
static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
661
{
662
struct bpf_link *link = ERR_PTR(-EINVAL);
663
664
if (flags & BPF_F_ID)
665
link = bpf_link_by_id(id_or_fd);
666
else if (id_or_fd)
667
link = bpf_link_get_from_fd(id_or_fd);
668
return link;
669
}
670
671
static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
672
{
673
struct bpf_prog *prog = ERR_PTR(-EINVAL);
674
675
if (flags & BPF_F_ID)
676
prog = bpf_prog_by_id(id_or_fd);
677
else if (id_or_fd)
678
prog = bpf_prog_get(id_or_fd);
679
return prog;
680
}
681
682
static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
683
struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
684
{
685
bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
686
struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
687
bool preorder = flags & BPF_F_PREORDER;
688
struct bpf_link *anchor_link = NULL;
689
struct bpf_prog *anchor_prog = NULL;
690
bool is_before, is_after;
691
692
is_before = flags & BPF_F_BEFORE;
693
is_after = flags & BPF_F_AFTER;
694
if (is_link || is_id || id_or_fd) {
695
/* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
696
if (is_before == is_after)
697
return ERR_PTR(-EINVAL);
698
if ((is_link && !link) || (!is_link && !prog))
699
return ERR_PTR(-EINVAL);
700
} else if (!hlist_empty(progs)) {
701
/* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
702
if (is_before && is_after)
703
return ERR_PTR(-EINVAL);
704
}
705
706
if (is_link) {
707
anchor_link = bpf_get_anchor_link(flags, id_or_fd);
708
if (IS_ERR(anchor_link))
709
return ERR_CAST(anchor_link);
710
} else if (is_id || id_or_fd) {
711
anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
712
if (IS_ERR(anchor_prog))
713
return ERR_CAST(anchor_prog);
714
}
715
716
if (!anchor_prog && !anchor_link) {
717
/* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
718
* doesn't matter since either prepend or append to a combined
719
* list of progs will end up with correct result.
720
*/
721
hlist_for_each_entry(pltmp, progs, node) {
722
if (is_before)
723
return pltmp;
724
if (pltmp->node.next)
725
continue;
726
return pltmp;
727
}
728
return NULL;
729
}
730
731
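/* Locate the anchor entry in this cgroup's list; its BPF_F_PREORDER
 * flag must match the requested ordering, otherwise the lookup fails
 * with -EINVAL.
 */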
hlist_for_each_entry(pltmp, progs, node) {
732
if ((anchor_prog && anchor_prog == pltmp->prog) ||
733
(anchor_link && anchor_link == &pltmp->link->link)) {
734
if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
735
goto out;
736
pl = pltmp;
737
goto out;
738
}
739
}
740
741
pl = ERR_PTR(-ENOENT);
742
out:
743
if (anchor_link)
744
bpf_link_put(anchor_link);
745
else
746
bpf_prog_put(anchor_prog);
747
return pl;
748
}
749
750
static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
751
struct bpf_prog *prog, struct bpf_cgroup_link *link,
752
u32 flags, u32 id_or_fd)
753
{
754
struct bpf_prog_list *pltmp;
755
756
pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
757
if (IS_ERR(pltmp))
758
return PTR_ERR(pltmp);
759
760
if (!pltmp)
761
hlist_add_head(&pl->node, progs);
762
else if (flags & BPF_F_BEFORE)
763
hlist_add_before(&pl->node, &pltmp->node);
764
else
765
hlist_add_behind(&pl->node, &pltmp->node);
766
767
return 0;
768
}
769
770
/**
771
* __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
772
* propagate the change to descendants
773
* @cgrp: The cgroup which descendants to traverse
774
* @prog: A program to attach
775
* @link: A link to attach
776
* @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
777
* @type: Type of attach operation
778
* @flags: Option flags
779
* @id_or_fd: Relative prog id or fd
780
* @revision: bpf_prog_list revision
781
*
782
* Exactly one of @prog or @link can be non-null.
783
* Must be called with cgroup_mutex held.
784
*/
785
static int __cgroup_bpf_attach(struct cgroup *cgrp,
786
struct bpf_prog *prog, struct bpf_prog *replace_prog,
787
struct bpf_cgroup_link *link,
788
enum bpf_attach_type type, u32 flags, u32 id_or_fd,
789
u64 revision)
790
{
791
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
792
struct bpf_prog *old_prog = NULL;
793
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
794
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
795
struct bpf_prog *new_prog = prog ? : link->link.prog;
796
enum cgroup_bpf_attach_type atype;
797
struct bpf_prog_list *pl;
798
struct hlist_head *progs;
799
int err;
800
801
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
802
((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
803
/* invalid combination */
804
return -EINVAL;
805
if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
806
/* only either replace or insertion with before/after */
807
return -EINVAL;
808
if (link && (prog || replace_prog))
809
/* only either link or prog/replace_prog can be specified */
810
return -EINVAL;
811
if (!!replace_prog != !!(flags & BPF_F_REPLACE))
812
/* replace_prog implies BPF_F_REPLACE, and vice versa */
813
return -EINVAL;
814
815
atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
816
if (atype < 0)
817
return -EINVAL;
818
if (revision && revision != cgrp->bpf.revisions[atype])
819
return -ESTALE;
820
821
progs = &cgrp->bpf.progs[atype];
822
823
if (!hierarchy_allows_attach(cgrp, atype))
824
return -EPERM;
825
826
if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
827
/* Disallow attaching non-overridable on top
828
* of existing overridable in this cgroup.
829
* Disallow attaching multi-prog if overridable or none
830
*/
831
return -EPERM;
832
833
if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
834
return -E2BIG;
835
836
pl = find_attach_entry(progs, prog, link, replace_prog,
837
flags & BPF_F_ALLOW_MULTI);
838
if (IS_ERR(pl))
839
return PTR_ERR(pl);
840
841
if (bpf_cgroup_storages_alloc(storage, new_storage, type,
842
prog ? : link->link.prog, cgrp))
843
return -ENOMEM;
844
845
if (pl) {
846
old_prog = pl->prog;
847
} else {
848
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
849
if (!pl) {
850
bpf_cgroup_storages_free(new_storage);
851
return -ENOMEM;
852
}
853
854
err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
855
if (err) {
856
kfree(pl);
857
bpf_cgroup_storages_free(new_storage);
858
return err;
859
}
860
}
861
862
pl->prog = prog;
863
pl->link = link;
864
pl->flags = flags;
865
bpf_cgroup_storages_assign(pl->storage, storage);
866
cgrp->bpf.flags[atype] = saved_flags;
867
868
if (type == BPF_LSM_CGROUP) {
869
err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type);
870
if (err)
871
goto cleanup;
872
}
873
874
err = update_effective_progs(cgrp, atype);
875
if (err)
876
goto cleanup_trampoline;
877
878
cgrp->bpf.revisions[atype] += 1;
879
if (old_prog) {
880
if (type == BPF_LSM_CGROUP)
881
bpf_trampoline_unlink_cgroup_shim(old_prog);
882
bpf_prog_put(old_prog);
883
} else {
884
static_branch_inc(&cgroup_bpf_enabled_key[atype]);
885
}
886
bpf_cgroup_storages_link(new_storage, cgrp, type);
887
return 0;
888
889
cleanup_trampoline:
890
if (type == BPF_LSM_CGROUP)
891
bpf_trampoline_unlink_cgroup_shim(new_prog);
892
893
cleanup:
894
if (old_prog) {
895
pl->prog = old_prog;
896
pl->link = NULL;
897
}
898
bpf_cgroup_storages_free(new_storage);
899
if (!old_prog) {
900
hlist_del(&pl->node);
901
kfree(pl);
902
}
903
return err;
904
}
905
906
static int cgroup_bpf_attach(struct cgroup *cgrp,
907
struct bpf_prog *prog, struct bpf_prog *replace_prog,
908
struct bpf_cgroup_link *link,
909
enum bpf_attach_type type,
910
u32 flags, u32 id_or_fd, u64 revision)
911
{
912
int ret;
913
914
cgroup_lock();
915
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
916
id_or_fd, revision);
917
cgroup_unlock();
918
return ret;
919
}
920
921
/* Swap updated BPF program for given link in effective program arrays across
922
* all descendant cgroups. This function is guaranteed to succeed.
923
*/
924
static void replace_effective_prog(struct cgroup *cgrp,
925
enum cgroup_bpf_attach_type atype,
926
struct bpf_cgroup_link *link)
927
{
928
struct bpf_prog_array_item *item;
929
struct cgroup_subsys_state *css;
930
struct bpf_prog_array *progs;
931
struct bpf_prog_list *pl;
932
struct hlist_head *head;
933
struct cgroup *cg;
934
int pos;
935
936
css_for_each_descendant_pre(css, &cgrp->self) {
937
struct cgroup *desc = container_of(css, struct cgroup, self);
938
939
if (percpu_ref_is_zero(&desc->bpf.refcnt))
940
continue;
941
942
/* find position of link in effective progs array */
943
for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
944
if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
945
continue;
946
947
head = &cg->bpf.progs[atype];
948
hlist_for_each_entry(pl, head, node) {
949
if (!prog_list_prog(pl))
950
continue;
951
if (pl->link == link)
952
goto found;
953
pos++;
954
}
955
}
956
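/* The link is attached somewhere in this subtree, so the walk above
 * must find it; pos now indexes its slot in the effective array.
 */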
found:
957
BUG_ON(!cg);
958
progs = rcu_dereference_protected(
959
desc->bpf.effective[atype],
960
lockdep_is_held(&cgroup_mutex));
961
item = &progs->items[pos];
962
WRITE_ONCE(item->prog, link->link.prog);
963
}
964
}
965
966
/**
967
* __cgroup_bpf_replace() - Replace link's program and propagate the change
968
* to descendants
969
* @cgrp: The cgroup which descendants to traverse
970
* @link: A link for which to replace BPF program
971
* @new_prog: &struct bpf_prog for the target BPF program with its refcnt
972
* incremented
973
*
974
* Must be called with cgroup_mutex held.
975
*/
976
static int __cgroup_bpf_replace(struct cgroup *cgrp,
977
struct bpf_cgroup_link *link,
978
struct bpf_prog *new_prog)
979
{
980
enum cgroup_bpf_attach_type atype;
981
struct bpf_prog *old_prog;
982
struct bpf_prog_list *pl;
983
struct hlist_head *progs;
984
bool found = false;
985
986
atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id);
987
if (atype < 0)
988
return -EINVAL;
989
990
progs = &cgrp->bpf.progs[atype];
991
992
if (link->link.prog->type != new_prog->type)
993
return -EINVAL;
994
995
hlist_for_each_entry(pl, progs, node) {
996
if (pl->link == link) {
997
found = true;
998
break;
999
}
1000
}
1001
if (!found)
1002
return -ENOENT;
1003
1004
cgrp->bpf.revisions[atype] += 1;
1005
old_prog = xchg(&link->link.prog, new_prog);
1006
replace_effective_prog(cgrp, atype, link);
1007
bpf_prog_put(old_prog);
1008
return 0;
1009
}
1010
1011
static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
1012
struct bpf_prog *old_prog)
1013
{
1014
struct bpf_cgroup_link *cg_link;
1015
int ret;
1016
1017
cg_link = container_of(link, struct bpf_cgroup_link, link);
1018
1019
cgroup_lock();
1020
/* link might have been auto-released by dying cgroup, so fail */
1021
if (!cg_link->cgroup) {
1022
ret = -ENOLINK;
1023
goto out_unlock;
1024
}
1025
if (old_prog && link->prog != old_prog) {
1026
ret = -EPERM;
1027
goto out_unlock;
1028
}
1029
ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
1030
out_unlock:
1031
cgroup_unlock();
1032
return ret;
1033
}
1034
1035
static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
1036
struct bpf_prog *prog,
1037
struct bpf_cgroup_link *link,
1038
bool allow_multi)
1039
{
1040
struct bpf_prog_list *pl;
1041
1042
if (!allow_multi) {
1043
if (hlist_empty(progs))
1044
/* report error when trying to detach and nothing is attached */
1045
return ERR_PTR(-ENOENT);
1046
1047
/* to maintain backward compatibility NONE and OVERRIDE cgroups
1048
* allow detaching with invalid FD (prog==NULL) in legacy mode
1049
*/
1050
return hlist_entry(progs->first, typeof(*pl), node);
1051
}
1052
1053
if (!prog && !link)
1054
/* to detach MULTI prog the user has to specify valid FD
1055
* of the program or link to be detached
1056
*/
1057
return ERR_PTR(-EINVAL);
1058
1059
/* find the prog or link and detach it */
1060
hlist_for_each_entry(pl, progs, node) {
1061
if (pl->prog == prog && pl->link == link)
1062
return pl;
1063
}
1064
return ERR_PTR(-ENOENT);
1065
}
1066
1067
/**
1068
* purge_effective_progs() - After compute_effective_progs fails to alloc new
1069
* cgrp->bpf.inactive table we can recover by
1070
* recomputing the array in place.
1071
*
1072
* @cgrp: The cgroup which descendants to traverse
1073
* @prog: A program to detach or NULL
1074
* @link: A link to detach or NULL
1075
* @atype: Type of detach operation
1076
*/
1077
static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
1078
struct bpf_cgroup_link *link,
1079
enum cgroup_bpf_attach_type atype)
1080
{
1081
struct cgroup_subsys_state *css;
1082
struct bpf_prog_array *progs;
1083
struct bpf_prog_list *pl;
1084
struct hlist_head *head;
1085
struct cgroup *cg;
1086
int pos;
1087
1088
/* recompute effective prog array in place */
1089
css_for_each_descendant_pre(css, &cgrp->self) {
1090
struct cgroup *desc = container_of(css, struct cgroup, self);
1091
1092
if (percpu_ref_is_zero(&desc->bpf.refcnt))
1093
continue;
1094
1095
/* find position of link or prog in effective progs array */
1096
for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
1097
if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
1098
continue;
1099
1100
head = &cg->bpf.progs[atype];
1101
hlist_for_each_entry(pl, head, node) {
1102
if (!prog_list_prog(pl))
1103
continue;
1104
if (pl->prog == prog && pl->link == link)
1105
goto found;
1106
pos++;
1107
}
1108
}
1109
1110
/* no link or prog match, skip the cgroup of this layer */
1111
continue;
1112
found:
1113
progs = rcu_dereference_protected(
1114
desc->bpf.effective[atype],
1115
lockdep_is_held(&cgroup_mutex));
1116
1117
/* Remove the program from the array */
1118
WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
1119
"Failed to purge a prog from array at index %d", pos);
1120
}
1121
}
1122
1123
/**
1124
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
1125
* propagate the change to descendants
1126
* @cgrp: The cgroup which descendants to traverse
1127
* @prog: A program to detach or NULL
1128
* @link: A link to detach or NULL
1129
* @type: Type of detach operation
1130
* @revision: bpf_prog_list revision
1131
*
1132
* At most one of @prog or @link can be non-NULL.
1133
* Must be called with cgroup_mutex held.
1134
*/
1135
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1136
struct bpf_cgroup_link *link, enum bpf_attach_type type,
1137
u64 revision)
1138
{
1139
enum cgroup_bpf_attach_type atype;
1140
struct bpf_prog *old_prog;
1141
struct bpf_prog_list *pl;
1142
struct hlist_head *progs;
1143
u32 attach_btf_id = 0;
1144
u32 flags;
1145
1146
if (prog)
1147
attach_btf_id = prog->aux->attach_btf_id;
1148
if (link)
1149
attach_btf_id = link->link.prog->aux->attach_btf_id;
1150
1151
atype = bpf_cgroup_atype_find(type, attach_btf_id);
1152
if (atype < 0)
1153
return -EINVAL;
1154
1155
if (revision && revision != cgrp->bpf.revisions[atype])
1156
return -ESTALE;
1157
1158
progs = &cgrp->bpf.progs[atype];
1159
flags = cgrp->bpf.flags[atype];
1160
1161
if (prog && link)
1162
/* only one of prog or link can be specified */
1163
return -EINVAL;
1164
1165
pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
1166
if (IS_ERR(pl))
1167
return PTR_ERR(pl);
1168
1169
/* mark it deleted, so it's ignored while recomputing effective */
1170
old_prog = pl->prog;
1171
pl->prog = NULL;
1172
pl->link = NULL;
1173
1174
if (update_effective_progs(cgrp, atype)) {
1175
/* if update of the effective array failed, replace the prog with a dummy prog */
1176
pl->prog = old_prog;
1177
pl->link = link;
1178
purge_effective_progs(cgrp, old_prog, link, atype);
1179
}
1180
1181
/* now can actually delete it from this cgroup list */
1182
hlist_del(&pl->node);
1183
cgrp->bpf.revisions[atype] += 1;
1184
1185
kfree(pl);
1186
if (hlist_empty(progs))
1187
/* last program was detached, reset flags to zero */
1188
cgrp->bpf.flags[atype] = 0;
1189
if (old_prog) {
1190
if (type == BPF_LSM_CGROUP)
1191
bpf_trampoline_unlink_cgroup_shim(old_prog);
1192
bpf_prog_put(old_prog);
1193
}
1194
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
1195
return 0;
1196
}
1197
1198
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1199
enum bpf_attach_type type, u64 revision)
1200
{
1201
int ret;
1202
1203
cgroup_lock();
1204
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
1205
cgroup_unlock();
1206
return ret;
1207
}
1208
1209
/* Must be called with cgroup_mutex held to avoid races. */
1210
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1211
union bpf_attr __user *uattr)
1212
{
1213
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
1214
bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
1215
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
1216
enum bpf_attach_type type = attr->query.attach_type;
1217
enum cgroup_bpf_attach_type from_atype, to_atype;
1218
enum cgroup_bpf_attach_type atype;
1219
struct bpf_prog_array *effective;
1220
int cnt, ret = 0, i;
1221
int total_cnt = 0;
1222
u64 revision = 0;
1223
u32 flags;
1224
1225
if (effective_query && prog_attach_flags)
1226
return -EINVAL;
1227
1228
if (type == BPF_LSM_CGROUP) {
1229
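/* A non-effective BPF_LSM_CGROUP query that asks for prog_ids must also
 * supply a prog_attach_flags array, since a single attach_flags value
 * cannot describe the multiple LSM attach points covered here.
 */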
if (!effective_query && attr->query.prog_cnt &&
1230
prog_ids && !prog_attach_flags)
1231
return -EINVAL;
1232
1233
from_atype = CGROUP_LSM_START;
1234
to_atype = CGROUP_LSM_END;
1235
flags = 0;
1236
} else {
1237
from_atype = to_cgroup_bpf_attach_type(type);
1238
if (from_atype < 0)
1239
return -EINVAL;
1240
to_atype = from_atype;
1241
flags = cgrp->bpf.flags[from_atype];
1242
}
1243
1244
for (atype = from_atype; atype <= to_atype; atype++) {
1245
if (effective_query) {
1246
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1247
lockdep_is_held(&cgroup_mutex));
1248
total_cnt += bpf_prog_array_length(effective);
1249
} else {
1250
total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
1251
}
1252
}
1253
1254
/* always output uattr->query.attach_flags as 0 during effective query */
1255
flags = effective_query ? 0 : flags;
1256
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
1257
return -EFAULT;
1258
if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
1259
return -EFAULT;
1260
if (!effective_query && from_atype == to_atype)
1261
revision = cgrp->bpf.revisions[from_atype];
1262
if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
1263
return -EFAULT;
1264
if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
1265
/* return early if user requested only program count + flags */
1266
return 0;
1267
1268
if (attr->query.prog_cnt < total_cnt) {
1269
total_cnt = attr->query.prog_cnt;
1270
ret = -ENOSPC;
1271
}
1272
1273
for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
1274
if (effective_query) {
1275
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1276
lockdep_is_held(&cgroup_mutex));
1277
cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
1278
ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
1279
} else {
1280
struct hlist_head *progs;
1281
struct bpf_prog_list *pl;
1282
struct bpf_prog *prog;
1283
u32 id;
1284
1285
progs = &cgrp->bpf.progs[atype];
1286
cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
1287
i = 0;
1288
hlist_for_each_entry(pl, progs, node) {
1289
prog = prog_list_prog(pl);
1290
id = prog->aux->id;
1291
if (copy_to_user(prog_ids + i, &id, sizeof(id)))
1292
return -EFAULT;
1293
if (++i == cnt)
1294
break;
1295
}
1296
1297
if (prog_attach_flags) {
1298
flags = cgrp->bpf.flags[atype];
1299
1300
for (i = 0; i < cnt; i++)
1301
if (copy_to_user(prog_attach_flags + i,
1302
&flags, sizeof(flags)))
1303
return -EFAULT;
1304
prog_attach_flags += cnt;
1305
}
1306
}
1307
1308
prog_ids += cnt;
1309
total_cnt -= cnt;
1310
}
1311
return ret;
1312
}
1313
1314
static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1315
union bpf_attr __user *uattr)
1316
{
1317
int ret;
1318
1319
cgroup_lock();
1320
ret = __cgroup_bpf_query(cgrp, attr, uattr);
1321
cgroup_unlock();
1322
return ret;
1323
}
1324
1325
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
1326
enum bpf_prog_type ptype, struct bpf_prog *prog)
1327
{
1328
struct bpf_prog *replace_prog = NULL;
1329
struct cgroup *cgrp;
1330
int ret;
1331
1332
cgrp = cgroup_get_from_fd(attr->target_fd);
1333
if (IS_ERR(cgrp))
1334
return PTR_ERR(cgrp);
1335
1336
if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
1337
(attr->attach_flags & BPF_F_REPLACE)) {
1338
replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
1339
if (IS_ERR(replace_prog)) {
1340
cgroup_put(cgrp);
1341
return PTR_ERR(replace_prog);
1342
}
1343
}
1344
1345
ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1346
attr->attach_type, attr->attach_flags,
1347
attr->relative_fd, attr->expected_revision);
1348
1349
if (replace_prog)
1350
bpf_prog_put(replace_prog);
1351
cgroup_put(cgrp);
1352
return ret;
1353
}
1354
1355
int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
1356
{
1357
struct bpf_prog *prog;
1358
struct cgroup *cgrp;
1359
int ret;
1360
1361
cgrp = cgroup_get_from_fd(attr->target_fd);
1362
if (IS_ERR(cgrp))
1363
return PTR_ERR(cgrp);
1364
1365
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
1366
if (IS_ERR(prog))
1367
prog = NULL;
1368
1369
ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
1370
if (prog)
1371
bpf_prog_put(prog);
1372
1373
cgroup_put(cgrp);
1374
return ret;
1375
}
1376
1377
static void bpf_cgroup_link_release(struct bpf_link *link)
1378
{
1379
struct bpf_cgroup_link *cg_link =
1380
container_of(link, struct bpf_cgroup_link, link);
1381
struct cgroup *cg;
1382
1383
/* link might have been auto-detached by dying cgroup already,
1384
* in that case our work is done here
1385
*/
1386
if (!cg_link->cgroup)
1387
return;
1388
1389
cgroup_lock();
1390
1391
/* re-check cgroup under lock again */
1392
if (!cg_link->cgroup) {
1393
cgroup_unlock();
1394
return;
1395
}
1396
1397
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1398
link->attach_type, 0));
1399
if (link->attach_type == BPF_LSM_CGROUP)
1400
bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
1401
1402
cg = cg_link->cgroup;
1403
cg_link->cgroup = NULL;
1404
1405
cgroup_unlock();
1406
1407
cgroup_put(cg);
1408
}
1409
1410
static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1411
{
1412
struct bpf_cgroup_link *cg_link =
1413
container_of(link, struct bpf_cgroup_link, link);
1414
1415
kfree(cg_link);
1416
}
1417
1418
static int bpf_cgroup_link_detach(struct bpf_link *link)
1419
{
1420
bpf_cgroup_link_release(link);
1421
1422
return 0;
1423
}
1424
1425
static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1426
struct seq_file *seq)
1427
{
1428
struct bpf_cgroup_link *cg_link =
1429
container_of(link, struct bpf_cgroup_link, link);
1430
u64 cg_id = 0;
1431
1432
cgroup_lock();
1433
if (cg_link->cgroup)
1434
cg_id = cgroup_id(cg_link->cgroup);
1435
cgroup_unlock();
1436
1437
seq_printf(seq,
1438
"cgroup_id:\t%llu\n"
1439
"attach_type:\t%d\n",
1440
cg_id,
1441
link->attach_type);
1442
}
1443
1444
static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1445
struct bpf_link_info *info)
1446
{
1447
struct bpf_cgroup_link *cg_link =
1448
container_of(link, struct bpf_cgroup_link, link);
1449
u64 cg_id = 0;
1450
1451
cgroup_lock();
1452
if (cg_link->cgroup)
1453
cg_id = cgroup_id(cg_link->cgroup);
1454
cgroup_unlock();
1455
1456
info->cgroup.cgroup_id = cg_id;
1457
info->cgroup.attach_type = link->attach_type;
1458
return 0;
1459
}
1460
1461
static const struct bpf_link_ops bpf_cgroup_link_lops = {
1462
.release = bpf_cgroup_link_release,
1463
.dealloc = bpf_cgroup_link_dealloc,
1464
.detach = bpf_cgroup_link_detach,
1465
.update_prog = cgroup_bpf_replace,
1466
.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1467
.fill_link_info = bpf_cgroup_link_fill_link_info,
1468
};
1469
1470
#define BPF_F_LINK_ATTACH_MASK \
1471
(BPF_F_ID | \
1472
BPF_F_BEFORE | \
1473
BPF_F_AFTER | \
1474
BPF_F_PREORDER | \
1475
BPF_F_LINK)
1476
1477
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1478
{
1479
struct bpf_link_primer link_primer;
1480
struct bpf_cgroup_link *link;
1481
struct cgroup *cgrp;
1482
int err;
1483
1484
if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
1485
return -EINVAL;
1486
1487
cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1488
if (IS_ERR(cgrp))
1489
return PTR_ERR(cgrp);
1490
1491
link = kzalloc(sizeof(*link), GFP_USER);
1492
if (!link) {
1493
err = -ENOMEM;
1494
goto out_put_cgroup;
1495
}
1496
bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1497
prog, attr->link_create.attach_type);
1498
link->cgroup = cgrp;
1499
1500
err = bpf_link_prime(&link->link, &link_primer);
1501
if (err) {
1502
kfree(link);
1503
goto out_put_cgroup;
1504
}
1505
1506
err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1507
link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
1508
attr->link_create.cgroup.relative_fd,
1509
attr->link_create.cgroup.expected_revision);
1510
if (err) {
1511
bpf_link_cleanup(&link_primer);
1512
goto out_put_cgroup;
1513
}
1514
1515
return bpf_link_settle(&link_primer);
1516
1517
out_put_cgroup:
1518
cgroup_put(cgrp);
1519
return err;
1520
}
1521
1522
int cgroup_bpf_prog_query(const union bpf_attr *attr,
1523
union bpf_attr __user *uattr)
1524
{
1525
struct cgroup *cgrp;
1526
int ret;
1527
1528
cgrp = cgroup_get_from_fd(attr->query.target_fd);
1529
if (IS_ERR(cgrp))
1530
return PTR_ERR(cgrp);
1531
1532
ret = cgroup_bpf_query(cgrp, attr, uattr);
1533
1534
cgroup_put(cgrp);
1535
return ret;
1536
}
1537
1538
/**
1539
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1540
* @sk: The socket sending or receiving traffic
1541
* @skb: The skb that is being sent or received
1542
* @atype: The type of program to be executed
1543
*
1544
* If no socket is passed, or the socket is not of type INET or INET6,
1545
* this function does nothing and returns 0.
1546
*
1547
* The program type passed in via @type must be suitable for network
1548
* filtering. No further check is performed to assert that.
1549
*
1550
* For egress packets, this function can return:
1551
* NET_XMIT_SUCCESS (0) - continue with packet output
1552
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
1553
* NET_XMIT_CN (2) - continue with packet output and notify TCP
1554
* to call cwr
1555
* -err - drop packet
1556
*
1557
* For ingress packets, this function will return -EPERM if any
1558
* attached program was found and if it returned != 1 during execution.
1559
* Otherwise 0 is returned.
1560
*/
1561
int __cgroup_bpf_run_filter_skb(struct sock *sk,
1562
struct sk_buff *skb,
1563
enum cgroup_bpf_attach_type atype)
1564
{
1565
unsigned int offset = -skb_network_offset(skb);
1566
struct sock *save_sk;
1567
void *saved_data_end;
1568
struct cgroup *cgrp;
1569
int ret;
1570
1571
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1572
return 0;
1573
1574
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1575
save_sk = skb->sk;
1576
skb->sk = sk;
1577
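/* Temporarily point skb->data back at the network header so the
 * program sees the full IP packet; it is pulled back before returning.
 */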
__skb_push(skb, offset);
1578
1579
/* compute pointers for the bpf prog */
1580
bpf_compute_and_save_data_end(skb, &saved_data_end);
1581
1582
if (atype == CGROUP_INET_EGRESS) {
1583
u32 flags = 0;
1584
bool cn;
1585
1586
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1587
__bpf_prog_run_save_cb, 0, &flags);
1588
1589
/* Return values of CGROUP EGRESS BPF programs are:
1590
* 0: drop packet
1591
* 1: keep packet
1592
* 2: drop packet and cn
1593
* 3: keep packet and cn
1594
*
1595
* The returned value is then converted to one of the NET_XMIT
1596
* or an error code that is then interpreted as drop packet
1597
* (and no cn):
1598
* 0: NET_XMIT_SUCCESS skb should be transmitted
1599
* 1: NET_XMIT_DROP skb should be dropped and cn
1600
* 2: NET_XMIT_CN skb should be transmitted and cn
1601
* 3: -err skb should be dropped
1602
*/
1603
1604
cn = flags & BPF_RET_SET_CN;
1605
if (ret && !IS_ERR_VALUE((long)ret))
1606
ret = -EFAULT;
1607
if (!ret)
1608
ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1609
else
1610
ret = (cn ? NET_XMIT_DROP : ret);
1611
} else {
1612
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1613
skb, __bpf_prog_run_save_cb, 0,
1614
NULL);
1615
if (ret && !IS_ERR_VALUE((long)ret))
1616
ret = -EFAULT;
1617
}
1618
bpf_restore_data_end(skb, saved_data_end);
1619
__skb_pull(skb, offset);
1620
skb->sk = save_sk;
1621
1622
return ret;
1623
}
1624
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1625
1626
/**
1627
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
1628
* @sk: sock structure to manipulate
1629
* @atype: The type of program to be executed
1630
*
1631
* The socket passed is expected to be of type INET or INET6.
1632
*
1633
* The program type passed in via @type must be suitable for sock
1634
* filtering. No further check is performed to assert that.
1635
*
1636
* This function will return %-EPERM if an attached program was found
1637
* and if it returned != 1 during execution. In all other cases, 0 is returned.
1638
*/
1639
int __cgroup_bpf_run_filter_sk(struct sock *sk,
1640
enum cgroup_bpf_attach_type atype)
1641
{
1642
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1643
1644
return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1645
NULL);
1646
}
1647
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1648
1649
/**
1650
* __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1651
* provided by user sockaddr
1652
* @sk: sock struct that will use sockaddr
1653
* @uaddr: sockaddr struct provided by user
1654
* @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
1655
* read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
1656
* uaddr.
1657
* @atype: The type of program to be executed
1658
* @t_ctx: Pointer to attach type specific context
1659
* @flags: Pointer to u32 which contains higher bits of BPF program
1660
* return value (OR'ed together).
1661
*
1662
* socket is expected to be of type INET, INET6 or UNIX.
1663
*
1664
* This function will return %-EPERM if an attached program is found and
1665
* returned value != 1 during execution. In all other cases, 0 is returned.
1666
*/
1667
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1668
struct sockaddr *uaddr,
1669
int *uaddrlen,
1670
enum cgroup_bpf_attach_type atype,
1671
void *t_ctx,
1672
u32 *flags)
1673
{
1674
struct bpf_sock_addr_kern ctx = {
1675
.sk = sk,
1676
.uaddr = uaddr,
1677
.t_ctx = t_ctx,
1678
};
1679
struct sockaddr_storage unspec;
1680
struct cgroup *cgrp;
1681
int ret;
1682
1683
/* Check socket family since not all sockets represent network
1684
* endpoint (e.g. AF_UNIX).
1685
*/
1686
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
1687
sk->sk_family != AF_UNIX)
1688
return 0;
1689
1690
if (!ctx.uaddr) {
1691
memset(&unspec, 0, sizeof(unspec));
1692
ctx.uaddr = (struct sockaddr *)&unspec;
1693
ctx.uaddrlen = 0;
1694
} else {
1695
ctx.uaddrlen = *uaddrlen;
1696
}
1697
1698
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1699
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1700
0, flags);
1701
1702
if (!ret && uaddr)
1703
*uaddrlen = ctx.uaddrlen;
1704
1705
return ret;
1706
}
1707
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1708
1709
/**
1710
* __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1711
* @sk: socket to get cgroup from
1712
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1713
* sk with connection information (IP addresses, etc.) May not contain
1714
* cgroup info if it is a req sock.
1715
* @atype: The type of program to be executed
1716
*
1717
* The socket passed is expected to be of type INET or INET6.
1718
*
1719
* The program type passed in via @type must be suitable for sock_ops
1720
* filtering. No further check is performed to assert that.
1721
*
1722
* This function will return %-EPERM if an attached program was found
1723
* and if it returned != 1 during execution. In all other cases, 0 is returned.
1724
*/
1725
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1726
struct bpf_sock_ops_kern *sock_ops,
1727
enum cgroup_bpf_attach_type atype)
1728
{
1729
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1730
1731
return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1732
0, NULL);
1733
}
1734
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1735
1736
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1737
short access, enum cgroup_bpf_attach_type atype)
1738
{
1739
struct cgroup *cgrp;
1740
struct bpf_cgroup_dev_ctx ctx = {
1741
.access_type = (access << 16) | dev_type,
1742
.major = major,
1743
.minor = minor,
1744
};
1745
int ret;
1746
1747
rcu_read_lock();
1748
cgrp = task_dfl_cgroup(current);
1749
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1750
NULL);
1751
rcu_read_unlock();
1752
1753
return ret;
1754
}
1755
1756
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
1757
{
1758
/* flags argument is not used now,
1759
* but provides an ability to extend the API.
1760
* verifier checks that its value is correct.
1761
*/
1762
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
1763
struct bpf_cgroup_storage *storage;
1764
struct bpf_cg_run_ctx *ctx;
1765
void *ptr;
1766
1767
/* get current cgroup storage from BPF run context */
1768
ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1769
storage = ctx->prog_item->cgroup_storage[stype];
1770
1771
if (stype == BPF_CGROUP_STORAGE_SHARED)
1772
ptr = &READ_ONCE(storage->buf)->data[0];
1773
else
1774
ptr = this_cpu_ptr(storage->percpu_buf);
1775
1776
return (unsigned long)ptr;
1777
}
1778
1779
const struct bpf_func_proto bpf_get_local_storage_proto = {
1780
.func = bpf_get_local_storage,
1781
.gpl_only = false,
1782
.ret_type = RET_PTR_TO_MAP_VALUE,
1783
.arg1_type = ARG_CONST_MAP_PTR,
1784
.arg2_type = ARG_ANYTHING,
1785
};
1786
1787
BPF_CALL_0(bpf_get_retval)
1788
{
1789
struct bpf_cg_run_ctx *ctx =
1790
container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1791
1792
return ctx->retval;
1793
}
1794
1795
const struct bpf_func_proto bpf_get_retval_proto = {
1796
.func = bpf_get_retval,
1797
.gpl_only = false,
1798
.ret_type = RET_INTEGER,
1799
};
1800
1801
BPF_CALL_1(bpf_set_retval, int, retval)
1802
{
1803
struct bpf_cg_run_ctx *ctx =
1804
container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1805
1806
ctx->retval = retval;
1807
return 0;
1808
}
1809
1810
const struct bpf_func_proto bpf_set_retval_proto = {
1811
.func = bpf_set_retval,
1812
.gpl_only = false,
1813
.ret_type = RET_INTEGER,
1814
.arg1_type = ARG_ANYTHING,
1815
};
1816
1817
static const struct bpf_func_proto *
1818
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1819
{
1820
const struct bpf_func_proto *func_proto;
1821
1822
func_proto = cgroup_common_func_proto(func_id, prog);
1823
if (func_proto)
1824
return func_proto;
1825
1826
switch (func_id) {
1827
case BPF_FUNC_perf_event_output:
1828
return &bpf_event_output_data_proto;
1829
default:
1830
return bpf_base_func_proto(func_id, prog);
1831
}
1832
}
1833
1834
static bool cgroup_dev_is_valid_access(int off, int size,
1835
enum bpf_access_type type,
1836
const struct bpf_prog *prog,
1837
struct bpf_insn_access_aux *info)
1838
{
1839
const int size_default = sizeof(__u32);
1840
1841
if (type == BPF_WRITE)
1842
return false;
1843
1844
if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1845
return false;
1846
/* The verifier guarantees that size > 0. */
1847
if (off % size != 0)
1848
return false;
1849
1850
switch (off) {
1851
case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1852
bpf_ctx_record_field_size(info, size_default);
1853
if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1854
return false;
1855
break;
1856
default:
1857
if (size != size_default)
1858
return false;
1859
}
1860
1861
return true;
1862
}
1863
1864
const struct bpf_prog_ops cg_dev_prog_ops = {
1865
};
1866
1867
const struct bpf_verifier_ops cg_dev_verifier_ops = {
1868
.get_func_proto = cgroup_dev_func_proto,
1869
.is_valid_access = cgroup_dev_is_valid_access,
1870
};

/**
 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 *
 * @head: sysctl table header
 * @table: sysctl table
 * @write: sysctl is being read (= 0) or written (= 1)
 * @buf: pointer to buffer (in and out)
 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 *        result is size of the new value in @buf if program set a new value,
 *        initial value otherwise
 * @ppos: value-result argument: value is position at which read from or write
 *        to sysctl is happening, result is new position if program overrode it,
 *        initial value otherwise
 * @atype: type of program to be executed
 *
 * Program is run when sysctl is being accessed, either read or written, and
 * can allow or deny such access.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases 0 is returned.
 */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
                                   const struct ctl_table *table, int write,
                                   char **buf, size_t *pcount, loff_t *ppos,
                                   enum cgroup_bpf_attach_type atype)
{
        struct bpf_sysctl_kern ctx = {
                .head = head,
                .table = table,
                .write = write,
                .ppos = ppos,
                .cur_val = NULL,
                .cur_len = PAGE_SIZE,
                .new_val = NULL,
                .new_len = 0,
                .new_updated = 0,
        };
        struct cgroup *cgrp;
        loff_t pos = 0;
        int ret;

        ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
        if (!ctx.cur_val ||
            table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
                /* Let BPF program decide how to proceed. */
                ctx.cur_len = 0;
        }

        if (write && *buf && *pcount) {
                /* BPF program should be able to override new value with a
                 * buffer bigger than provided by user.
                 */
                ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
                ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
                if (ctx.new_val) {
                        memcpy(ctx.new_val, *buf, ctx.new_len);
                } else {
                        /* Let BPF program decide how to proceed. */
                        ctx.new_len = 0;
                }
        }

        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
                                    NULL);
        rcu_read_unlock();

        kfree(ctx.cur_val);

        if (ret == 1 && ctx.new_updated) {
                kfree(*buf);
                *buf = ctx.new_val;
                *pcount = ctx.new_len;
        } else {
                kfree(ctx.new_val);
        }

        return ret;
}
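
/*
 * Illustrative sketch (not part of this file): the simplest useful
 * BPF_PROG_TYPE_CGROUP_SYSCTL program for the hook above. Returning 0 makes
 * the access fail with -EPERM, returning 1 allows it. Assumes libbpf's SEC()
 * convention; the function name is hypothetical.
 *
 *        SEC("cgroup/sysctl")
 *        int sysctl_read_only(struct bpf_sysctl *ctx)
 *        {
 *                // allow reads, refuse every sysctl write from this cgroup
 *                return ctx->write ? 0 : 1;
 *        }
 */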

#ifdef CONFIG_NET
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
                             struct bpf_sockopt_buf *buf)
{
        if (unlikely(max_optlen < 0))
                return -EINVAL;

        if (unlikely(max_optlen > PAGE_SIZE)) {
                /* We don't expose optvals that are greater than PAGE_SIZE
                 * to the BPF program.
                 */
                max_optlen = PAGE_SIZE;
        }

        if (max_optlen <= sizeof(buf->data)) {
                /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
                 * bytes avoid the cost of kzalloc.
                 */
                ctx->optval = buf->data;
                ctx->optval_end = ctx->optval + max_optlen;
                return max_optlen;
        }

        ctx->optval = kzalloc(max_optlen, GFP_USER);
        if (!ctx->optval)
                return -ENOMEM;

        ctx->optval_end = ctx->optval + max_optlen;

        return max_optlen;
}

static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
                             struct bpf_sockopt_buf *buf)
{
        if (ctx->optval == buf->data)
                return;
        kfree(ctx->optval);
}

static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
                                  struct bpf_sockopt_buf *buf)
{
        return ctx->optval != buf->data;
}

int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
                                       int *optname, sockptr_t optval,
                                       int *optlen, char **kernel_optval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = *level,
                .optname = *optname,
        };
        int ret, max_optlen;

        /* Allocate a bit more than the initial user buffer for
         * BPF program. The canonical use case is overriding
         * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
         */
        max_optlen = max_t(int, 16, *optlen);
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;

        ctx.optlen = *optlen;

        if (copy_from_sockptr(ctx.optval, optval,
                              min(*optlen, max_optlen))) {
                ret = -EFAULT;
                goto out;
        }

        lock_sock(sk);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
                                    &ctx, bpf_prog_run, 0, NULL);
        release_sock(sk);

        if (ret)
                goto out;

        if (ctx.optlen == -1) {
                /* optlen set to -1, bypass kernel */
                ret = 1;
        } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
                /* optlen is out of bounds */
                if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
                        pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
                                     ctx.optlen, max_optlen);
                        ret = 0;
                        goto out;
                }
                ret = -EFAULT;
        } else {
                /* optlen within bounds, run kernel handler */
                ret = 0;

                /* export any potential modifications */
                *level = ctx.level;
                *optname = ctx.optname;

                /* optlen == 0 from BPF indicates that we should
                 * use original userspace data.
                 */
                if (ctx.optlen != 0) {
                        *optlen = ctx.optlen;
                        /* We've used bpf_sockopt_kern->buf as an intermediary
                         * storage, but the BPF program indicates that we need
                         * to pass this data to the kernel setsockopt handler.
                         * No way to export on-stack buf, have to allocate a
                         * new buffer.
                         */
                        if (!sockopt_buf_allocated(&ctx, &buf)) {
                                void *p = kmalloc(ctx.optlen, GFP_USER);

                                if (!p) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                                memcpy(p, ctx.optval, ctx.optlen);
                                *kernel_optval = p;
                        } else {
                                *kernel_optval = ctx.optval;
                        }
                        /* export and don't free sockopt buf */
                        return 0;
                }
        }

out:
        sockopt_free_buf(&ctx, &buf);
        return ret;
}
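
/*
 * Illustrative sketch (not part of this file): a BPF_CGROUP_SETSOCKOPT
 * program for the path above, forcing the congestion control mentioned in
 * the comment next to max_optlen. Assumes libbpf's SEC() convention plus
 * SOL_TCP/TCP_CONGESTION definitions; the function name is hypothetical.
 *
 *        SEC("cgroup/setsockopt")
 *        int force_cubic(struct bpf_sockopt *ctx)
 *        {
 *                char cubic[] = "cubic";
 *
 *                if (ctx->level != SOL_TCP || ctx->optname != TCP_CONGESTION)
 *                        return 1;       // let other options through untouched
 *
 *                if (ctx->optval + sizeof(cubic) > ctx->optval_end)
 *                        return 0;       // reject rather than write out of bounds
 *
 *                __builtin_memcpy(ctx->optval, cubic, sizeof(cubic));
 *                ctx->optlen = sizeof(cubic);
 *                return 1;               // run the kernel handler on the new value
 *        }
 */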

int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
                                       int optname, sockptr_t optval,
                                       sockptr_t optlen, int max_optlen,
                                       int retval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
                .optname = optname,
                .current_task = current,
        };
        int orig_optlen;
        int ret;

        orig_optlen = max_optlen;
        ctx.optlen = max_optlen;
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;

        if (!retval) {
                /* If kernel getsockopt finished successfully,
                 * copy whatever was returned to the user back
                 * into our temporary buffer. Set optlen to the
                 * one that kernel returned as well to let
                 * BPF programs inspect the value.
                 */
                if (copy_from_sockptr(&ctx.optlen, optlen,
                                      sizeof(ctx.optlen))) {
                        ret = -EFAULT;
                        goto out;
                }

                if (ctx.optlen < 0) {
                        ret = -EFAULT;
                        goto out;
                }
                orig_optlen = ctx.optlen;

                if (copy_from_sockptr(ctx.optval, optval,
                                      min(ctx.optlen, max_optlen))) {
                        ret = -EFAULT;
                        goto out;
                }
        }

        lock_sock(sk);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
                                    &ctx, bpf_prog_run, retval, NULL);
        release_sock(sk);

        if (ret < 0)
                goto out;

        if (!sockptr_is_null(optval) &&
            (ctx.optlen > max_optlen || ctx.optlen < 0)) {
                if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
                        pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
                                     ctx.optlen, max_optlen);
                        ret = retval;
                        goto out;
                }
                ret = -EFAULT;
                goto out;
        }

        if (ctx.optlen != 0) {
                if (!sockptr_is_null(optval) &&
                    copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
                        ret = -EFAULT;
                        goto out;
                }
                if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
                        ret = -EFAULT;
                        goto out;
                }
        }

out:
        sockopt_free_buf(&ctx, &buf);
        return ret;
}
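
/*
 * Illustrative sketch (not part of this file): a BPF_CGROUP_GETSOCKOPT
 * program for the path above. It hides one option behind -EPERM and leaves
 * everything else untouched; ctx->optlen == 0 tells the code above not to
 * copy the program's buffer back to userspace. Assumes libbpf's SEC()
 * convention plus SOL_SOCKET/SO_MARK/EPERM definitions; the function name is
 * hypothetical.
 *
 *        SEC("cgroup/getsockopt")
 *        int hide_so_mark(struct bpf_sockopt *ctx)
 *        {
 *                if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK) {
 *                        ctx->retval = -EPERM;   // surfaced to getsockopt()
 *                        ctx->optlen = 0;
 *                        return 1;
 *                }
 *                ctx->optlen = 0;        // keep the kernel's answer as-is
 *                return 1;
 *        }
 */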

int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
                                            int optname, void *optval,
                                            int *optlen, int retval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
                .optname = optname,
                .optlen = *optlen,
                .optval = optval,
                .optval_end = optval + *optlen,
                .current_task = current,
        };
        int ret;

        /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
         * user data back into BPF buffer when retval != 0. This is
         * done as an optimization to avoid extra copy, assuming
         * kernel won't populate the data in case of an error.
         * Here we always pass the data and memset() should
         * be called if that data shouldn't be "exported".
         */

        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
                                    &ctx, bpf_prog_run, retval, NULL);
        if (ret < 0)
                return ret;

        if (ctx.optlen > *optlen)
                return -EFAULT;

        /* BPF programs can shrink the buffer, export the modifications.
         */
        if (ctx.optlen != 0)
                *optlen = ctx.optlen;

        return ret;
}
#endif

static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
                              size_t *lenp)
{
        ssize_t tmp_ret = 0, ret;

        if (dir->header.parent) {
                tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
                if (tmp_ret < 0)
                        return tmp_ret;
        }

        ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
        if (ret < 0)
                return ret;
        *bufp += ret;
        *lenp -= ret;
        ret += tmp_ret;

        /* Avoid leading slash. */
        if (!ret)
                return ret;

        tmp_ret = strscpy(*bufp, "/", *lenp);
        if (tmp_ret < 0)
                return tmp_ret;
        *bufp += tmp_ret;
        *lenp -= tmp_ret;

        return ret + tmp_ret;
}

BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
           size_t, buf_len, u64, flags)
{
        ssize_t tmp_ret = 0, ret;

        if (!buf)
                return -EINVAL;

        if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
                if (!ctx->head)
                        return -EINVAL;
                tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
                if (tmp_ret < 0)
                        return tmp_ret;
        }

        ret = strscpy(buf, ctx->table->procname, buf_len);

        return ret < 0 ? ret : tmp_ret + ret;
}

static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
        .func = bpf_sysctl_get_name,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
                             size_t src_len)
{
        if (!dst)
                return -EINVAL;

        if (!dst_len)
                return -E2BIG;

        if (!src || !src_len) {
                memset(dst, 0, dst_len);
                return -EINVAL;
        }

        memcpy(dst, src, min(dst_len, src_len));

        if (dst_len > src_len) {
                memset(dst + src_len, '\0', dst_len - src_len);
                return src_len;
        }

        dst[dst_len - 1] = '\0';

        return -E2BIG;
}

BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
           char *, buf, size_t, buf_len)
{
        return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
}

static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
        .func = bpf_sysctl_get_current_value,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type = ARG_CONST_SIZE,
};

BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
           size_t, buf_len)
{
        if (!ctx->write) {
                if (buf && buf_len)
                        memset(buf, '\0', buf_len);
                return -EINVAL;
        }
        return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
}

static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
        .func = bpf_sysctl_get_new_value,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type = ARG_CONST_SIZE,
};

BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
           const char *, buf, size_t, buf_len)
{
        if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
                return -EINVAL;

        if (buf_len > PAGE_SIZE - 1)
                return -E2BIG;

        memcpy(ctx->new_val, buf, buf_len);
        ctx->new_len = buf_len;
        ctx->new_updated = 1;

        return 0;
}

static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
        .func = bpf_sysctl_set_new_value,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE,
};
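
/*
 * Illustrative sketch (not part of this file): using the helpers above from a
 * cgroup sysctl program to override the value being written before the proc
 * handler parses it. Assumes libbpf's SEC() convention; the function name and
 * the "pin every write to 1" policy are hypothetical.
 *
 *        SEC("cgroup/sysctl")
 *        int sysctl_pin_writes_to_one(struct bpf_sysctl *ctx)
 *        {
 *                char one[] = "1";
 *
 *                if (!ctx->write)
 *                        return 1;
 *                // rewrites the new_val/new_len consumed by
 *                // __cgroup_bpf_run_filter_sysctl()
 *                if (bpf_sysctl_set_new_value(ctx, one, sizeof(one) - 1))
 *                        return 0;
 *                return 1;
 *        }
 */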

static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
        case BPF_FUNC_sysctl_get_name:
                return &bpf_sysctl_get_name_proto;
        case BPF_FUNC_sysctl_get_current_value:
                return &bpf_sysctl_get_current_value_proto;
        case BPF_FUNC_sysctl_get_new_value:
                return &bpf_sysctl_get_new_value_proto;
        case BPF_FUNC_sysctl_set_new_value:
                return &bpf_sysctl_set_new_value_proto;
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        default:
                return bpf_base_func_proto(func_id, prog);
        }
}

static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
                return false;

        switch (off) {
        case bpf_ctx_range(struct bpf_sysctl, write):
                if (type != BPF_READ)
                        return false;
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);
        case bpf_ctx_range(struct bpf_sysctl, file_pos):
                if (type == BPF_READ) {
                        bpf_ctx_record_field_size(info, size_default);
                        return bpf_ctx_narrow_access_ok(off, size, size_default);
                } else {
                        return size == size_default;
                }
        default:
                return false;
        }
}

static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        u32 read_size;

        switch (si->off) {
        case offsetof(struct bpf_sysctl, write):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct bpf_sysctl_kern, write,
                                       sizeof_field(struct bpf_sysctl_kern,
                                                    write),
                                       target_size));
                break;
        case offsetof(struct bpf_sysctl, file_pos):
                /* ppos is a pointer so it should be accessed via indirect
                 * loads and stores. Also for stores additional temporary
                 * register is used since neither src_reg nor dst_reg can be
                 * overridden.
                 */
                if (type == BPF_WRITE) {
                        int treg = BPF_REG_9;

                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        *insn++ = BPF_STX_MEM(
                                BPF_DW, si->dst_reg, treg,
                                offsetof(struct bpf_sysctl_kern, tmp_reg));
                        *insn++ = BPF_LDX_MEM(
                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
                                treg, si->dst_reg,
                                offsetof(struct bpf_sysctl_kern, ppos));
                        *insn++ = BPF_RAW_INSN(
                                BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
                                treg, si->src_reg,
                                bpf_ctx_narrow_access_offset(
                                        0, sizeof(u32), sizeof(loff_t)),
                                si->imm);
                        *insn++ = BPF_LDX_MEM(
                                BPF_DW, treg, si->dst_reg,
                                offsetof(struct bpf_sysctl_kern, tmp_reg));
                } else {
                        *insn++ = BPF_LDX_MEM(
                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
                                si->dst_reg, si->src_reg,
                                offsetof(struct bpf_sysctl_kern, ppos));
                        read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
                        *insn++ = BPF_LDX_MEM(
                                BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
                                bpf_ctx_narrow_access_offset(
                                        0, read_size, sizeof(loff_t)));
                }
                *target_size = sizeof(u32);
                break;
        }

        return insn - insn_buf;
}

const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
        .get_func_proto = sysctl_func_proto,
        .is_valid_access = sysctl_is_valid_access,
        .convert_ctx_access = sysctl_convert_ctx_access,
};

const struct bpf_prog_ops cg_sysctl_prog_ops = {
};

#ifdef CONFIG_NET
BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
{
        const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;

        return net->net_cookie;
}

static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
        .func = bpf_get_netns_cookie_sockopt,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
};
#endif

static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
#ifdef CONFIG_NET
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sockopt_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_setsockopt:
                if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
                        return &bpf_sk_setsockopt_proto;
                return NULL;
        case BPF_FUNC_getsockopt:
                if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
                        return &bpf_sk_getsockopt_proto;
                return NULL;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
#endif
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        default:
                return bpf_base_func_proto(func_id, prog);
        }
}

static bool cg_sockopt_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct bpf_sockopt))
                return false;

        if (off % size != 0)
                return false;

        if (type == BPF_WRITE) {
                switch (off) {
                case offsetof(struct bpf_sockopt, retval):
                        if (size != size_default)
                                return false;
                        return prog->expected_attach_type ==
                                BPF_CGROUP_GETSOCKOPT;
                case offsetof(struct bpf_sockopt, optname):
                        fallthrough;
                case offsetof(struct bpf_sockopt, level):
                        if (size != size_default)
                                return false;
                        return prog->expected_attach_type ==
                                BPF_CGROUP_SETSOCKOPT;
                case offsetof(struct bpf_sockopt, optlen):
                        return size == size_default;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range_ptr(struct bpf_sockopt, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                break;
        case bpf_ctx_range_ptr(struct bpf_sockopt, optval):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct bpf_sockopt, retval):
                if (size != size_default)
                        return false;
                return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
        default:
                if (size != size_default)
                        return false;
                break;
        }
        return true;
}

#define CG_SOCKOPT_READ_FIELD(F) \
        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
                    si->dst_reg, si->src_reg, \
                    offsetof(struct bpf_sockopt_kern, F))

#define CG_SOCKOPT_WRITE_FIELD(F) \
        BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
                      BPF_MEM | BPF_CLASS(si->code)), \
                     si->dst_reg, si->src_reg, \
                     offsetof(struct bpf_sockopt_kern, F), \
                     si->imm)

static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog,
                                         u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sockopt, sk):
                *insn++ = CG_SOCKOPT_READ_FIELD(sk);
                break;
        case offsetof(struct bpf_sockopt, level):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(level);
                break;
        case offsetof(struct bpf_sockopt, optname):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optname);
                break;
        case offsetof(struct bpf_sockopt, optlen):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
                break;
        case offsetof(struct bpf_sockopt, retval):
                BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);

                if (type == BPF_WRITE) {
                        int treg = BPF_REG_9;

                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              treg, treg,
                                              offsetof(struct task_struct, bpf_ctx));
                        *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
                                               BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                               treg, si->src_reg,
                                               offsetof(struct bpf_cg_run_ctx, retval),
                                               si->imm);
                        *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                } else {
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              si->dst_reg, si->src_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct task_struct, bpf_ctx));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct bpf_cg_run_ctx, retval));
                }
                break;
        case offsetof(struct bpf_sockopt, optval):
                *insn++ = CG_SOCKOPT_READ_FIELD(optval);
                break;
        case offsetof(struct bpf_sockopt, optval_end):
                *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
                break;
        }

        return insn - insn_buf;
}

static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
                                   bool direct_write,
                                   const struct bpf_prog *prog)
{
        /* Nothing to do for sockopt argument. The data is kzalloc'ated.
         */
        return 0;
}

const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
        .get_func_proto = cg_sockopt_func_proto,
        .is_valid_access = cg_sockopt_is_valid_access,
        .convert_ctx_access = cg_sockopt_convert_ctx_access,
        .gen_prologue = cg_sockopt_get_prologue,
};

const struct bpf_prog_ops cg_sockopt_prog_ops = {
};

/* Common helpers for cgroup hooks. */
const struct bpf_func_proto *
cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_get_local_storage:
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_get_retval:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET_INGRESS:
                case BPF_CGROUP_INET_EGRESS:
                case BPF_CGROUP_SOCK_OPS:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return NULL;
                default:
                        return &bpf_get_retval_proto;
                }
        case BPF_FUNC_set_retval:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET_INGRESS:
                case BPF_CGROUP_INET_EGRESS:
                case BPF_CGROUP_SOCK_OPS:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return NULL;
                default:
                        return &bpf_set_retval_proto;
                }
        default:
                return NULL;
        }
}