GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/cgroup.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Functions to manage eBPF programs attached to cgroups
4
*
5
* Copyright (c) 2016 Daniel Mack
6
*/
7
8
#include <linux/kernel.h>
9
#include <linux/atomic.h>
10
#include <linux/cgroup.h>
11
#include <linux/filter.h>
12
#include <linux/slab.h>
13
#include <linux/sysctl.h>
14
#include <linux/string.h>
15
#include <linux/bpf.h>
16
#include <linux/bpf-cgroup.h>
17
#include <linux/bpf_lsm.h>
18
#include <linux/bpf_verifier.h>
19
#include <net/sock.h>
20
#include <net/bpf_sk_storage.h>
21
22
#include "../cgroup/cgroup-internal.h"
23
24
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
25
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
26
27
/*
28
* cgroup bpf destruction makes heavy use of work items and there can be a lot
29
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
30
* destruction work items don't end up filling up max_active of system_percpu_wq
31
* which may lead to deadlock.
32
*/
33
static struct workqueue_struct *cgroup_bpf_destroy_wq;
34
35
static int __init cgroup_bpf_wq_init(void)
36
{
37
cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
38
WQ_PERCPU, 1);
39
if (!cgroup_bpf_destroy_wq)
40
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
41
return 0;
42
}
43
core_initcall(cgroup_bpf_wq_init);
44
45
static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
46
unsigned long action, void *data);
47
48
static struct notifier_block cgroup_bpf_lifetime_nb = {
49
.notifier_call = cgroup_bpf_lifetime_notify,
50
};
51
52
void __init cgroup_bpf_lifetime_notifier_init(void)
53
{
54
BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
55
&cgroup_bpf_lifetime_nb));
56
}
57
58
/* __always_inline is necessary to prevent indirect call through run_prog
59
* function pointer.
60
*/
61
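/* Run every program in the effective array for @atype.  Each program's
 * lowest return bit decides success: a 0 there turns run_ctx.retval into
 * -EPERM unless a program already stored an explicit -errno via
 * bpf_set_retval().  When @ret_flags is non-NULL, the remaining high bits
 * of every program's return value are OR'ed into it (e.g. BPF_RET_SET_CN
 * on the egress path).
 */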
static __always_inline int
62
bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
63
enum cgroup_bpf_attach_type atype,
64
const void *ctx, bpf_prog_run_fn run_prog,
65
int retval, u32 *ret_flags)
66
{
67
const struct bpf_prog_array_item *item;
68
const struct bpf_prog *prog;
69
const struct bpf_prog_array *array;
70
struct bpf_run_ctx *old_run_ctx;
71
struct bpf_cg_run_ctx run_ctx;
72
u32 func_ret;
73
74
run_ctx.retval = retval;
75
rcu_read_lock_dont_migrate();
76
array = rcu_dereference(cgrp->effective[atype]);
77
item = &array->items[0];
78
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
79
while ((prog = READ_ONCE(item->prog))) {
80
run_ctx.prog_item = item;
81
func_ret = run_prog(prog, ctx);
82
if (ret_flags) {
83
*(ret_flags) |= (func_ret >> 1);
84
func_ret &= 1;
85
}
86
if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
87
run_ctx.retval = -EPERM;
88
item++;
89
}
90
bpf_reset_run_ctx(old_run_ctx);
91
rcu_read_unlock_migrate();
92
return run_ctx.retval;
93
}
94
95
unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
96
const struct bpf_insn *insn)
97
{
98
const struct bpf_prog *shim_prog;
99
struct sock *sk;
100
struct cgroup *cgrp;
101
int ret = 0;
102
u64 *args;
103
104
args = (u64 *)ctx;
105
sk = (void *)(unsigned long)args[0];
106
/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
107
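/* Open-coded equivalent of the container_of() above: recover the
 * enclosing bpf_prog from its embedded insnsi[] member.  The same
 * pattern is used by the other two LSM shim runners below.
 */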
shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
108
109
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
110
if (likely(cgrp))
111
ret = bpf_prog_run_array_cg(&cgrp->bpf,
112
shim_prog->aux->cgroup_atype,
113
ctx, bpf_prog_run, 0, NULL);
114
return ret;
115
}
116
117
unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
118
const struct bpf_insn *insn)
119
{
120
const struct bpf_prog *shim_prog;
121
struct socket *sock;
122
struct cgroup *cgrp;
123
int ret = 0;
124
u64 *args;
125
126
args = (u64 *)ctx;
127
sock = (void *)(unsigned long)args[0];
128
/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
129
shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
130
131
cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
132
if (likely(cgrp))
133
ret = bpf_prog_run_array_cg(&cgrp->bpf,
134
shim_prog->aux->cgroup_atype,
135
ctx, bpf_prog_run, 0, NULL);
136
return ret;
137
}
138
139
unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
140
const struct bpf_insn *insn)
141
{
142
const struct bpf_prog *shim_prog;
143
struct cgroup *cgrp;
144
int ret = 0;
145
146
/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
147
shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
148
149
/* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
150
cgrp = task_dfl_cgroup(current);
151
if (likely(cgrp))
152
ret = bpf_prog_run_array_cg(&cgrp->bpf,
153
shim_prog->aux->cgroup_atype,
154
ctx, bpf_prog_run, 0, NULL);
155
return ret;
156
}
157
158
#ifdef CONFIG_BPF_LSM
159
struct cgroup_lsm_atype {
160
u32 attach_btf_id;
161
int refcnt;
162
};
163
164
static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
165
166
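/* Map an attach type (plus BTF hook id for BPF_LSM_CGROUP) to a slot in
 * the per-cgroup arrays.  Non-LSM types translate directly; LSM hooks
 * share the CGROUP_LSM_START..CGROUP_LSM_END slots, reusing a slot that
 * already holds the same attach_btf_id or claiming the first free one,
 * and -E2BIG is returned when all CGROUP_LSM_NUM slots are in use.
 */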
static enum cgroup_bpf_attach_type
167
bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
168
{
169
int i;
170
171
lockdep_assert_held(&cgroup_mutex);
172
173
if (attach_type != BPF_LSM_CGROUP)
174
return to_cgroup_bpf_attach_type(attach_type);
175
176
for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
177
if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
178
return CGROUP_LSM_START + i;
179
180
for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
181
if (cgroup_lsm_atype[i].attach_btf_id == 0)
182
return CGROUP_LSM_START + i;
183
184
return -E2BIG;
185
186
}
187
188
void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
189
{
190
int i = cgroup_atype - CGROUP_LSM_START;
191
192
lockdep_assert_held(&cgroup_mutex);
193
194
WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
195
cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
196
197
cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
198
cgroup_lsm_atype[i].refcnt++;
199
}
200
201
void bpf_cgroup_atype_put(int cgroup_atype)
202
{
203
int i = cgroup_atype - CGROUP_LSM_START;
204
205
cgroup_lock();
206
if (--cgroup_lsm_atype[i].refcnt <= 0)
207
cgroup_lsm_atype[i].attach_btf_id = 0;
208
WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
209
cgroup_unlock();
210
}
211
#else
212
static enum cgroup_bpf_attach_type
213
bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
214
{
215
if (attach_type != BPF_LSM_CGROUP)
216
return to_cgroup_bpf_attach_type(attach_type);
217
return -EOPNOTSUPP;
218
}
219
#endif /* CONFIG_BPF_LSM */
220
221
static void cgroup_bpf_offline(struct cgroup *cgrp)
222
{
223
cgroup_get(cgrp);
224
percpu_ref_kill(&cgrp->bpf.refcnt);
225
}
226
227
static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
228
{
229
enum bpf_cgroup_storage_type stype;
230
231
for_each_cgroup_storage_type(stype)
232
bpf_cgroup_storage_free(storages[stype]);
233
}
234
235
static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
236
struct bpf_cgroup_storage *new_storages[],
237
enum bpf_attach_type type,
238
struct bpf_prog *prog,
239
struct cgroup *cgrp)
240
{
241
enum bpf_cgroup_storage_type stype;
242
struct bpf_cgroup_storage_key key;
243
struct bpf_map *map;
244
245
key.cgroup_inode_id = cgroup_id(cgrp);
246
key.attach_type = type;
247
248
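/* For each storage type the program uses, reuse an existing storage for
 * this (cgroup, attach type) pair if one is already linked; otherwise
 * allocate a fresh one and remember it in new_storages so it can be
 * freed if attaching fails later.
 */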
for_each_cgroup_storage_type(stype) {
249
map = prog->aux->cgroup_storage[stype];
250
if (!map)
251
continue;
252
253
storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
254
if (storages[stype])
255
continue;
256
257
storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
258
if (IS_ERR(storages[stype])) {
259
bpf_cgroup_storages_free(new_storages);
260
return -ENOMEM;
261
}
262
263
new_storages[stype] = storages[stype];
264
}
265
266
return 0;
267
}
268
269
static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
270
struct bpf_cgroup_storage *src[])
271
{
272
enum bpf_cgroup_storage_type stype;
273
274
for_each_cgroup_storage_type(stype)
275
dst[stype] = src[stype];
276
}
277
278
static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
279
struct cgroup *cgrp,
280
enum bpf_attach_type attach_type)
281
{
282
enum bpf_cgroup_storage_type stype;
283
284
for_each_cgroup_storage_type(stype)
285
bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
286
}
287
288
/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
289
* It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
290
* doesn't free link memory, which will eventually be done by bpf_link's
291
* release() callback, when its last FD is closed.
292
*/
293
static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
294
{
295
cgroup_put(link->cgroup);
296
link->cgroup = NULL;
297
}
298
299
/**
300
* cgroup_bpf_release() - put references of all bpf programs and
301
* release all cgroup bpf data
302
* @work: work structure embedded into the cgroup to modify
303
*/
304
static void cgroup_bpf_release(struct work_struct *work)
305
{
306
struct cgroup *p, *cgrp = container_of(work, struct cgroup,
307
bpf.release_work);
308
struct bpf_prog_array *old_array;
309
struct list_head *storages = &cgrp->bpf.storages;
310
struct bpf_cgroup_storage *storage, *stmp;
311
312
unsigned int atype;
313
314
cgroup_lock();
315
316
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
317
struct hlist_head *progs = &cgrp->bpf.progs[atype];
318
struct bpf_prog_list *pl;
319
struct hlist_node *pltmp;
320
321
hlist_for_each_entry_safe(pl, pltmp, progs, node) {
322
hlist_del(&pl->node);
323
if (pl->prog) {
324
if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
325
bpf_trampoline_unlink_cgroup_shim(pl->prog);
326
bpf_prog_put(pl->prog);
327
}
328
if (pl->link) {
329
if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
330
bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
331
bpf_cgroup_link_auto_detach(pl->link);
332
}
333
kfree(pl);
334
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
335
}
336
old_array = rcu_dereference_protected(
337
cgrp->bpf.effective[atype],
338
lockdep_is_held(&cgroup_mutex));
339
bpf_prog_array_free(old_array);
340
}
341
342
list_for_each_entry_safe(storage, stmp, storages, list_cg) {
343
bpf_cgroup_storage_unlink(storage);
344
bpf_cgroup_storage_free(storage);
345
}
346
347
cgroup_unlock();
348
349
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
350
cgroup_bpf_put(p);
351
352
percpu_ref_exit(&cgrp->bpf.refcnt);
353
cgroup_put(cgrp);
354
}
355
356
/**
357
* cgroup_bpf_release_fn() - callback used to schedule releasing
358
* of bpf cgroup data
359
* @ref: percpu ref counter structure
360
*/
361
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
362
{
363
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
364
365
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
366
queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
367
}
368
369
/* Get underlying bpf_prog of bpf_prog_list entry, whether it's through
370
* link or direct prog.
371
*/
372
static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
373
{
374
if (pl->prog)
375
return pl->prog;
376
if (pl->link)
377
return pl->link->link.prog;
378
return NULL;
379
}
380
381
/* count number of elements in the list.
382
* it's slow but the list cannot be long
383
*/
384
static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
385
{
386
struct bpf_prog_list *pl;
387
u32 cnt = 0;
388
389
hlist_for_each_entry(pl, head, node) {
390
if (!prog_list_prog(pl))
391
continue;
392
if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
393
(*preorder_cnt)++;
394
cnt++;
395
}
396
return cnt;
397
}
398
399
/* if parent has non-overridable prog attached,
400
* disallow attaching new programs to the descendant cgroup.
401
* if parent has overridable or multi-prog, allow attaching
402
*/
403
static bool hierarchy_allows_attach(struct cgroup *cgrp,
404
enum cgroup_bpf_attach_type atype)
405
{
406
struct cgroup *p;
407
408
p = cgroup_parent(cgrp);
409
if (!p)
410
return true;
411
do {
412
u32 flags = p->bpf.flags[atype];
413
u32 cnt;
414
415
if (flags & BPF_F_ALLOW_MULTI)
416
return true;
417
cnt = prog_list_length(&p->bpf.progs[atype], NULL);
418
WARN_ON_ONCE(cnt > 1);
419
if (cnt == 1)
420
return !!(flags & BPF_F_ALLOW_OVERRIDE);
421
p = cgroup_parent(p);
422
} while (p);
423
return true;
424
}
425
426
/* compute a chain of effective programs for a given cgroup:
427
* start from the list of programs in this cgroup and add
428
* all parent programs.
429
* Note that parent's F_ALLOW_OVERRIDE-type program is yielding
430
* to programs in this cgroup
431
*/
432
static int compute_effective_progs(struct cgroup *cgrp,
433
enum cgroup_bpf_attach_type atype,
434
struct bpf_prog_array **array)
435
{
436
struct bpf_prog_array_item *item;
437
struct bpf_prog_array *progs;
438
struct bpf_prog_list *pl;
439
struct cgroup *p = cgrp;
440
int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
441
442
/* count number of effective programs by walking parents */
443
do {
444
if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
445
cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
446
p = cgroup_parent(p);
447
} while (p);
448
449
progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
450
if (!progs)
451
return -ENOMEM;
452
453
/* populate the array with effective progs */
454
cnt = 0;
455
p = cgrp;
456
fstart = preorder_cnt;
457
bstart = preorder_cnt - 1;
458
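/* BPF_F_PREORDER programs are filled backwards from index
 * preorder_cnt - 1, while the remaining programs are appended from
 * index preorder_cnt; the swap loop below then restores attachment
 * order within each cgroup level.
 */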
do {
459
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
460
continue;
461
462
init_bstart = bstart;
463
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
464
if (!prog_list_prog(pl))
465
continue;
466
467
if (pl->flags & BPF_F_PREORDER) {
468
item = &progs->items[bstart];
469
bstart--;
470
} else {
471
item = &progs->items[fstart];
472
fstart++;
473
}
474
item->prog = prog_list_prog(pl);
475
bpf_cgroup_storages_assign(item->cgroup_storage,
476
pl->storage);
477
cnt++;
478
}
479
480
/* reverse pre-ordering progs at this cgroup level */
481
for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
482
swap(progs->items[i], progs->items[j]);
483
484
} while ((p = cgroup_parent(p)));
485
486
*array = progs;
487
return 0;
488
}
489
490
static void activate_effective_progs(struct cgroup *cgrp,
491
enum cgroup_bpf_attach_type atype,
492
struct bpf_prog_array *old_array)
493
{
494
old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
495
lockdep_is_held(&cgroup_mutex));
496
/* free prog array after grace period, since __cgroup_bpf_run_*()
497
* might still be walking the array
498
*/
499
bpf_prog_array_free(old_array);
500
}
501
502
/**
503
* cgroup_bpf_inherit() - inherit effective programs from parent
504
* @cgrp: the cgroup to modify
505
*/
506
static int cgroup_bpf_inherit(struct cgroup *cgrp)
507
{
508
/* has to use a macro instead of const int, since the compiler thinks
509
* that array below is variable length
510
*/
511
#define NR ARRAY_SIZE(cgrp->bpf.effective)
512
struct bpf_prog_array *arrays[NR] = {};
513
struct cgroup *p;
514
int ret, i;
515
516
ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
517
GFP_KERNEL);
518
if (ret)
519
return ret;
520
521
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
522
cgroup_bpf_get(p);
523
524
for (i = 0; i < NR; i++)
525
INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
526
527
INIT_LIST_HEAD(&cgrp->bpf.storages);
528
529
for (i = 0; i < NR; i++)
530
if (compute_effective_progs(cgrp, i, &arrays[i]))
531
goto cleanup;
532
533
for (i = 0; i < NR; i++)
534
activate_effective_progs(cgrp, i, arrays[i]);
535
536
return 0;
537
cleanup:
538
for (i = 0; i < NR; i++)
539
bpf_prog_array_free(arrays[i]);
540
541
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
542
cgroup_bpf_put(p);
543
544
percpu_ref_exit(&cgrp->bpf.refcnt);
545
546
return -ENOMEM;
547
}
548
549
static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
550
unsigned long action, void *data)
551
{
552
struct cgroup *cgrp = data;
553
int ret = 0;
554
555
if (cgrp->root != &cgrp_dfl_root)
556
return NOTIFY_OK;
557
558
switch (action) {
559
case CGROUP_LIFETIME_ONLINE:
560
ret = cgroup_bpf_inherit(cgrp);
561
break;
562
case CGROUP_LIFETIME_OFFLINE:
563
cgroup_bpf_offline(cgrp);
564
break;
565
}
566
567
return notifier_from_errno(ret);
568
}
569
570
static int update_effective_progs(struct cgroup *cgrp,
571
enum cgroup_bpf_attach_type atype)
572
{
573
struct cgroup_subsys_state *css;
574
int err;
575
576
/* allocate and recompute effective prog arrays */
577
css_for_each_descendant_pre(css, &cgrp->self) {
578
struct cgroup *desc = container_of(css, struct cgroup, self);
579
580
if (percpu_ref_is_zero(&desc->bpf.refcnt))
581
continue;
582
583
err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
584
if (err)
585
goto cleanup;
586
}
587
588
/* all allocations were successful. Activate all prog arrays */
589
css_for_each_descendant_pre(css, &cgrp->self) {
590
struct cgroup *desc = container_of(css, struct cgroup, self);
591
592
if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
593
if (unlikely(desc->bpf.inactive)) {
594
bpf_prog_array_free(desc->bpf.inactive);
595
desc->bpf.inactive = NULL;
596
}
597
continue;
598
}
599
600
activate_effective_progs(desc, atype, desc->bpf.inactive);
601
desc->bpf.inactive = NULL;
602
}
603
604
return 0;
605
606
cleanup:
607
/* oom while computing effective. Free all computed effective arrays
608
* since they were not activated
609
*/
610
css_for_each_descendant_pre(css, &cgrp->self) {
611
struct cgroup *desc = container_of(css, struct cgroup, self);
612
613
bpf_prog_array_free(desc->bpf.inactive);
614
desc->bpf.inactive = NULL;
615
}
616
617
return err;
618
}
619
620
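/* Upper bound on directly attached programs per (cgroup, attach type)
 * list; __cgroup_bpf_attach() rejects further attachments with -E2BIG.
 */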
#define BPF_CGROUP_MAX_PROGS 64
621
622
static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
623
struct bpf_prog *prog,
624
struct bpf_cgroup_link *link,
625
struct bpf_prog *replace_prog,
626
bool allow_multi)
627
{
628
struct bpf_prog_list *pl;
629
630
/* single-attach case */
631
if (!allow_multi) {
632
if (hlist_empty(progs))
633
return NULL;
634
return hlist_entry(progs->first, typeof(*pl), node);
635
}
636
637
hlist_for_each_entry(pl, progs, node) {
638
if (prog && pl->prog == prog && prog != replace_prog)
639
/* disallow attaching the same prog twice */
640
return ERR_PTR(-EINVAL);
641
if (link && pl->link == link)
642
/* disallow attaching the same link twice */
643
return ERR_PTR(-EINVAL);
644
}
645
646
/* direct prog multi-attach w/ replacement case */
647
if (replace_prog) {
648
hlist_for_each_entry(pl, progs, node) {
649
if (pl->prog == replace_prog)
650
/* a match found */
651
return pl;
652
}
653
/* prog to replace not found for cgroup */
654
return ERR_PTR(-ENOENT);
655
}
656
657
return NULL;
658
}
659
660
static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
661
{
662
struct bpf_link *link = ERR_PTR(-EINVAL);
663
664
if (flags & BPF_F_ID)
665
link = bpf_link_by_id(id_or_fd);
666
else if (id_or_fd)
667
link = bpf_link_get_from_fd(id_or_fd);
668
return link;
669
}
670
671
static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
672
{
673
struct bpf_prog *prog = ERR_PTR(-EINVAL);
674
675
if (flags & BPF_F_ID)
676
prog = bpf_prog_by_id(id_or_fd);
677
else if (id_or_fd)
678
prog = bpf_prog_get(id_or_fd);
679
return prog;
680
}
681
682
static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
683
struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
684
{
685
bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
686
struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
687
bool preorder = flags & BPF_F_PREORDER;
688
struct bpf_link *anchor_link = NULL;
689
struct bpf_prog *anchor_prog = NULL;
690
bool is_before, is_after;
691
692
is_before = flags & BPF_F_BEFORE;
693
is_after = flags & BPF_F_AFTER;
694
if (is_link || is_id || id_or_fd) {
695
/* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
696
if (is_before == is_after)
697
return ERR_PTR(-EINVAL);
698
if ((is_link && !link) || (!is_link && !prog))
699
return ERR_PTR(-EINVAL);
700
} else if (!hlist_empty(progs)) {
701
/* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
702
if (is_before && is_after)
703
return ERR_PTR(-EINVAL);
704
}
705
706
if (is_link) {
707
anchor_link = bpf_get_anchor_link(flags, id_or_fd);
708
if (IS_ERR(anchor_link))
709
return ERR_CAST(anchor_link);
710
} else if (is_id || id_or_fd) {
711
anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
712
if (IS_ERR(anchor_prog))
713
return ERR_CAST(anchor_prog);
714
}
715
716
if (!anchor_prog && !anchor_link) {
717
/* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
718
* doesn't matter since either prepend or append to a combined
719
* list of progs will end up with correct result.
720
*/
721
hlist_for_each_entry(pltmp, progs, node) {
722
if (is_before)
723
return pltmp;
724
if (pltmp->node.next)
725
continue;
726
return pltmp;
727
}
728
return NULL;
729
}
730
731
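/* Locate the anchor entry in this cgroup's list; its BPF_F_PREORDER
 * flag must match the requested ordering, otherwise the lookup fails
 * with -EINVAL.
 */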
hlist_for_each_entry(pltmp, progs, node) {
732
if ((anchor_prog && anchor_prog == pltmp->prog) ||
733
(anchor_link && anchor_link == &pltmp->link->link)) {
734
if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
735
goto out;
736
pl = pltmp;
737
goto out;
738
}
739
}
740
741
pl = ERR_PTR(-ENOENT);
742
out:
743
if (anchor_link)
744
bpf_link_put(anchor_link);
745
else
746
bpf_prog_put(anchor_prog);
747
return pl;
748
}
749
750
static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
751
struct bpf_prog *prog, struct bpf_cgroup_link *link,
752
u32 flags, u32 id_or_fd)
753
{
754
struct bpf_prog_list *pltmp;
755
756
pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
757
if (IS_ERR(pltmp))
758
return PTR_ERR(pltmp);
759
760
if (!pltmp)
761
hlist_add_head(&pl->node, progs);
762
else if (flags & BPF_F_BEFORE)
763
hlist_add_before(&pl->node, &pltmp->node);
764
else
765
hlist_add_behind(&pl->node, &pltmp->node);
766
767
return 0;
768
}
769
770
/**
771
* __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
772
* propagate the change to descendants
773
* @cgrp: The cgroup which descendants to traverse
774
* @prog: A program to attach
775
* @link: A link to attach
776
* @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
777
* @type: Type of attach operation
778
* @flags: Option flags
779
* @id_or_fd: Relative prog id or fd
780
* @revision: bpf_prog_list revision
781
*
782
* Exactly one of @prog or @link can be non-null.
783
* Must be called with cgroup_mutex held.
784
*/
785
static int __cgroup_bpf_attach(struct cgroup *cgrp,
786
struct bpf_prog *prog, struct bpf_prog *replace_prog,
787
struct bpf_cgroup_link *link,
788
enum bpf_attach_type type, u32 flags, u32 id_or_fd,
789
u64 revision)
790
{
791
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
792
struct bpf_prog *old_prog = NULL;
793
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
794
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
795
struct bpf_prog *new_prog = prog ? : link->link.prog;
796
enum cgroup_bpf_attach_type atype;
797
struct bpf_prog_list *pl;
798
struct hlist_head *progs;
799
int err;
800
801
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
802
((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
803
/* invalid combination */
804
return -EINVAL;
805
if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
806
/* only either replace or insertion with before/after */
807
return -EINVAL;
808
if (link && (prog || replace_prog))
809
/* only either link or prog/replace_prog can be specified */
810
return -EINVAL;
811
if (!!replace_prog != !!(flags & BPF_F_REPLACE))
812
/* replace_prog implies BPF_F_REPLACE, and vice versa */
813
return -EINVAL;
814
815
atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
816
if (atype < 0)
817
return -EINVAL;
818
if (revision && revision != cgrp->bpf.revisions[atype])
819
return -ESTALE;
820
821
progs = &cgrp->bpf.progs[atype];
822
823
if (!hierarchy_allows_attach(cgrp, atype))
824
return -EPERM;
825
826
if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
827
/* Disallow attaching non-overridable on top
828
* of existing overridable in this cgroup.
829
* Disallow attaching multi-prog if overridable or none
830
*/
831
return -EPERM;
832
833
if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
834
return -E2BIG;
835
836
pl = find_attach_entry(progs, prog, link, replace_prog,
837
flags & BPF_F_ALLOW_MULTI);
838
if (IS_ERR(pl))
839
return PTR_ERR(pl);
840
841
if (bpf_cgroup_storages_alloc(storage, new_storage, type,
842
prog ? : link->link.prog, cgrp))
843
return -ENOMEM;
844
845
if (pl) {
846
old_prog = pl->prog;
847
} else {
848
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
849
if (!pl) {
850
bpf_cgroup_storages_free(new_storage);
851
return -ENOMEM;
852
}
853
854
err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
855
if (err) {
856
kfree(pl);
857
bpf_cgroup_storages_free(new_storage);
858
return err;
859
}
860
}
861
862
pl->prog = prog;
863
pl->link = link;
864
pl->flags = flags;
865
bpf_cgroup_storages_assign(pl->storage, storage);
866
cgrp->bpf.flags[atype] = saved_flags;
867
868
if (type == BPF_LSM_CGROUP) {
869
err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type);
870
if (err)
871
goto cleanup;
872
}
873
874
err = update_effective_progs(cgrp, atype);
875
if (err)
876
goto cleanup_trampoline;
877
878
cgrp->bpf.revisions[atype] += 1;
879
if (old_prog) {
880
if (type == BPF_LSM_CGROUP)
881
bpf_trampoline_unlink_cgroup_shim(old_prog);
882
bpf_prog_put(old_prog);
883
} else {
884
static_branch_inc(&cgroup_bpf_enabled_key[atype]);
885
}
886
bpf_cgroup_storages_link(new_storage, cgrp, type);
887
return 0;
888
889
cleanup_trampoline:
890
if (type == BPF_LSM_CGROUP)
891
bpf_trampoline_unlink_cgroup_shim(new_prog);
892
893
cleanup:
894
if (old_prog) {
895
pl->prog = old_prog;
896
pl->link = NULL;
897
}
898
bpf_cgroup_storages_free(new_storage);
899
if (!old_prog) {
900
hlist_del(&pl->node);
901
kfree(pl);
902
}
903
return err;
904
}
905
906
static int cgroup_bpf_attach(struct cgroup *cgrp,
907
struct bpf_prog *prog, struct bpf_prog *replace_prog,
908
struct bpf_cgroup_link *link,
909
enum bpf_attach_type type,
910
u32 flags, u32 id_or_fd, u64 revision)
911
{
912
int ret;
913
914
cgroup_lock();
915
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
916
id_or_fd, revision);
917
cgroup_unlock();
918
return ret;
919
}
920
921
/* Swap updated BPF program for given link in effective program arrays across
922
* all descendant cgroups. This function is guaranteed to succeed.
923
*/
924
static void replace_effective_prog(struct cgroup *cgrp,
925
enum cgroup_bpf_attach_type atype,
926
struct bpf_cgroup_link *link)
927
{
928
struct bpf_prog_array_item *item;
929
struct cgroup_subsys_state *css;
930
struct bpf_prog_array *progs;
931
struct bpf_prog_list *pl;
932
struct hlist_head *head;
933
struct cgroup *cg;
934
int pos;
935
936
css_for_each_descendant_pre(css, &cgrp->self) {
937
struct cgroup *desc = container_of(css, struct cgroup, self);
938
939
if (percpu_ref_is_zero(&desc->bpf.refcnt))
940
continue;
941
942
/* find position of link in effective progs array */
943
for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
944
if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
945
continue;
946
947
head = &cg->bpf.progs[atype];
948
hlist_for_each_entry(pl, head, node) {
949
if (!prog_list_prog(pl))
950
continue;
951
if (pl->link == link)
952
goto found;
953
pos++;
954
}
955
}
956
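/* The link is attached somewhere in this subtree, so the walk above
 * must find it; pos now indexes its slot in the effective array.
 */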
found:
957
BUG_ON(!cg);
958
progs = rcu_dereference_protected(
959
desc->bpf.effective[atype],
960
lockdep_is_held(&cgroup_mutex));
961
item = &progs->items[pos];
962
WRITE_ONCE(item->prog, link->link.prog);
963
}
964
}
965
966
/**
967
* __cgroup_bpf_replace() - Replace link's program and propagate the change
968
* to descendants
969
* @cgrp: The cgroup which descendants to traverse
970
* @link: A link for which to replace BPF program
971
* @new_prog: &struct bpf_prog for the target BPF program with its refcnt
972
* incremented
973
*
974
* Must be called with cgroup_mutex held.
975
*/
976
static int __cgroup_bpf_replace(struct cgroup *cgrp,
977
struct bpf_cgroup_link *link,
978
struct bpf_prog *new_prog)
979
{
980
enum cgroup_bpf_attach_type atype;
981
struct bpf_prog *old_prog;
982
struct bpf_prog_list *pl;
983
struct hlist_head *progs;
984
bool found = false;
985
986
atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id);
987
if (atype < 0)
988
return -EINVAL;
989
990
progs = &cgrp->bpf.progs[atype];
991
992
if (link->link.prog->type != new_prog->type)
993
return -EINVAL;
994
995
hlist_for_each_entry(pl, progs, node) {
996
if (pl->link == link) {
997
found = true;
998
break;
999
}
1000
}
1001
if (!found)
1002
return -ENOENT;
1003
1004
cgrp->bpf.revisions[atype] += 1;
1005
old_prog = xchg(&link->link.prog, new_prog);
1006
replace_effective_prog(cgrp, atype, link);
1007
bpf_prog_put(old_prog);
1008
return 0;
1009
}
1010
1011
static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
1012
struct bpf_prog *old_prog)
1013
{
1014
struct bpf_cgroup_link *cg_link;
1015
int ret;
1016
1017
cg_link = container_of(link, struct bpf_cgroup_link, link);
1018
1019
cgroup_lock();
1020
/* link might have been auto-released by dying cgroup, so fail */
1021
if (!cg_link->cgroup) {
1022
ret = -ENOLINK;
1023
goto out_unlock;
1024
}
1025
if (old_prog && link->prog != old_prog) {
1026
ret = -EPERM;
1027
goto out_unlock;
1028
}
1029
ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
1030
out_unlock:
1031
cgroup_unlock();
1032
return ret;
1033
}
1034
1035
static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
1036
struct bpf_prog *prog,
1037
struct bpf_cgroup_link *link,
1038
bool allow_multi)
1039
{
1040
struct bpf_prog_list *pl;
1041
1042
if (!allow_multi) {
1043
if (hlist_empty(progs))
1044
/* report error when trying to detach and nothing is attached */
1045
return ERR_PTR(-ENOENT);
1046
1047
/* to maintain backward compatibility NONE and OVERRIDE cgroups
1048
* allow detaching with invalid FD (prog==NULL) in legacy mode
1049
*/
1050
return hlist_entry(progs->first, typeof(*pl), node);
1051
}
1052
1053
if (!prog && !link)
1054
/* to detach MULTI prog the user has to specify valid FD
1055
* of the program or link to be detached
1056
*/
1057
return ERR_PTR(-EINVAL);
1058
1059
/* find the prog or link and detach it */
1060
hlist_for_each_entry(pl, progs, node) {
1061
if (pl->prog == prog && pl->link == link)
1062
return pl;
1063
}
1064
return ERR_PTR(-ENOENT);
1065
}
1066
1067
/**
1068
* purge_effective_progs() - After compute_effective_progs fails to alloc new
1069
* cgrp->bpf.inactive table we can recover by
1070
* recomputing the array in place.
1071
*
1072
* @cgrp: The cgroup which descendants to traverse
1073
* @prog: A program to detach or NULL
1074
* @link: A link to detach or NULL
1075
* @atype: Type of detach operation
1076
*/
1077
static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
1078
struct bpf_cgroup_link *link,
1079
enum cgroup_bpf_attach_type atype)
1080
{
1081
struct cgroup_subsys_state *css;
1082
struct bpf_prog_array *progs;
1083
struct bpf_prog_list *pl;
1084
struct hlist_head *head;
1085
struct cgroup *cg;
1086
int pos;
1087
1088
/* recompute effective prog array in place */
1089
css_for_each_descendant_pre(css, &cgrp->self) {
1090
struct cgroup *desc = container_of(css, struct cgroup, self);
1091
1092
if (percpu_ref_is_zero(&desc->bpf.refcnt))
1093
continue;
1094
1095
/* find position of link or prog in effective progs array */
1096
for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
1097
if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
1098
continue;
1099
1100
head = &cg->bpf.progs[atype];
1101
hlist_for_each_entry(pl, head, node) {
1102
if (!prog_list_prog(pl))
1103
continue;
1104
if (pl->prog == prog && pl->link == link)
1105
goto found;
1106
pos++;
1107
}
1108
}
1109
1110
/* no link or prog match, skip the cgroup of this layer */
1111
continue;
1112
found:
1113
progs = rcu_dereference_protected(
1114
desc->bpf.effective[atype],
1115
lockdep_is_held(&cgroup_mutex));
1116
1117
/* Remove the program from the array */
1118
WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
1119
"Failed to purge a prog from array at index %d", pos);
1120
}
1121
}
1122
1123
/**
1124
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
1125
* propagate the change to descendants
1126
* @cgrp: The cgroup which descendants to traverse
1127
* @prog: A program to detach or NULL
1128
* @link: A link to detach or NULL
1129
* @type: Type of detach operation
1130
* @revision: bpf_prog_list revision
1131
*
1132
* At most one of @prog or @link can be non-NULL.
1133
* Must be called with cgroup_mutex held.
1134
*/
1135
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1136
struct bpf_cgroup_link *link, enum bpf_attach_type type,
1137
u64 revision)
1138
{
1139
enum cgroup_bpf_attach_type atype;
1140
struct bpf_prog *old_prog;
1141
struct bpf_prog_list *pl;
1142
struct hlist_head *progs;
1143
u32 attach_btf_id = 0;
1144
u32 flags;
1145
1146
if (prog)
1147
attach_btf_id = prog->aux->attach_btf_id;
1148
if (link)
1149
attach_btf_id = link->link.prog->aux->attach_btf_id;
1150
1151
atype = bpf_cgroup_atype_find(type, attach_btf_id);
1152
if (atype < 0)
1153
return -EINVAL;
1154
1155
if (revision && revision != cgrp->bpf.revisions[atype])
1156
return -ESTALE;
1157
1158
progs = &cgrp->bpf.progs[atype];
1159
flags = cgrp->bpf.flags[atype];
1160
1161
if (prog && link)
1162
/* only one of prog or link can be specified */
1163
return -EINVAL;
1164
1165
pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
1166
if (IS_ERR(pl))
1167
return PTR_ERR(pl);
1168
1169
/* mark it deleted, so it's ignored while recomputing effective */
1170
old_prog = pl->prog;
1171
pl->prog = NULL;
1172
pl->link = NULL;
1173
1174
if (update_effective_progs(cgrp, atype)) {
1175
/* if update of the effective array failed, replace the prog with a dummy prog */
1176
pl->prog = old_prog;
1177
pl->link = link;
1178
purge_effective_progs(cgrp, old_prog, link, atype);
1179
}
1180
1181
/* now can actually delete it from this cgroup list */
1182
hlist_del(&pl->node);
1183
cgrp->bpf.revisions[atype] += 1;
1184
1185
kfree(pl);
1186
if (hlist_empty(progs))
1187
/* last program was detached, reset flags to zero */
1188
cgrp->bpf.flags[atype] = 0;
1189
if (old_prog) {
1190
if (type == BPF_LSM_CGROUP)
1191
bpf_trampoline_unlink_cgroup_shim(old_prog);
1192
bpf_prog_put(old_prog);
1193
}
1194
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
1195
return 0;
1196
}
1197
1198
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1199
enum bpf_attach_type type, u64 revision)
1200
{
1201
int ret;
1202
1203
cgroup_lock();
1204
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
1205
cgroup_unlock();
1206
return ret;
1207
}
1208
1209
/* Must be called with cgroup_mutex held to avoid races. */
1210
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1211
union bpf_attr __user *uattr)
1212
{
1213
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
1214
bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
1215
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
1216
enum bpf_attach_type type = attr->query.attach_type;
1217
enum cgroup_bpf_attach_type from_atype, to_atype;
1218
enum cgroup_bpf_attach_type atype;
1219
struct bpf_prog_array *effective;
1220
int cnt, ret = 0, i;
1221
int total_cnt = 0;
1222
u64 revision = 0;
1223
u32 flags;
1224
1225
if (effective_query && prog_attach_flags)
1226
return -EINVAL;
1227
1228
if (type == BPF_LSM_CGROUP) {
1229
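/* A non-effective BPF_LSM_CGROUP query that asks for prog_ids must also
 * supply a prog_attach_flags array, since a single attach_flags value
 * cannot describe the multiple LSM attach points covered here.
 */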
if (!effective_query && attr->query.prog_cnt &&
1230
prog_ids && !prog_attach_flags)
1231
return -EINVAL;
1232
1233
from_atype = CGROUP_LSM_START;
1234
to_atype = CGROUP_LSM_END;
1235
flags = 0;
1236
} else {
1237
from_atype = to_cgroup_bpf_attach_type(type);
1238
if (from_atype < 0)
1239
return -EINVAL;
1240
to_atype = from_atype;
1241
flags = cgrp->bpf.flags[from_atype];
1242
}
1243
1244
for (atype = from_atype; atype <= to_atype; atype++) {
1245
if (effective_query) {
1246
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1247
lockdep_is_held(&cgroup_mutex));
1248
total_cnt += bpf_prog_array_length(effective);
1249
} else {
1250
total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
1251
}
1252
}
1253
1254
/* always output uattr->query.attach_flags as 0 during effective query */
1255
flags = effective_query ? 0 : flags;
1256
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
1257
return -EFAULT;
1258
if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
1259
return -EFAULT;
1260
if (!effective_query && from_atype == to_atype)
1261
revision = cgrp->bpf.revisions[from_atype];
1262
if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
1263
return -EFAULT;
1264
if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
1265
/* return early if user requested only program count + flags */
1266
return 0;
1267
1268
if (attr->query.prog_cnt < total_cnt) {
1269
total_cnt = attr->query.prog_cnt;
1270
ret = -ENOSPC;
1271
}
1272
1273
for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
1274
if (effective_query) {
1275
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1276
lockdep_is_held(&cgroup_mutex));
1277
cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
1278
ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
1279
} else {
1280
struct hlist_head *progs;
1281
struct bpf_prog_list *pl;
1282
struct bpf_prog *prog;
1283
u32 id;
1284
1285
progs = &cgrp->bpf.progs[atype];
1286
cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
1287
i = 0;
1288
hlist_for_each_entry(pl, progs, node) {
1289
prog = prog_list_prog(pl);
1290
id = prog->aux->id;
1291
if (copy_to_user(prog_ids + i, &id, sizeof(id)))
1292
return -EFAULT;
1293
if (++i == cnt)
1294
break;
1295
}
1296
1297
if (prog_attach_flags) {
1298
flags = cgrp->bpf.flags[atype];
1299
1300
for (i = 0; i < cnt; i++)
1301
if (copy_to_user(prog_attach_flags + i,
1302
&flags, sizeof(flags)))
1303
return -EFAULT;
1304
prog_attach_flags += cnt;
1305
}
1306
}
1307
1308
prog_ids += cnt;
1309
total_cnt -= cnt;
1310
}
1311
return ret;
1312
}
1313
1314
static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1315
union bpf_attr __user *uattr)
1316
{
1317
int ret;
1318
1319
cgroup_lock();
1320
ret = __cgroup_bpf_query(cgrp, attr, uattr);
1321
cgroup_unlock();
1322
return ret;
1323
}
1324
1325
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
1326
enum bpf_prog_type ptype, struct bpf_prog *prog)
1327
{
1328
struct bpf_prog *replace_prog = NULL;
1329
struct cgroup *cgrp;
1330
int ret;
1331
1332
cgrp = cgroup_get_from_fd(attr->target_fd);
1333
if (IS_ERR(cgrp))
1334
return PTR_ERR(cgrp);
1335
1336
if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
1337
(attr->attach_flags & BPF_F_REPLACE)) {
1338
replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
1339
if (IS_ERR(replace_prog)) {
1340
cgroup_put(cgrp);
1341
return PTR_ERR(replace_prog);
1342
}
1343
}
1344
1345
ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1346
attr->attach_type, attr->attach_flags,
1347
attr->relative_fd, attr->expected_revision);
1348
1349
if (replace_prog)
1350
bpf_prog_put(replace_prog);
1351
cgroup_put(cgrp);
1352
return ret;
1353
}
1354
1355
int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
1356
{
1357
struct bpf_prog *prog;
1358
struct cgroup *cgrp;
1359
int ret;
1360
1361
cgrp = cgroup_get_from_fd(attr->target_fd);
1362
if (IS_ERR(cgrp))
1363
return PTR_ERR(cgrp);
1364
1365
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
1366
if (IS_ERR(prog))
1367
prog = NULL;
1368
1369
ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
1370
if (prog)
1371
bpf_prog_put(prog);
1372
1373
cgroup_put(cgrp);
1374
return ret;
1375
}
1376
1377
static void bpf_cgroup_link_release(struct bpf_link *link)
1378
{
1379
struct bpf_cgroup_link *cg_link =
1380
container_of(link, struct bpf_cgroup_link, link);
1381
struct cgroup *cg;
1382
1383
/* link might have been auto-detached by dying cgroup already,
1384
* in that case our work is done here
1385
*/
1386
if (!cg_link->cgroup)
1387
return;
1388
1389
cgroup_lock();
1390
1391
/* re-check cgroup under lock again */
1392
if (!cg_link->cgroup) {
1393
cgroup_unlock();
1394
return;
1395
}
1396
1397
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1398
link->attach_type, 0));
1399
if (link->attach_type == BPF_LSM_CGROUP)
1400
bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
1401
1402
cg = cg_link->cgroup;
1403
cg_link->cgroup = NULL;
1404
1405
cgroup_unlock();
1406
1407
cgroup_put(cg);
1408
}
1409
1410
static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1411
{
1412
struct bpf_cgroup_link *cg_link =
1413
container_of(link, struct bpf_cgroup_link, link);
1414
1415
kfree(cg_link);
1416
}
1417
1418
static int bpf_cgroup_link_detach(struct bpf_link *link)
1419
{
1420
bpf_cgroup_link_release(link);
1421
1422
return 0;
1423
}
1424
1425
static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1426
struct seq_file *seq)
1427
{
1428
struct bpf_cgroup_link *cg_link =
1429
container_of(link, struct bpf_cgroup_link, link);
1430
u64 cg_id = 0;
1431
1432
cgroup_lock();
1433
if (cg_link->cgroup)
1434
cg_id = cgroup_id(cg_link->cgroup);
1435
cgroup_unlock();
1436
1437
seq_printf(seq,
1438
"cgroup_id:\t%llu\n"
1439
"attach_type:\t%d\n",
1440
cg_id,
1441
link->attach_type);
1442
}
1443
1444
static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1445
struct bpf_link_info *info)
1446
{
1447
struct bpf_cgroup_link *cg_link =
1448
container_of(link, struct bpf_cgroup_link, link);
1449
u64 cg_id = 0;
1450
1451
cgroup_lock();
1452
if (cg_link->cgroup)
1453
cg_id = cgroup_id(cg_link->cgroup);
1454
cgroup_unlock();
1455
1456
info->cgroup.cgroup_id = cg_id;
1457
info->cgroup.attach_type = link->attach_type;
1458
return 0;
1459
}
1460
1461
static const struct bpf_link_ops bpf_cgroup_link_lops = {
1462
.release = bpf_cgroup_link_release,
1463
.dealloc = bpf_cgroup_link_dealloc,
1464
.detach = bpf_cgroup_link_detach,
1465
.update_prog = cgroup_bpf_replace,
1466
.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1467
.fill_link_info = bpf_cgroup_link_fill_link_info,
1468
};
1469
1470
#define BPF_F_LINK_ATTACH_MASK \
1471
(BPF_F_ID | \
1472
BPF_F_BEFORE | \
1473
BPF_F_AFTER | \
1474
BPF_F_PREORDER | \
1475
BPF_F_LINK)
1476
1477
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1478
{
1479
struct bpf_link_primer link_primer;
1480
struct bpf_cgroup_link *link;
1481
struct cgroup *cgrp;
1482
int err;
1483
1484
if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
1485
return -EINVAL;
1486
1487
cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1488
if (IS_ERR(cgrp))
1489
return PTR_ERR(cgrp);
1490
1491
link = kzalloc(sizeof(*link), GFP_USER);
1492
if (!link) {
1493
err = -ENOMEM;
1494
goto out_put_cgroup;
1495
}
1496
bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1497
prog, attr->link_create.attach_type);
1498
link->cgroup = cgrp;
1499
1500
err = bpf_link_prime(&link->link, &link_primer);
1501
if (err) {
1502
kfree(link);
1503
goto out_put_cgroup;
1504
}
1505
1506
err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1507
link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
1508
attr->link_create.cgroup.relative_fd,
1509
attr->link_create.cgroup.expected_revision);
1510
if (err) {
1511
bpf_link_cleanup(&link_primer);
1512
goto out_put_cgroup;
1513
}
1514
1515
return bpf_link_settle(&link_primer);
1516
1517
out_put_cgroup:
1518
cgroup_put(cgrp);
1519
return err;
1520
}
1521
1522
int cgroup_bpf_prog_query(const union bpf_attr *attr,
1523
union bpf_attr __user *uattr)
1524
{
1525
struct cgroup *cgrp;
1526
int ret;
1527
1528
cgrp = cgroup_get_from_fd(attr->query.target_fd);
1529
if (IS_ERR(cgrp))
1530
return PTR_ERR(cgrp);
1531
1532
ret = cgroup_bpf_query(cgrp, attr, uattr);
1533
1534
cgroup_put(cgrp);
1535
return ret;
1536
}
1537
1538
/**
1539
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1540
* @sk: The socket sending or receiving traffic
1541
* @skb: The skb that is being sent or received
1542
* @atype: The type of program to be executed
1543
*
1544
* If no socket is passed, or the socket is not of type INET or INET6,
1545
* this function does nothing and returns 0.
1546
*
1547
* The program type passed in via @type must be suitable for network
1548
* filtering. No further check is performed to assert that.
1549
*
1550
* For egress packets, this function can return:
1551
* NET_XMIT_SUCCESS (0) - continue with packet output
1552
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
1553
* NET_XMIT_CN (2) - continue with packet output and notify TCP
1554
* to call cwr
1555
* -err - drop packet
1556
*
1557
* For ingress packets, this function will return -EPERM if any
1558
* attached program was found and if it returned != 1 during execution.
1559
* Otherwise 0 is returned.
1560
*/
1561
int __cgroup_bpf_run_filter_skb(struct sock *sk,
1562
struct sk_buff *skb,
1563
enum cgroup_bpf_attach_type atype)
1564
{
1565
unsigned int offset = -skb_network_offset(skb);
1566
struct sock *save_sk;
1567
void *saved_data_end;
1568
struct cgroup *cgrp;
1569
int ret;
1570
1571
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1572
return 0;
1573
1574
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1575
save_sk = skb->sk;
1576
skb->sk = sk;
1577
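/* Temporarily point skb->data back at the network header so the
 * program sees the full IP packet; it is pulled back before returning.
 */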
__skb_push(skb, offset);
1578
1579
/* compute pointers for the bpf prog */
1580
bpf_compute_and_save_data_end(skb, &saved_data_end);
1581
1582
if (atype == CGROUP_INET_EGRESS) {
1583
u32 flags = 0;
1584
bool cn;
1585
1586
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1587
__bpf_prog_run_save_cb, 0, &flags);
1588
1589
/* Return values of CGROUP EGRESS BPF programs are:
1590
* 0: drop packet
1591
* 1: keep packet
1592
* 2: drop packet and cn
1593
* 3: keep packet and cn
1594
*
1595
* The returned value is then converted to one of the NET_XMIT
1596
* or an error code that is then interpreted as drop packet
1597
* (and no cn):
1598
* 0: NET_XMIT_SUCCESS skb should be transmitted
1599
* 1: NET_XMIT_DROP skb should be dropped and cn
1600
* 2: NET_XMIT_CN skb should be transmitted and cn
1601
* 3: -err skb should be dropped
1602
*/
1603
1604
cn = flags & BPF_RET_SET_CN;
1605
if (ret && !IS_ERR_VALUE((long)ret))
1606
ret = -EFAULT;
1607
if (!ret)
1608
ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1609
else
1610
ret = (cn ? NET_XMIT_DROP : ret);
1611
} else {
1612
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1613
skb, __bpf_prog_run_save_cb, 0,
1614
NULL);
1615
if (ret && !IS_ERR_VALUE((long)ret))
1616
ret = -EFAULT;
1617
}
1618
bpf_restore_data_end(skb, saved_data_end);
1619
__skb_pull(skb, offset);
1620
skb->sk = save_sk;
1621
1622
return ret;
1623
}
1624
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1625
1626
/**
1627
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
1628
* @sk: sock structure to manipulate
1629
* @atype: The type of program to be executed
1630
*
1631
* The socket passed is expected to be of type INET or INET6.
1632
*
1633
* The program type passed in via @type must be suitable for sock
1634
* filtering. No further check is performed to assert that.
1635
*
1636
* This function will return %-EPERM if an attached program was found
1637
* and if it returned != 1 during execution. In all other cases, 0 is returned.
1638
*/
1639
int __cgroup_bpf_run_filter_sk(struct sock *sk,
1640
enum cgroup_bpf_attach_type atype)
1641
{
1642
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1643
1644
return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1645
NULL);
1646
}
1647
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1648
1649
/**
1650
* __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1651
* provided by user sockaddr
1652
* @sk: sock struct that will use sockaddr
1653
* @uaddr: sockaddr struct provided by user
1654
* @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
1655
* read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
1656
* uaddr.
1657
* @atype: The type of program to be executed
1658
* @t_ctx: Pointer to attach type specific context
1659
* @flags: Pointer to u32 which contains higher bits of BPF program
1660
* return value (OR'ed together).
1661
*
1662
* socket is expected to be of type INET, INET6 or UNIX.
1663
*
1664
* This function will return %-EPERM if an attached program is found and
1665
* returned value != 1 during execution. In all other cases, 0 is returned.
1666
*/
1667
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1668
struct sockaddr *uaddr,
1669
int *uaddrlen,
1670
enum cgroup_bpf_attach_type atype,
1671
void *t_ctx,
1672
u32 *flags)
1673
{
1674
struct bpf_sock_addr_kern ctx = {
1675
.sk = sk,
1676
.uaddr = uaddr,
1677
.t_ctx = t_ctx,
1678
};
1679
struct sockaddr_storage unspec;
1680
struct cgroup *cgrp;
1681
int ret;
1682
1683
/* Check socket family since not all sockets represent network
1684
* endpoint (e.g. AF_UNIX).
1685
*/
1686
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
1687
sk->sk_family != AF_UNIX)
1688
return 0;
1689
1690
if (!ctx.uaddr) {
1691
memset(&unspec, 0, sizeof(unspec));
1692
ctx.uaddr = (struct sockaddr *)&unspec;
1693
ctx.uaddrlen = 0;
1694
} else {
1695
ctx.uaddrlen = *uaddrlen;
1696
}
1697
1698
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1699
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1700
0, flags);
1701
1702
if (!ret && uaddr)
1703
*uaddrlen = ctx.uaddrlen;
1704
1705
return ret;
1706
}
1707
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1708
1709
/**
1710
* __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1711
* @sk: socket to get cgroup from
1712
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1713
* sk with connection information (IP addresses, etc.) May not contain
1714
* cgroup info if it is a req sock.
1715
* @atype: The type of program to be executed
1716
*
1717
* The socket passed is expected to be of type INET or INET6.
1718
*
1719
* The program type passed in via @type must be suitable for sock_ops
1720
* filtering. No further check is performed to assert that.
1721
*
1722
* This function will return %-EPERM if an attached program was found
1723
* and if it returned != 1 during execution. In all other cases, 0 is returned.
1724
*/
1725
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1726
struct bpf_sock_ops_kern *sock_ops,
1727
enum cgroup_bpf_attach_type atype)
1728
{
1729
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1730
1731
return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1732
0, NULL);
1733
}
1734
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1735
1736
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1737
short access, enum cgroup_bpf_attach_type atype)
1738
{
1739
struct cgroup *cgrp;
1740
struct bpf_cgroup_dev_ctx ctx = {
1741
.access_type = (access << 16) | dev_type,
1742
.major = major,
1743
.minor = minor,
1744
};
1745
int ret;
1746
1747
rcu_read_lock();
1748
cgrp = task_dfl_cgroup(current);
1749
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1750
NULL);
1751
rcu_read_unlock();
1752
1753
return ret;
1754
}
1755
1756
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
1757
{
1758
/* flags argument is not used now,
1759
* but provides an ability to extend the API.
1760
* verifier checks that its value is correct.
1761
*/
1762
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
1763
struct bpf_cgroup_storage *storage;
1764
struct bpf_cg_run_ctx *ctx;
1765
void *ptr;
1766
1767
/* get current cgroup storage from BPF run context */
1768
ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1769
storage = ctx->prog_item->cgroup_storage[stype];
1770
1771
if (stype == BPF_CGROUP_STORAGE_SHARED)
1772
ptr = &READ_ONCE(storage->buf)->data[0];
1773
else
1774
ptr = this_cpu_ptr(storage->percpu_buf);
1775
1776
return (unsigned long)ptr;
1777
}
1778
1779
const struct bpf_func_proto bpf_get_local_storage_proto = {
1780
.func = bpf_get_local_storage,
1781
.gpl_only = false,
1782
.ret_type = RET_PTR_TO_MAP_VALUE,
1783
.arg1_type = ARG_CONST_MAP_PTR,
1784
.arg2_type = ARG_ANYTHING,
1785
};
1786
1787
BPF_CALL_0(bpf_get_retval)
1788
{
1789
struct bpf_cg_run_ctx *ctx =
1790
container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1791
1792
return ctx->retval;
1793
}
1794
1795
const struct bpf_func_proto bpf_get_retval_proto = {
1796
.func = bpf_get_retval,
1797
.gpl_only = false,
1798
.ret_type = RET_INTEGER,
1799
};
1800
1801
BPF_CALL_1(bpf_set_retval, int, retval)
1802
{
1803
struct bpf_cg_run_ctx *ctx =
1804
container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1805
1806
ctx->retval = retval;
1807
return 0;
1808
}
1809
1810
const struct bpf_func_proto bpf_set_retval_proto = {
1811
.func = bpf_set_retval,
1812
.gpl_only = false,
1813
.ret_type = RET_INTEGER,
1814
.arg1_type = ARG_ANYTHING,
1815
};
1816
1817
static const struct bpf_func_proto *
1818
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1819
{
1820
const struct bpf_func_proto *func_proto;
1821
1822
func_proto = cgroup_common_func_proto(func_id, prog);
1823
if (func_proto)
1824
return func_proto;
1825
1826
switch (func_id) {
1827
case BPF_FUNC_perf_event_output:
1828
return &bpf_event_output_data_proto;
1829
default:
1830
return bpf_base_func_proto(func_id, prog);
1831
}
1832
}
1833
1834
static bool cgroup_dev_is_valid_access(int off, int size,
1835
enum bpf_access_type type,
1836
const struct bpf_prog *prog,
1837
struct bpf_insn_access_aux *info)
1838
{
1839
const int size_default = sizeof(__u32);
1840
1841
if (type == BPF_WRITE)
1842
return false;
1843
1844
if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1845
return false;
1846
/* The verifier guarantees that size > 0. */
1847
if (off % size != 0)
1848
return false;
1849
1850
switch (off) {
1851
case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1852
bpf_ctx_record_field_size(info, size_default);
1853
if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1854
return false;
1855
break;
1856
default:
1857
if (size != size_default)
1858
return false;
1859
}
1860
1861
return true;
1862
}
1863
1864
const struct bpf_prog_ops cg_dev_prog_ops = {
1865
};
1866
1867
const struct bpf_verifier_ops cg_dev_verifier_ops = {
1868
.get_func_proto = cgroup_dev_func_proto,
1869
.is_valid_access = cgroup_dev_is_valid_access,
1870
};

/**
 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 *
 * @head: sysctl table header
 * @table: sysctl table
 * @write: sysctl is being read (= 0) or written (= 1)
 * @buf: pointer to buffer (in and out)
 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 *        result is size of the new value in @buf if program set a new value,
 *        initial value otherwise
 * @ppos: value-result argument: value is position at which read from or write
 *        to sysctl is happening, result is new position if program overrode it,
 *        initial value otherwise
 * @atype: type of program to be executed
 *
 * Program is run when sysctl is being accessed, either read or written, and
 * can allow or deny such access.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases 0 is returned.
 */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
                                   const struct ctl_table *table, int write,
                                   char **buf, size_t *pcount, loff_t *ppos,
                                   enum cgroup_bpf_attach_type atype)
{
        struct bpf_sysctl_kern ctx = {
                .head = head,
                .table = table,
                .write = write,
                .ppos = ppos,
                .cur_val = NULL,
                .cur_len = PAGE_SIZE,
                .new_val = NULL,
                .new_len = 0,
                .new_updated = 0,
        };
        struct cgroup *cgrp;
        loff_t pos = 0;
        int ret;

        ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
        if (!ctx.cur_val ||
            table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
                /* Let BPF program decide how to proceed. */
                ctx.cur_len = 0;
        }

        if (write && *buf && *pcount) {
                /* BPF program should be able to override new value with a
                 * buffer bigger than provided by user.
                 */
                ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
                ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
                if (ctx.new_val) {
                        memcpy(ctx.new_val, *buf, ctx.new_len);
                } else {
                        /* Let BPF program decide how to proceed. */
                        ctx.new_len = 0;
                }
        }

        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
                                    NULL);
        rcu_read_unlock();

        kfree(ctx.cur_val);

        if (ret == 1 && ctx.new_updated) {
                kfree(*buf);
                *buf = ctx.new_val;
                *pcount = ctx.new_len;
        } else {
                kfree(ctx.new_val);
        }

        return ret;
}
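
/*
 * Illustrative sketch (not part of this file): the simplest useful
 * BPF_PROG_TYPE_CGROUP_SYSCTL program for the hook above. Returning 0 makes
 * the access fail with -EPERM, returning 1 allows it. Assumes libbpf's SEC()
 * convention; the function name is hypothetical.
 *
 *        SEC("cgroup/sysctl")
 *        int sysctl_read_only(struct bpf_sysctl *ctx)
 *        {
 *                // allow reads, refuse every sysctl write from this cgroup
 *                return ctx->write ? 0 : 1;
 *        }
 */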

#ifdef CONFIG_NET
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
                             struct bpf_sockopt_buf *buf)
{
        if (unlikely(max_optlen < 0))
                return -EINVAL;

        if (unlikely(max_optlen > PAGE_SIZE)) {
                /* We don't expose optvals that are greater than PAGE_SIZE
                 * to the BPF program.
                 */
                max_optlen = PAGE_SIZE;
        }

        if (max_optlen <= sizeof(buf->data)) {
                /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
                 * bytes avoid the cost of kzalloc.
                 */
                ctx->optval = buf->data;
                ctx->optval_end = ctx->optval + max_optlen;
                return max_optlen;
        }

        ctx->optval = kzalloc(max_optlen, GFP_USER);
        if (!ctx->optval)
                return -ENOMEM;

        ctx->optval_end = ctx->optval + max_optlen;

        return max_optlen;
}

static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
                             struct bpf_sockopt_buf *buf)
{
        if (ctx->optval == buf->data)
                return;
        kfree(ctx->optval);
}

static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
                                  struct bpf_sockopt_buf *buf)
{
        return ctx->optval != buf->data;
}

int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
                                       int *optname, sockptr_t optval,
                                       int *optlen, char **kernel_optval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = *level,
                .optname = *optname,
        };
        int ret, max_optlen;

        /* Allocate a bit more than the initial user buffer for
         * BPF program. The canonical use case is overriding
         * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
         */
        max_optlen = max_t(int, 16, *optlen);
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;

        ctx.optlen = *optlen;

        if (copy_from_sockptr(ctx.optval, optval,
                              min(*optlen, max_optlen))) {
                ret = -EFAULT;
                goto out;
        }

        lock_sock(sk);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
                                    &ctx, bpf_prog_run, 0, NULL);
        release_sock(sk);

        if (ret)
                goto out;

        if (ctx.optlen == -1) {
                /* optlen set to -1, bypass kernel */
                ret = 1;
        } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
                /* optlen is out of bounds */
                if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
                        pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
                                     ctx.optlen, max_optlen);
                        ret = 0;
                        goto out;
                }
                ret = -EFAULT;
        } else {
                /* optlen within bounds, run kernel handler */
                ret = 0;

                /* export any potential modifications */
                *level = ctx.level;
                *optname = ctx.optname;

                /* optlen == 0 from BPF indicates that we should
                 * use original userspace data.
                 */
                if (ctx.optlen != 0) {
                        *optlen = ctx.optlen;
                        /* We've used bpf_sockopt_kern->buf as an intermediary
                         * storage, but the BPF program indicates that we need
                         * to pass this data to the kernel setsockopt handler.
                         * No way to export on-stack buf, have to allocate a
                         * new buffer.
                         */
                        if (!sockopt_buf_allocated(&ctx, &buf)) {
                                void *p = kmalloc(ctx.optlen, GFP_USER);

                                if (!p) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                                memcpy(p, ctx.optval, ctx.optlen);
                                *kernel_optval = p;
                        } else {
                                *kernel_optval = ctx.optval;
                        }
                        /* export and don't free sockopt buf */
                        return 0;
                }
        }

out:
        sockopt_free_buf(&ctx, &buf);
        return ret;
}
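
/*
 * Illustrative sketch (not part of this file): a BPF_CGROUP_SETSOCKOPT
 * program for the path above, forcing the congestion control mentioned in
 * the comment next to max_optlen. Assumes libbpf's SEC() convention plus
 * SOL_TCP/TCP_CONGESTION definitions; the function name is hypothetical.
 *
 *        SEC("cgroup/setsockopt")
 *        int force_cubic(struct bpf_sockopt *ctx)
 *        {
 *                char cubic[] = "cubic";
 *
 *                if (ctx->level != SOL_TCP || ctx->optname != TCP_CONGESTION)
 *                        return 1;       // let other options through untouched
 *
 *                if (ctx->optval + sizeof(cubic) > ctx->optval_end)
 *                        return 0;       // reject rather than write out of bounds
 *
 *                __builtin_memcpy(ctx->optval, cubic, sizeof(cubic));
 *                ctx->optlen = sizeof(cubic);
 *                return 1;               // run the kernel handler on the new value
 *        }
 */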

int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
                                       int optname, sockptr_t optval,
                                       sockptr_t optlen, int max_optlen,
                                       int retval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
                .optname = optname,
                .current_task = current,
        };
        int orig_optlen;
        int ret;

        orig_optlen = max_optlen;
        ctx.optlen = max_optlen;
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;

        if (!retval) {
                /* If kernel getsockopt finished successfully,
                 * copy whatever was returned to the user back
                 * into our temporary buffer. Set optlen to the
                 * one that kernel returned as well to let
                 * BPF programs inspect the value.
                 */
                if (copy_from_sockptr(&ctx.optlen, optlen,
                                      sizeof(ctx.optlen))) {
                        ret = -EFAULT;
                        goto out;
                }

                if (ctx.optlen < 0) {
                        ret = -EFAULT;
                        goto out;
                }
                orig_optlen = ctx.optlen;

                if (copy_from_sockptr(ctx.optval, optval,
                                      min(ctx.optlen, max_optlen))) {
                        ret = -EFAULT;
                        goto out;
                }
        }

        lock_sock(sk);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
                                    &ctx, bpf_prog_run, retval, NULL);
        release_sock(sk);

        if (ret < 0)
                goto out;

        if (!sockptr_is_null(optval) &&
            (ctx.optlen > max_optlen || ctx.optlen < 0)) {
                if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
                        pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
                                     ctx.optlen, max_optlen);
                        ret = retval;
                        goto out;
                }
                ret = -EFAULT;
                goto out;
        }

        if (ctx.optlen != 0) {
                if (!sockptr_is_null(optval) &&
                    copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
                        ret = -EFAULT;
                        goto out;
                }
                if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
                        ret = -EFAULT;
                        goto out;
                }
        }

out:
        sockopt_free_buf(&ctx, &buf);
        return ret;
}
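
/*
 * Illustrative sketch (not part of this file): a BPF_CGROUP_GETSOCKOPT
 * program for the path above. It hides one option behind -EPERM and leaves
 * everything else untouched; ctx->optlen == 0 tells the code above not to
 * copy the program's buffer back to userspace. Assumes libbpf's SEC()
 * convention plus SOL_SOCKET/SO_MARK/EPERM definitions; the function name is
 * hypothetical.
 *
 *        SEC("cgroup/getsockopt")
 *        int hide_so_mark(struct bpf_sockopt *ctx)
 *        {
 *                if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK) {
 *                        ctx->retval = -EPERM;   // surfaced to getsockopt()
 *                        ctx->optlen = 0;
 *                        return 1;
 *                }
 *                ctx->optlen = 0;        // keep the kernel's answer as-is
 *                return 1;
 *        }
 */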

int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
                                            int optname, void *optval,
                                            int *optlen, int retval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
                .optname = optname,
                .optlen = *optlen,
                .optval = optval,
                .optval_end = optval + *optlen,
                .current_task = current,
        };
        int ret;

        /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
         * user data back into BPF buffer when retval != 0. This is
         * done as an optimization to avoid extra copy, assuming
         * kernel won't populate the data in case of an error.
         * Here we always pass the data and memset() should
         * be called if that data shouldn't be "exported".
         */

        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
                                    &ctx, bpf_prog_run, retval, NULL);
        if (ret < 0)
                return ret;

        if (ctx.optlen > *optlen)
                return -EFAULT;

        /* BPF programs can shrink the buffer, export the modifications.
         */
        if (ctx.optlen != 0)
                *optlen = ctx.optlen;

        return ret;
}
#endif

static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
                              size_t *lenp)
{
        ssize_t tmp_ret = 0, ret;

        if (dir->header.parent) {
                tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
                if (tmp_ret < 0)
                        return tmp_ret;
        }

        ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
        if (ret < 0)
                return ret;
        *bufp += ret;
        *lenp -= ret;
        ret += tmp_ret;

        /* Avoid leading slash. */
        if (!ret)
                return ret;

        tmp_ret = strscpy(*bufp, "/", *lenp);
        if (tmp_ret < 0)
                return tmp_ret;
        *bufp += tmp_ret;
        *lenp -= tmp_ret;

        return ret + tmp_ret;
}

BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
           size_t, buf_len, u64, flags)
{
        ssize_t tmp_ret = 0, ret;

        if (!buf)
                return -EINVAL;

        if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
                if (!ctx->head)
                        return -EINVAL;
                tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
                if (tmp_ret < 0)
                        return tmp_ret;
        }

        ret = strscpy(buf, ctx->table->procname, buf_len);

        return ret < 0 ? ret : tmp_ret + ret;
}

static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
        .func = bpf_sysctl_get_name,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
                             size_t src_len)
{
        if (!dst)
                return -EINVAL;

        if (!dst_len)
                return -E2BIG;

        if (!src || !src_len) {
                memset(dst, 0, dst_len);
                return -EINVAL;
        }

        memcpy(dst, src, min(dst_len, src_len));

        if (dst_len > src_len) {
                memset(dst + src_len, '\0', dst_len - src_len);
                return src_len;
        }

        dst[dst_len - 1] = '\0';

        return -E2BIG;
}

BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
           char *, buf, size_t, buf_len)
{
        return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
}

static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
        .func = bpf_sysctl_get_current_value,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type = ARG_CONST_SIZE,
};

BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
           size_t, buf_len)
{
        if (!ctx->write) {
                if (buf && buf_len)
                        memset(buf, '\0', buf_len);
                return -EINVAL;
        }
        return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
}

static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
        .func = bpf_sysctl_get_new_value,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type = ARG_CONST_SIZE,
};

BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
           const char *, buf, size_t, buf_len)
{
        if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
                return -EINVAL;

        if (buf_len > PAGE_SIZE - 1)
                return -E2BIG;

        memcpy(ctx->new_val, buf, buf_len);
        ctx->new_len = buf_len;
        ctx->new_updated = 1;

        return 0;
}

static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
        .func = bpf_sysctl_set_new_value,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE,
};
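
/*
 * Illustrative sketch (not part of this file): using the helpers above from a
 * cgroup sysctl program to override the value being written before the proc
 * handler parses it. Assumes libbpf's SEC() convention; the function name and
 * the "pin every write to 1" policy are hypothetical.
 *
 *        SEC("cgroup/sysctl")
 *        int sysctl_pin_writes_to_one(struct bpf_sysctl *ctx)
 *        {
 *                char one[] = "1";
 *
 *                if (!ctx->write)
 *                        return 1;
 *                // rewrites the new_val/new_len consumed by
 *                // __cgroup_bpf_run_filter_sysctl()
 *                if (bpf_sysctl_set_new_value(ctx, one, sizeof(one) - 1))
 *                        return 0;
 *                return 1;
 *        }
 */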

static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
        case BPF_FUNC_sysctl_get_name:
                return &bpf_sysctl_get_name_proto;
        case BPF_FUNC_sysctl_get_current_value:
                return &bpf_sysctl_get_current_value_proto;
        case BPF_FUNC_sysctl_get_new_value:
                return &bpf_sysctl_get_new_value_proto;
        case BPF_FUNC_sysctl_set_new_value:
                return &bpf_sysctl_set_new_value_proto;
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        default:
                return bpf_base_func_proto(func_id, prog);
        }
}

static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
                return false;

        switch (off) {
        case bpf_ctx_range(struct bpf_sysctl, write):
                if (type != BPF_READ)
                        return false;
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);
        case bpf_ctx_range(struct bpf_sysctl, file_pos):
                if (type == BPF_READ) {
                        bpf_ctx_record_field_size(info, size_default);
                        return bpf_ctx_narrow_access_ok(off, size, size_default);
                } else {
                        return size == size_default;
                }
        default:
                return false;
        }
}

static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        u32 read_size;

        switch (si->off) {
        case offsetof(struct bpf_sysctl, write):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct bpf_sysctl_kern, write,
                                       sizeof_field(struct bpf_sysctl_kern,
                                                    write),
                                       target_size));
                break;
        case offsetof(struct bpf_sysctl, file_pos):
                /* ppos is a pointer so it should be accessed via indirect
                 * loads and stores. Also for stores additional temporary
                 * register is used since neither src_reg nor dst_reg can be
                 * overridden.
                 */
                if (type == BPF_WRITE) {
                        int treg = BPF_REG_9;

                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        *insn++ = BPF_STX_MEM(
                                BPF_DW, si->dst_reg, treg,
                                offsetof(struct bpf_sysctl_kern, tmp_reg));
                        *insn++ = BPF_LDX_MEM(
                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
                                treg, si->dst_reg,
                                offsetof(struct bpf_sysctl_kern, ppos));
                        *insn++ = BPF_RAW_INSN(
                                BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
                                treg, si->src_reg,
                                bpf_ctx_narrow_access_offset(
                                        0, sizeof(u32), sizeof(loff_t)),
                                si->imm);
                        *insn++ = BPF_LDX_MEM(
                                BPF_DW, treg, si->dst_reg,
                                offsetof(struct bpf_sysctl_kern, tmp_reg));
                } else {
                        *insn++ = BPF_LDX_MEM(
                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
                                si->dst_reg, si->src_reg,
                                offsetof(struct bpf_sysctl_kern, ppos));
                        read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
                        *insn++ = BPF_LDX_MEM(
                                BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
                                bpf_ctx_narrow_access_offset(
                                        0, read_size, sizeof(loff_t)));
                }
                *target_size = sizeof(u32);
                break;
        }

        return insn - insn_buf;
}

const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
        .get_func_proto = sysctl_func_proto,
        .is_valid_access = sysctl_is_valid_access,
        .convert_ctx_access = sysctl_convert_ctx_access,
};

const struct bpf_prog_ops cg_sysctl_prog_ops = {
};

#ifdef CONFIG_NET
BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
{
        const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;

        return net->net_cookie;
}

static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
        .func = bpf_get_netns_cookie_sockopt,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
};
#endif

static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
#ifdef CONFIG_NET
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sockopt_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_setsockopt:
                if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
                        return &bpf_sk_setsockopt_proto;
                return NULL;
        case BPF_FUNC_getsockopt:
                if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
                        return &bpf_sk_getsockopt_proto;
                return NULL;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
#endif
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        default:
                return bpf_base_func_proto(func_id, prog);
        }
}

static bool cg_sockopt_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct bpf_sockopt))
                return false;

        if (off % size != 0)
                return false;

        if (type == BPF_WRITE) {
                switch (off) {
                case offsetof(struct bpf_sockopt, retval):
                        if (size != size_default)
                                return false;
                        return prog->expected_attach_type ==
                                BPF_CGROUP_GETSOCKOPT;
                case offsetof(struct bpf_sockopt, optname):
                        fallthrough;
                case offsetof(struct bpf_sockopt, level):
                        if (size != size_default)
                                return false;
                        return prog->expected_attach_type ==
                                BPF_CGROUP_SETSOCKOPT;
                case offsetof(struct bpf_sockopt, optlen):
                        return size == size_default;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range_ptr(struct bpf_sockopt, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                break;
        case bpf_ctx_range_ptr(struct bpf_sockopt, optval):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct bpf_sockopt, retval):
                if (size != size_default)
                        return false;
                return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
        default:
                if (size != size_default)
                        return false;
                break;
        }
        return true;
}

#define CG_SOCKOPT_READ_FIELD(F) \
        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
                    si->dst_reg, si->src_reg, \
                    offsetof(struct bpf_sockopt_kern, F))

#define CG_SOCKOPT_WRITE_FIELD(F) \
        BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
                      BPF_MEM | BPF_CLASS(si->code)), \
                     si->dst_reg, si->src_reg, \
                     offsetof(struct bpf_sockopt_kern, F), \
                     si->imm)

static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog,
                                         u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sockopt, sk):
                *insn++ = CG_SOCKOPT_READ_FIELD(sk);
                break;
        case offsetof(struct bpf_sockopt, level):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(level);
                break;
        case offsetof(struct bpf_sockopt, optname):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optname);
                break;
        case offsetof(struct bpf_sockopt, optlen):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
                break;
        case offsetof(struct bpf_sockopt, retval):
                BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);

                if (type == BPF_WRITE) {
                        int treg = BPF_REG_9;

                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              treg, treg,
                                              offsetof(struct task_struct, bpf_ctx));
                        *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
                                               BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                               treg, si->src_reg,
                                               offsetof(struct bpf_cg_run_ctx, retval),
                                               si->imm);
                        *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                } else {
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              si->dst_reg, si->src_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct task_struct, bpf_ctx));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct bpf_cg_run_ctx, retval));
                }
                break;
        case offsetof(struct bpf_sockopt, optval):
                *insn++ = CG_SOCKOPT_READ_FIELD(optval);
                break;
        case offsetof(struct bpf_sockopt, optval_end):
                *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
                break;
        }

        return insn - insn_buf;
}

static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
                                   bool direct_write,
                                   const struct bpf_prog *prog)
{
        /* Nothing to do for sockopt argument. The data is kzalloc'ated.
         */
        return 0;
}

const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
        .get_func_proto = cg_sockopt_func_proto,
        .is_valid_access = cg_sockopt_is_valid_access,
        .convert_ctx_access = cg_sockopt_convert_ctx_access,
        .gen_prologue = cg_sockopt_get_prologue,
};

const struct bpf_prog_ops cg_sockopt_prog_ops = {
};

/* Common helpers for cgroup hooks. */
const struct bpf_func_proto *
cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_get_local_storage:
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_get_retval:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET_INGRESS:
                case BPF_CGROUP_INET_EGRESS:
                case BPF_CGROUP_SOCK_OPS:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return NULL;
                default:
                        return &bpf_get_retval_proto;
                }
        case BPF_FUNC_set_retval:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET_INGRESS:
                case BPF_CGROUP_INET_EGRESS:
                case BPF_CGROUP_SOCK_OPS:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return NULL;
                default:
                        return &bpf_set_retval_proto;
                }
        default:
                return NULL;
        }
}