GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/cgroup_iter.c

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Google */
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>

#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */

/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
 *
 *  1. Walk the descendants of a cgroup in pre-order.
 *  2. Walk the descendants of a cgroup in post-order.
 *  3. Walk the ancestors of a cgroup.
 *  4. Show the given cgroup only.
 *
 * For walking descendants, cgroup_iter can walk in either pre-order or
 * post-order. For walking ancestors, the iter walks up from a cgroup to
 * the root.
 *
 * The iter program can terminate the walk early by returning 1. The walk
 * continues if the prog returns 0.
 *
 * The prog can check (seq->num == 0) to determine whether this is
 * the first element. The prog may also be passed a NULL cgroup,
 * which means the walk has completed and the prog has a chance to
 * do post-processing, such as outputting an epilogue.
 *
 * Note: the iter_prog is called with cgroup_mutex held.
 *
 * Currently only one session is supported, which means that, depending on
 * the volume of data the bpf program intends to send to user space, the
 * number of cgroups that can be walked is limited. For example, given that
 * the current buffer size is 8 * PAGE_SIZE, if the program sends 64B of
 * data for each cgroup, and assuming PAGE_SIZE is 4KB, the total number of
 * cgroups that can be walked is 512. This is a limitation of cgroup_iter.
 * If the output data is larger than the kernel buffer size, then after all
 * data in the kernel buffer has been consumed by user space, the subsequent
 * read() syscall will signal EOPNOTSUPP. To work around this, the user may
 * have to update their program to reduce the volume of data sent to the
 * output. For example, skip some uninteresting cgroups.
 */
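
/* Example (not part of the original file): a minimal sketch of an iter
 * program for this target, in libbpf-style BPF C. The function name and
 * the printed strings are illustrative only; the prologue/epilogue and
 * early-termination logic mirror the comment above.
 *
 *	SEC("iter/cgroup")
 *	int dump_cgroup(struct bpf_iter__cgroup *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct cgroup *cgrp = ctx->cgroup;
 *
 *		if (!cgrp) {			// walk done: emit epilogue
 *			BPF_SEQ_PRINTF(seq, "epilogue\n");
 *			return 0;
 *		}
 *		if (ctx->meta->seq_num == 0)	// first element: emit prologue
 *			BPF_SEQ_PRINTF(seq, "prologue\n");
 *		BPF_SEQ_PRINTF(seq, "%8llu\n", cgrp->kn->id);
 *		return 0;			// returning 1 would end the walk early
 *	}
 */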

struct bpf_iter__cgroup {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct cgroup *, cgroup);
};

struct cgroup_iter_priv {
	struct cgroup_subsys_state *start_css;
	bool visited_all;
	bool terminate;
	int order;
};

static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct cgroup_iter_priv *p = seq->private;

	cgroup_lock();

	/* cgroup_iter doesn't support read across multiple sessions. */
	if (*pos > 0) {
		if (p->visited_all)
			return NULL;

		/* Haven't visited all, but because cgroup_mutex has been
		 * dropped, return -EOPNOTSUPP to indicate incomplete
		 * iteration.
		 */
		return ERR_PTR(-EOPNOTSUPP);
	}

	++*pos;
	p->terminate = false;
	p->visited_all = false;
	if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
		return css_next_descendant_pre(NULL, p->start_css);
	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
		return css_next_descendant_post(NULL, p->start_css);
	else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
		return p->start_css;
}

static int __cgroup_iter_seq_show(struct seq_file *seq,
				  struct cgroup_subsys_state *css, int in_stop);

static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
{
	struct cgroup_iter_priv *p = seq->private;

	cgroup_unlock();

	/* pass NULL to the prog for post-processing */
	if (!v) {
		__cgroup_iter_seq_show(seq, NULL, true);
		p->visited_all = true;
	}
}

static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
	struct cgroup_iter_priv *p = seq->private;

	++*pos;
	if (p->terminate)
		return NULL;

	if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
		return css_next_descendant_pre(curr, p->start_css);
	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
		return css_next_descendant_post(curr, p->start_css);
	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
		return curr->parent;
	else /* BPF_CGROUP_ITER_SELF_ONLY */
		return NULL;
}

static int __cgroup_iter_seq_show(struct seq_file *seq,
				  struct cgroup_subsys_state *css, int in_stop)
{
	struct cgroup_iter_priv *p = seq->private;
	struct bpf_iter__cgroup ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret = 0;

	/* cgroup is dead, skip this element */
	if (css && cgroup_is_dead(css->cgroup))
		return 0;

	ctx.meta = &meta;
	ctx.cgroup = css ? css->cgroup : NULL;
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (prog)
		ret = bpf_iter_run_prog(prog, &ctx);

	/* if prog returns > 0, terminate after this element. */
	if (ret != 0)
		p->terminate = true;

	return 0;
}

static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
{
	return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
				      false);
}

static const struct seq_operations cgroup_iter_seq_ops = {
	.start = cgroup_iter_seq_start,
	.next = cgroup_iter_seq_next,
	.stop = cgroup_iter_seq_stop,
	.show = cgroup_iter_seq_show,
};
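
/* Example (not part of the original file): a minimal user-space sketch of
 * driving one read session over this iterator. link_fd is assumed to be an
 * attached cgroup iter link fd (see the attach example further below);
 * bpf_iter_create() is the libbpf helper that instantiates a seq_file
 * session from a link fd. A read() that fails with errno == EOPNOTSUPP
 * means the walk did not fit in the single supported session, as described
 * in the comment at the top of this file.
 *
 *	#include <bpf/bpf.h>
 *	#include <unistd.h>
 *
 *	char buf[4096];
 *	ssize_t n;
 *	int iter_fd = bpf_iter_create(link_fd);
 *
 *	if (iter_fd < 0)
 *		return -1;
 *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
 *		;	// consume the formatted output, e.g. write to stdout
 *	close(iter_fd);
 */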

BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)

static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
{
	struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
	struct cgroup *cgrp = aux->cgroup.start;

	/* bpf_iter_attach_cgroup() has already acquired an extra reference
	 * for the start cgroup, but the reference may be released after
	 * cgroup_iter_seq_init(), so acquire another reference for the
	 * start cgroup.
	 */
	p->start_css = &cgrp->self;
	css_get(p->start_css);
	p->terminate = false;
	p->visited_all = false;
	p->order = aux->cgroup.order;
	return 0;
}

static void cgroup_iter_seq_fini(void *priv)
{
	struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;

	css_put(p->start_css);
}

static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
	.seq_ops = &cgroup_iter_seq_ops,
	.init_seq_private = cgroup_iter_seq_init,
	.fini_seq_private = cgroup_iter_seq_fini,
	.seq_priv_size = sizeof(struct cgroup_iter_priv),
};

static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
				  union bpf_iter_link_info *linfo,
				  struct bpf_iter_aux_info *aux)
{
	int fd = linfo->cgroup.cgroup_fd;
	u64 id = linfo->cgroup.cgroup_id;
	int order = linfo->cgroup.order;
	struct cgroup *cgrp;

	if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
	    order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
	    order != BPF_CGROUP_ITER_ANCESTORS_UP &&
	    order != BPF_CGROUP_ITER_SELF_ONLY)
		return -EINVAL;

	if (fd && id)
		return -EINVAL;

	if (fd)
		cgrp = cgroup_v1v2_get_from_fd(fd);
	else if (id)
		cgrp = cgroup_get_from_id(id);
	else /* walk the entire hierarchy by default. */
		cgrp = cgroup_get_from_path("/");

	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	aux->cgroup.start = cgrp;
	aux->cgroup.order = order;
	return 0;
}
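
/* Example (not part of the original file): a minimal user-space sketch of
 * attaching a cgroup iterator program with an explicit start cgroup and
 * walk order, using libbpf's bpf_link_create(). prog_fd and cgroup_fd are
 * assumed to be valid file descriptors obtained elsewhere; passing neither
 * a cgroup_fd nor a cgroup_id would walk from the root, per the default
 * branch above.
 *
 *	#include <bpf/bpf.h>
 *
 *	union bpf_iter_link_info linfo = {};
 *	LIBBPF_OPTS(bpf_link_create_opts, opts);
 *	int link_fd;
 *
 *	linfo.cgroup.cgroup_fd = cgroup_fd;
 *	linfo.cgroup.order = BPF_CGROUP_ITER_DESCENDANTS_PRE;
 *	opts.iter_info = &linfo;
 *	opts.iter_info_len = sizeof(linfo);
 *	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, &opts);
 */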

static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
{
	cgroup_put(aux->cgroup.start);
}

static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
					struct seq_file *seq)
{
	char *buf;

	buf = kzalloc(PATH_MAX, GFP_KERNEL);
	if (!buf) {
		seq_puts(seq, "cgroup_path:\t<unknown>\n");
		goto show_order;
	}

	/* If cgroup_path_ns() fails, buf will be an empty string and the
	 * cgroup path will print as nothing.
	 *
	 * The path is in the calling process's cgroup namespace.
	 */
	cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
		       current->nsproxy->cgroup_ns);
	seq_printf(seq, "cgroup_path:\t%s\n", buf);
	kfree(buf);

show_order:
	if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
		seq_puts(seq, "order: descendants_pre\n");
	else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
		seq_puts(seq, "order: descendants_post\n");
	else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
		seq_puts(seq, "order: ancestors_up\n");
	else /* BPF_CGROUP_ITER_SELF_ONLY */
		seq_puts(seq, "order: self_only\n");
}
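
/* Example (not part of the original file): with the above, the fdinfo of
 * an attached iter link fd would include lines like the following, assuming
 * the start cgroup resolves to /foo in the caller's cgroup namespace and a
 * pre-order walk was requested:
 *
 *	cgroup_path:	/foo
 *	order: descendants_pre
 */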

static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
					  struct bpf_link_info *info)
{
	info->iter.cgroup.order = aux->cgroup.order;
	info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
	return 0;
}

DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
		     struct cgroup *cgroup)

static struct bpf_iter_reg bpf_cgroup_reg_info = {
	.target = "cgroup",
	.feature = BPF_ITER_RESCHED,
	.attach_target = bpf_iter_attach_cgroup,
	.detach_target = bpf_iter_detach_cgroup,
	.show_fdinfo = bpf_iter_cgroup_show_fdinfo,
	.fill_link_info = bpf_iter_cgroup_fill_link_info,
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__cgroup, cgroup),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.seq_info = &cgroup_iter_seq_info,
};

static int __init bpf_cgroup_iter_init(void)
{
	bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
	return bpf_iter_reg_target(&bpf_cgroup_reg_info);
}

late_initcall(bpf_cgroup_iter_init);

struct bpf_iter_css {
	__u64 __opaque[3];
} __attribute__((aligned(8)));

struct bpf_iter_css_kern {
	struct cgroup_subsys_state *start;
	struct cgroup_subsys_state *pos;
	unsigned int flags;
} __attribute__((aligned(8)));

__bpf_kfunc_start_defs();

__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
		struct cgroup_subsys_state *start, unsigned int flags)
{
	struct bpf_iter_css_kern *kit = (void *)it;

	BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css));
	BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));

	/* Leave kit->start NULL on failure so bpf_iter_css_next() returns
	 * NULL for an iterator that failed to initialize.
	 */
	kit->start = NULL;
	switch (flags) {
	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
	case BPF_CGROUP_ITER_DESCENDANTS_POST:
	case BPF_CGROUP_ITER_ANCESTORS_UP:
		break;
	default:
		return -EINVAL;
	}

	kit->start = start;
	kit->pos = NULL;
	kit->flags = flags;
	return 0;
}

__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
{
	struct bpf_iter_css_kern *kit = (void *)it;

	if (!kit->start)
		return NULL;

	switch (kit->flags) {
	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
		kit->pos = css_next_descendant_pre(kit->pos, kit->start);
		break;
	case BPF_CGROUP_ITER_DESCENDANTS_POST:
		kit->pos = css_next_descendant_post(kit->pos, kit->start);
		break;
	case BPF_CGROUP_ITER_ANCESTORS_UP:
		kit->pos = kit->pos ? kit->pos->parent : kit->start;
	}

	return kit->pos;
}

__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
{
	/* Intentionally empty: the iterator holds no state to release. */
}

__bpf_kfunc_end_defs();
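
/* Example (not part of the original file): a minimal sketch of BPF program
 * code driving the open-coded css iterator kfuncs above. The surrounding
 * program context, including how css is obtained as a trusted pointer and
 * the visit() helper, is assumed for illustration. Destroy is called
 * unconditionally, matching the usual open-coded iterator pattern.
 *
 *	struct bpf_iter_css it;
 *	struct cgroup_subsys_state *pos;
 *
 *	if (!bpf_iter_css_new(&it, css, BPF_CGROUP_ITER_DESCENDANTS_PRE)) {
 *		while ((pos = bpf_iter_css_next(&it)))
 *			visit(pos->cgroup);	// hypothetical helper
 *	}
 *	bpf_iter_css_destroy(&it);
 */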