torvalds/linux: kernel/cgroup/pids.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <[email protected]>
 */
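
/*
 * An illustrative usage sketch, assuming cgroup v2 mounted at the
 * conventional /sys/fs/cgroup and a hypothetical child cgroup "mygroup":
 *
 *	mkdir /sys/fs/cgroup/mygroup
 *	echo 50 > /sys/fs/cgroup/mygroup/pids.max
 *	cat /sys/fs/cgroup/mygroup/pids.current
 *	echo max > /sys/fs/cgroup/mygroup/pids.max	# back to "no limit"
 */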

#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/sched/task.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

enum pidcg_event {
	/* Fork failed in subtree because this pids_cgroup limit was hit. */
	PIDCG_MAX,
	/* Fork failed in this pids_cgroup because ancestor limit was hit. */
	PIDCG_FORKFAIL,
	NR_PIDCG_EVENTS,
};
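
/*
 * Illustration (hypothetical hierarchy a/b, on the default hierarchy):
 * if a fork() in b fails because a's limit was hit, the failure counts
 * as PIDCG_FORKFAIL in b's local events and as PIDCG_MAX in a's (and
 * its ancestors') events; see pids_event() below.
 */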

struct pids_cgroup {
	struct cgroup_subsys_state css;

	/*
	 * Use 64-bit types so that we can safely represent "max" as
	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	 */
	atomic64_t counter;
	atomic64_t limit;
	int64_t watermark;

	/* Handles for pids.events[.local] */
	struct cgroup_file events_file;
	struct cgroup_file events_local_file;

	atomic64_t events[NR_PIDCG_EVENTS];
	atomic64_t events_local[NR_PIDCG_EVENTS];
};

static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
	return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
	return css_pids(pids->css.parent);
}

static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
	struct pids_cgroup *pids;

	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
	if (!pids)
		return ERR_PTR(-ENOMEM);

	atomic64_set(&pids->limit, PIDS_MAX);
	return &pids->css;
}

static void pids_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_pids(css));
}

static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
{
	/*
	 * This is racy, but we don't need perfectly accurate tallying of
	 * the watermark, and this lets us avoid extra atomic overhead.
	 */
	if (nr_pids > READ_ONCE(p->watermark))
		WRITE_ONCE(p->watermark, nr_pids);
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
	/*
	 * A negative count (or overflow for that matter) is invalid,
	 * and indicates a bug in the `pids` controller proper.
	 */
	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p))
		pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is only used for reverting failed
 * attaches, where there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p)) {
		int64_t new = atomic64_add_return(num, &p->counter);

		pids_update_watermark(p, new);
	}
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 * @fail: storage of pid cgroup causing the fail
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
	struct pids_cgroup *p, *q;

	for (p = pids; parent_pids(p); p = parent_pids(p)) {
		int64_t new = atomic64_add_return(num, &p->counter);
		int64_t limit = atomic64_read(&p->limit);

		/*
		 * Since new is capped to the maximum number of pid_t, if
		 * p->limit is %PIDS_MAX then we know that this test will never
		 * fail.
		 */
		if (new > limit) {
			*fail = p;
			goto revert;
		}
		/*
		 * Not technically accurate if we go over limit somewhere up
		 * the hierarchy, but that's tolerable for the watermark.
		 */
		pids_update_watermark(p, new);
	}

	return 0;

revert:
	for (q = pids; q != p; q = parent_pids(q))
		pids_cancel(q, num);
	pids_cancel(p, num);

	return -EAGAIN;
}
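
/*
 * A worked sketch of the failure path, assuming a hypothetical two-level
 * hierarchy a/b with a's limit set to 2 and two tasks already in b:
 *
 *	pids_try_charge(b, 1)
 *	  b->counter: 2 -> 3	(b's limit is PIDS_MAX, check passes)
 *	  a->counter: 2 -> 3	(3 > 2, so *fail = a, goto revert)
 *	revert:
 *	  b->counter: 3 -> 2	(every level from b up to and including
 *	  a->counter: 3 -> 2	 the failing cgroup a is uncharged)
 *	return -EAGAIN
 */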

static int pids_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		/*
		 * No need to pin @old_css between here and cancel_attach()
		 * because cgroup core protects it from being freed before
		 * the migration completes or fails.
		 */
		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(pids, 1);
		pids_uncharge(old_pids, 1);
	}

	return 0;
}
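
/*
 * Note that attaching charges via pids_charge() rather than
 * pids_try_charge(): moving existing tasks is an organisational
 * operation and is never blocked by the limit, which is how
 * pids.current can come to exceed pids.max (see the header comment).
 */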

static void pids_cancel_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(old_pids, 1);
		pids_uncharge(pids, 1);
	}
}

static void pids_event(struct pids_cgroup *pids_forking,
		       struct pids_cgroup *pids_over_limit)
{
	struct pids_cgroup *p = pids_forking;

	/* Only log the first time limit is hit. */
	if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
		pr_info("cgroup: fork rejected by pids controller in ");
		pr_cont_cgroup_path(p->css.cgroup);
		pr_cont("\n");
	}
	if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
		cgroup_file_notify(&p->events_local_file);
		return;
	}

	atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
	cgroup_file_notify(&pids_over_limit->events_local_file);

	for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
		atomic64_inc(&p->events[PIDCG_MAX]);
		cgroup_file_notify(&p->events_file);
	}
}

/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by copy_process().
 */
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
	struct pids_cgroup *pids, *pids_over_limit;
	int err;

	pids = css_pids(cset->subsys[pids_cgrp_id]);
	err = pids_try_charge(pids, 1, &pids_over_limit);
	if (err)
		pids_event(pids, pids_over_limit);

	return err;
}
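
/*
 * The -EAGAIN returned here propagates out of copy_process(), so a
 * fork() that would breach a limit fails with EAGAIN in userspace, as
 * described in the header comment.
 */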

static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
	struct pids_cgroup *pids;

	pids = css_pids(cset->subsys[pids_cgrp_id]);
	pids_uncharge(pids, 1);
}

static void pids_release(struct task_struct *task)
{
	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

	pids_uncharge(pids, 1);
}

static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit;
	int err;

	buf = strstrip(buf);
	if (!strcmp(buf, PIDS_MAX_STR)) {
		limit = PIDS_MAX;
		goto set_limit;
	}

	err = kstrtoll(buf, 0, &limit);
	if (err)
		return err;

	if (limit < 0 || limit >= PIDS_MAX)
		return -EINVAL;

set_limit:
	/*
	 * Limit updates don't need to be mutex'd, since it isn't
	 * critical that any racing fork()s follow the new limit.
	 */
	atomic64_set(&pids->limit, limit);
	return nbytes;
}
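
/*
 * Illustrative writes and their outcomes (hypothetical shell session):
 *
 *	echo 100 > pids.max	limit = 100
 *	echo max > pids.max	limit = PIDS_MAX ("no limit")
 *	echo -1 > pids.max	write fails with -EINVAL
 *
 * kstrtoll() is called with base 0, so hexadecimal ("0x64") and octal
 * ("0144") spellings are accepted as well.
 */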

static int pids_max_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit = atomic64_read(&pids->limit);

	if (limit >= PIDS_MAX)
		seq_printf(sf, "%s\n", PIDS_MAX_STR);
	else
		seq_printf(sf, "%lld\n", limit);

	return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct pids_cgroup *pids = css_pids(css);

	return atomic64_read(&pids->counter);
}

static s64 pids_peak_read(struct cgroup_subsys_state *css,
			  struct cftype *cft)
{
	struct pids_cgroup *pids = css_pids(css);

	return READ_ONCE(pids->watermark);
}

static int __pids_events_show(struct seq_file *sf, bool local)
{
	struct pids_cgroup *pids = css_pids(seq_css(sf));
	enum pidcg_event pe = PIDCG_MAX;
	atomic64_t *events;

	if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
		pe = PIDCG_FORKFAIL;
		local = true;
	}
	events = local ? pids->events_local : pids->events;

	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
	return 0;
}

static int pids_events_show(struct seq_file *sf, void *v)
{
	__pids_events_show(sf, false);
	return 0;
}

static int pids_events_local_show(struct seq_file *sf, void *v)
{
	__pids_events_show(sf, true);
	return 0;
}
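
/*
 * Both event files emit a single "max <count>" line; for example, a
 * read of pids.events might show:
 *
 *	max 7
 *
 * meaning seven forks have failed against a limit (which counter is
 * reported depends on the hierarchy mode handled above).
 */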

static struct cftype pids_files[] = {
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.read_s64 = pids_current_read,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = pids_peak_read,
	},
	{
		.name = "events",
		.seq_show = pids_events_show,
		.file_offset = offsetof(struct pids_cgroup, events_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events.local",
		.seq_show = pids_events_local_show,
		.file_offset = offsetof(struct pids_cgroup, events_local_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};

static struct cftype pids_files_legacy[] = {
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.read_s64 = pids_current_read,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = pids_peak_read,
	},
	{
		.name = "events",
		.seq_show = pids_events_show,
		.file_offset = offsetof(struct pids_cgroup, events_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};

struct cgroup_subsys pids_cgrp_subsys = {
	.css_alloc = pids_css_alloc,
	.css_free = pids_css_free,
	.can_attach = pids_can_attach,
	.cancel_attach = pids_cancel_attach,
	.can_fork = pids_can_fork,
	.cancel_fork = pids_cancel_fork,
	.release = pids_release,
	.legacy_cftypes = pids_files_legacy,
	.dfl_cftypes = pids_files,
	.threaded = true,
};