GitHub Repository: torvalds/linux
Path: blob/master/kernel/cgroup/cpuset-v1.c
// SPDX-License-Identifier: GPL-2.0-or-later

#include "cgroup-internal.h"
#include "cpuset-internal.h"

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
	struct work_struct work;
	struct cpuset *cs;
};

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter. There are four routines:
 *    fmeter_init() - initialize a frequency meter.
 *    fmeter_markevent() - called each time the event happens.
 *    fmeter_getrate() - returns the recent rate of such events.
 *    fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR). The time unit
 * is 1 second. Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter. If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds. At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000. At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event. At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)	/* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}
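
The comment block above fixes FM_COEF at 933 out of an FM_SCALE of 1000, so ten idle ticks scale the level by 0.933^10 ≈ 0.50, which is where the "half-life of 10 seconds" figure comes from. As a hedged illustration (not part of cpuset-v1.c; the helper below is hypothetical, drops the locking and the kernel time source, and marks exactly one event per simulated second), this user-space program runs the same integer recurrence and prints the plateau and half-life behavior described above:

/*
 * User-space sketch of the fmeter integer filter (illustrative only,
 * not part of this file).  It mimics fmeter_update()/fmeter_markevent()
 * for the special case of one update per simulated second.
 */
#include <stdio.h>

#define FM_COEF		933	/* same constants as the kernel code */
#define FM_SCALE	1000

static int val, cnt;

/* One simulated second: decay the level, then blend in marked events. */
static void tick(void)
{
	val = (FM_COEF * val) / FM_SCALE;
	val += ((FM_SCALE - FM_COEF) * cnt) / FM_SCALE;
	cnt = 0;
}

int main(void)
{
	int i;

	for (i = 0; i < 60; i++) {	/* one event per second for 60 s */
		cnt += FM_SCALE;	/* what fmeter_markevent() would add */
		tick();
	}
	printf("steady 1 event/s: val = %d (plateau near N*1000 = 1000)\n", val);

	for (i = 0; i < 10; i++)	/* then 10 idle seconds */
		tick();
	printf("after 10 idle s:  val = %d (roughly half the plateau)\n", val);
	return 0;
}

Integer truncation costs a few counts, so the plateau lands just under 1000 rather than exactly on it, matching the "maxes out at values just under 1,000,000" caveat for the fast-rate case.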

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure". Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}
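
As a usage sketch for the meter exposed above (again not part of this file), the following user-space program enables collection through the root-only memory_pressure_enabled file and then reads a child cpuset's memory_pressure. The /sys/fs/cgroup/cpuset mount point, the "mygroup" cpuset, and the "cpuset." file prefix reflect a typical legacy v1 mount and are assumptions for illustration:

/* Sketch: enable and read the v1 memory_pressure meter from user space. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd;

	/* Enable collection globally; this file exists only in the root cpuset. */
	fd = open("/sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "1", 1) < 0)
			perror("enable memory_pressure");
		close(fd);
	}

	/* Read the filtered direct-reclaim rate for one cpuset. */
	fd = open("/sys/fs/cgroup/cpuset/mygroup/cpuset.memory_pressure", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		/* The value is the fmeter level: roughly 1000 x reclaims per second. */
		printf("memory_pressure: %s", buf);
	}
	close(fd);
	return 0;
}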

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val > sched_domain_level_max + 1)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	cpuset_full_lock();
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_full_unlock();
	return retval;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}
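
To show how cpuset_write_s64() above is reached from user space, here is a hedged sketch that writes to the legacy cpuset.sched_relax_domain_level file; the mount point and the "mygroup" cpuset are assumptions for illustration. Values outside the range checked by update_relax_domain_level() (-1 through sched_domain_level_max + 1 under CONFIG_SMP) come back as EINVAL:

/* Sketch: set the relax_domain_level of one cpuset from user space. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/cpuset/mygroup/cpuset.sched_relax_domain_level";
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return 1;
	/* "-1" requests the system default; out-of-range values fail with EINVAL. */
	if (write(fd, "-1", 2) < 0)
		fprintf(stderr, "write failed: %s\n", strerror(errno));
	close(fd);
	return 0;
}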

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
void cpuset1_update_task_spread_flags(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		return;

	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/**
 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags. As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset1_update_task_spread_flags(cs, task);
	css_task_iter_end(&it);
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets. If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
	       nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
	struct cpuset_remove_tasks_struct *s;

	s = container_of(work, struct cpuset_remove_tasks_struct, work);
	remove_tasks_in_empty_cpuset(s->cs);
	css_put(&s->cs->css);
	kfree(s);
}

void cpuset1_hotplug_update_tasks(struct cpuset *cs,
			struct cpumask *new_cpus, nodemask_t *new_mems,
			bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	cpuset_callback_lock_irq();
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	cpuset_callback_unlock_irq();

	/*
	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		cpuset_update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		cpuset_update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation which will also call back into
	 * cpuset. Execute it asynchronously using a workqueue.
	 */
	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
			css_put(&cs->css);
			return;
		}

		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
	}
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set. Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
 *                             behavior.
 */
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
		goto out;

	ret = 0;
out:
	return ret;
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	rcu_read_lock();
	spin_lock_irq(&css_set_lock);
	css = task_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
				       current->nsproxy->cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	rcu_read_unlock();

	if (retval == -E2BIG)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif	/* CONFIG_PROC_PID_CPUSET */
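
Since proc_cpuset_show() backs /proc/<pid>/cpuset, a minimal reader (illustrative, not part of this file) looks like this:

/* Sketch: print the cpuset path of the current process. */
#include <stdio.h>

int main(void)
{
	char path[4096];
	FILE *f = fopen("/proc/self/cpuset", "r");

	if (!f)
		return 1;
	if (fgets(path, sizeof(path), f))
		/* e.g. "/" for the root cpuset, or "/mygroup" */
		printf("cpuset path: %s", path);
	fclose(f);
	return 0;
}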

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	cpuset_full_lock();
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		pr_warn_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_full_unlock();
	return retval;
}

/*
 * for the common functions, 'private' gives the type of file
 */

struct cftype cpuset1_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
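
Finally, a hedged user-space sketch of exercising the files declared in cpuset1_files[]. It assumes the legacy cpuset hierarchy is mounted at /sys/fs/cgroup/cpuset and uses an illustrative "mygroup" cpuset; cgroupfs exposes each entry with a "cpuset." prefix (the "cpus" entry above appears as "cpuset.cpus"), while the "tasks" file comes from cgroup core rather than this table:

/* Sketch: create a v1 cpuset, assign cpus/mems, and attach a task. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char pid[16];

	/* Create a child cpuset; the kernel populates its control files. */
	mkdir("/sys/fs/cgroup/cpuset/mygroup", 0755);

	/* cpus and mems must be set before tasks can be attached. */
	write_file("/sys/fs/cgroup/cpuset/mygroup/cpuset.cpus", "0-1");
	write_file("/sys/fs/cgroup/cpuset/mygroup/cpuset.mems", "0");

	/* Move the current process into the new cpuset. */
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_file("/sys/fs/cgroup/cpuset/mygroup/tasks", pid);
	return 0;
}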