Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/kernel/cgroup/rdma.c
29278 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* RDMA resource limiting controller for cgroups.
4
*
5
* Used to allow a cgroup hierarchy to stop processes from consuming
6
* additional RDMA resources after a certain limit is reached.
7
*
8
* Copyright (C) 2016 Parav Pandit <[email protected]>
9
*/
10
11
#include <linux/bitops.h>
12
#include <linux/slab.h>
13
#include <linux/seq_file.h>
14
#include <linux/cgroup.h>
15
#include <linux/parser.h>
16
#include <linux/cgroup_rdma.h>
17
18
#define RDMACG_MAX_STR "max"
19
20
/*
 * Protects list of resource pools maintained on per cgroup basis
 * and rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
/* All devices registered via rdmacg_register_device(); guarded by rdmacg_mutex. */
static LIST_HEAD(rdmacg_devices);
26
27
/*
 * Identifies which cgroup interface file a seq_show callback is serving;
 * stored in cftype->private (see rdmacg_files[]).
 */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,	/* "rdma.max": configured limits */
	RDMACG_RESOURCE_TYPE_STAT,	/* "rdma.current": current usage */
};
31
32
/*
 * resource table definition as to be seen by the user.
 * Need to add entries to it when more resources are
 * added/defined at IB verb/core layer.
 *
 * Indexed by enum rdmacg_resource_type; these strings are what users
 * write in "rdma.max" and read back from the interface files.
 */
static char const *rdmacg_resource_names[] = {
	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
};
41
42
/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
	int max;	/* configured limit; S32_MAX represents "max" (unlimited) */
	int usage;	/* currently charged count; going negative indicates a bug */
};
47
48
/*
 * resource pool object which represents per cgroup, per device
 * resources. There are multiple instances of this object per cgroup,
 * therefore it cannot be embedded within rdma_cgroup structure. It
 * is maintained as list.
 */
struct rdmacg_resource_pool {
	struct rdmacg_device	*device;	/* device this pool accounts for */
	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];

	struct list_head	cg_node;	/* link in rdma_cgroup->rpools */
	struct list_head	dev_node;	/* link in rdmacg_device->rpools */

	/* count active user tasks of this pool */
	u64			usage_sum;
	/* total number counts which are set to max */
	int			num_max_cnt;
};
66
67
static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
68
{
69
return container_of(css, struct rdma_cgroup, css);
70
}
71
72
static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
73
{
74
return css_rdmacg(cg->css.parent);
75
}
76
77
static inline struct rdma_cgroup *get_current_rdmacg(void)
78
{
79
return css_rdmacg(task_get_css(current, rdma_cgrp_id));
80
}
81
82
static void set_resource_limit(struct rdmacg_resource_pool *rpool,
83
int index, int new_max)
84
{
85
if (new_max == S32_MAX) {
86
if (rpool->resources[index].max != S32_MAX)
87
rpool->num_max_cnt++;
88
} else {
89
if (rpool->resources[index].max == S32_MAX)
90
rpool->num_max_cnt--;
91
}
92
rpool->resources[index].max = new_max;
93
}
94
95
static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
96
{
97
int i;
98
99
for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
100
set_resource_limit(rpool, i, S32_MAX);
101
}
102
103
static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
104
{
105
lockdep_assert_held(&rdmacg_mutex);
106
107
list_del(&rpool->cg_node);
108
list_del(&rpool->dev_node);
109
kfree(rpool);
110
}
111
112
static struct rdmacg_resource_pool *
113
find_cg_rpool_locked(struct rdma_cgroup *cg,
114
struct rdmacg_device *device)
115
116
{
117
struct rdmacg_resource_pool *pool;
118
119
lockdep_assert_held(&rdmacg_mutex);
120
121
list_for_each_entry(pool, &cg->rpools, cg_node)
122
if (pool->device == device)
123
return pool;
124
125
return NULL;
126
}
127
128
/*
 * Return the resource pool of @cg for @device, creating and linking a
 * new one (with every limit set to "max" and zero usage) if it does not
 * exist yet.  Returns ERR_PTR(-ENOMEM) on allocation failure.
 * Caller must hold rdmacg_mutex.
 */
static struct rdmacg_resource_pool *
get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *rpool;

	rpool = find_cg_rpool_locked(cg, device);
	if (rpool)
		return rpool;

	/* GFP_KERNEL allocation while holding rdmacg_mutex; may sleep */
	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
	if (!rpool)
		return ERR_PTR(-ENOMEM);

	rpool->device = device;
	set_all_resource_max_limit(rpool);

	/* link the pool on both the cgroup's and the device's pool lists */
	INIT_LIST_HEAD(&rpool->cg_node);
	INIT_LIST_HEAD(&rpool->dev_node);
	list_add_tail(&rpool->cg_node, &cg->rpools);
	list_add_tail(&rpool->dev_node, &device->rpools);
	return rpool;
}
150
151
/**
152
* uncharge_cg_locked - uncharge resource for rdma cgroup
153
* @cg: pointer to cg to uncharge and all parents in hierarchy
154
* @device: pointer to rdmacg device
155
* @index: index of the resource to uncharge in cg (resource pool)
156
*
157
* It also frees the resource pool which was created as part of
158
* charging operation when there are no resources attached to
159
* resource pool.
160
*/
161
static void
162
uncharge_cg_locked(struct rdma_cgroup *cg,
163
struct rdmacg_device *device,
164
enum rdmacg_resource_type index)
165
{
166
struct rdmacg_resource_pool *rpool;
167
168
rpool = find_cg_rpool_locked(cg, device);
169
170
/*
171
* rpool cannot be null at this stage. Let kernel operate in case
172
* if there a bug in IB stack or rdma controller, instead of crashing
173
* the system.
174
*/
175
if (unlikely(!rpool)) {
176
pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
177
return;
178
}
179
180
rpool->resources[index].usage--;
181
182
/*
183
* A negative count (or overflow) is invalid,
184
* it indicates a bug in the rdma controller.
185
*/
186
WARN_ON_ONCE(rpool->resources[index].usage < 0);
187
rpool->usage_sum--;
188
if (rpool->usage_sum == 0 &&
189
rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
190
/*
191
* No user of the rpool and all entries are set to max, so
192
* safe to delete this rpool.
193
*/
194
free_cg_rpool_locked(rpool);
195
}
196
}
197
198
/**
 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @stop_cg: while traversing hierarchy, when meet with stop_cg cgroup
 *           stop uncharging
 * @index: index of the resource to uncharge in cg in given resource pool
 *
 * Walks from @cg up to (but not including) @stop_cg, uncharging each
 * level; @stop_cg == NULL uncharges the whole ancestry.  Drops the css
 * reference that the matching charge operation took on @cg.
 */
static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
				      struct rdmacg_device *device,
				      struct rdma_cgroup *stop_cg,
				      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *p;

	mutex_lock(&rdmacg_mutex);

	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
		uncharge_cg_locked(p, device, index);

	mutex_unlock(&rdmacg_mutex);

	/* release the reference taken by get_current_rdmacg() at charge time */
	css_put(&cg->css);
}
222
223
/**
224
* rdmacg_uncharge - hierarchically uncharge rdma resource count
225
* @cg: pointer to cg to uncharge and all parents in hierarchy
226
* @device: pointer to rdmacg device
227
* @index: index of the resource to uncharge in cgroup in given resource pool
228
*/
229
void rdmacg_uncharge(struct rdma_cgroup *cg,
230
struct rdmacg_device *device,
231
enum rdmacg_resource_type index)
232
{
233
if (index >= RDMACG_RESOURCE_MAX)
234
return;
235
236
rdmacg_uncharge_hierarchy(cg, device, NULL, index);
237
}
238
EXPORT_SYMBOL(rdmacg_uncharge);
239
240
/**
 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 * @rdmacg: on success, set to the rdma cgroup which now owns this charge
 * @device: pointer to rdmacg device
 * @index: index of the resource to charge in cgroup (resource pool)
 *
 * This function follows charging resource in hierarchical way.
 * It will fail if the charge would cause the new value to exceed the
 * hierarchical limit.
 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 *
 * Charger needs to account resources on two criteria.
 * (a) per cgroup & (b) per device resource usage.
 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
 * the configured limits. Per device provides granular configuration
 * in multi device usage. It allocates resource pool in the hierarchy
 * for each parent it come across for first resource. Later on resource
 * pool will be available. Therefore it will be much faster thereon
 * to charge/uncharge.
 */
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
		      struct rdmacg_device *device,
		      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *cg, *p;
	struct rdmacg_resource_pool *rpool;
	s64 new;
	int ret = 0;

	if (index >= RDMACG_RESOURCE_MAX)
		return -EINVAL;

	/*
	 * hold on to css, as cgroup can be removed but resource
	 * accounting happens on css.
	 */
	cg = get_current_rdmacg();

	mutex_lock(&rdmacg_mutex);
	/* charge every level from cg up to the root */
	for (p = cg; p; p = parent_rdmacg(p)) {
		rpool = get_cg_rpool_locked(p, device);
		if (IS_ERR(rpool)) {
			ret = PTR_ERR(rpool);
			goto err;
		} else {
			/* s64 'new' avoids overflow when usage == S32_MAX */
			new = rpool->resources[index].usage + 1;
			if (new > rpool->resources[index].max) {
				ret = -EAGAIN;
				goto err;
			} else {
				rpool->resources[index].usage = new;
				rpool->usage_sum++;
			}
		}
	}
	mutex_unlock(&rdmacg_mutex);

	*rdmacg = cg;
	return 0;

err:
	mutex_unlock(&rdmacg_mutex);
	/*
	 * Unwind only the levels already charged: walk from cg up to
	 * (excluding) the level 'p' that failed.  This also drops the
	 * css reference taken above.
	 */
	rdmacg_uncharge_hierarchy(cg, device, p, index);
	return ret;
}
EXPORT_SYMBOL(rdmacg_try_charge);
307
308
/**
309
* rdmacg_register_device - register rdmacg device to rdma controller.
310
* @device: pointer to rdmacg device whose resources need to be accounted.
311
*
312
* If IB stack wish a device to participate in rdma cgroup resource
313
* tracking, it must invoke this API to register with rdma cgroup before
314
* any user space application can start using the RDMA resources.
315
*/
316
void rdmacg_register_device(struct rdmacg_device *device)
317
{
318
INIT_LIST_HEAD(&device->dev_node);
319
INIT_LIST_HEAD(&device->rpools);
320
321
mutex_lock(&rdmacg_mutex);
322
list_add_tail(&device->dev_node, &rdmacg_devices);
323
mutex_unlock(&rdmacg_mutex);
324
}
325
EXPORT_SYMBOL(rdmacg_register_device);
326
327
/**
 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 * @device: pointer to rdmacg device which was previously registered with rdma
 *          controller using rdmacg_register_device().
 *
 * IB stack must invoke this after all the resources of the IB device
 * are destroyed and after ensuring that no more resources will be created
 * when this API is invoked.
 */
void rdmacg_unregister_device(struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *rpool, *tmp;

	/*
	 * Synchronize with any active resource settings,
	 * usage query happening via configfs.
	 */
	mutex_lock(&rdmacg_mutex);
	list_del_init(&device->dev_node);

	/*
	 * Now that this device is off the cgroup list, it's safe to free
	 * all the rpool resources.
	 */
	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
		free_cg_rpool_locked(rpool);

	mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_unregister_device);
357
358
/*
 * Parse one "<resource>=<value>" token from the "rdma.max" write.
 * On success returns the resource index and stores the parsed value in
 * *intval (S32_MAX for the literal "max"); returns a negative errno for
 * malformed input, an unknown resource name, or a negative value.
 * Note: @c is modified in place by strsep().
 */
static int parse_resource(char *c, int *intval)
{
	substring_t argstr;
	char *name, *value = c;
	size_t len;
	int ret, i;

	name = strsep(&value, "=");
	if (!name || !value)
		return -EINVAL;

	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
	if (i < 0)
		return i;

	len = strlen(value);

	argstr.from = value;
	argstr.to = value + len;

	ret = match_int(&argstr, intval);
	if (ret >= 0) {
		/* negative limits are not meaningful */
		if (*intval < 0)
			return -EINVAL;
		return i;
	}
	/*
	 * NOTE(review): comparing only the first strlen(value) bytes means
	 * any prefix of "max" (e.g. "m", "ma", or even "") is accepted as
	 * "max" — presumably deliberate leniency; confirm before tightening.
	 */
	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
		*intval = S32_MAX;
		return i;
	}
	return -EINVAL;
}
390
391
static int rdmacg_parse_limits(char *options,
392
int *new_limits, unsigned long *enables)
393
{
394
char *c;
395
int err = -EINVAL;
396
397
/* parse resource options */
398
while ((c = strsep(&options, " ")) != NULL) {
399
int index, intval;
400
401
index = parse_resource(c, &intval);
402
if (index < 0)
403
goto err;
404
405
new_limits[index] = intval;
406
*enables |= BIT(index);
407
}
408
return 0;
409
410
err:
411
return err;
412
}
413
414
static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
415
{
416
struct rdmacg_device *device;
417
418
lockdep_assert_held(&rdmacg_mutex);
419
420
list_for_each_entry(device, &rdmacg_devices, dev_node)
421
if (!strcmp(name, device->name))
422
return device;
423
424
return NULL;
425
}
426
427
/*
 * Write handler for "rdma.max".  Expected input:
 *   <device-name> <resource>=<value|max> [<resource>=<value|max>]...
 * Updates the per-device resource pool limits of the writing cgroup and
 * frees the pool again if it ends up unused with every limit at "max".
 * Returns @nbytes on success, negative errno on failure.
 */
static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
	const char *dev_name;
	struct rdmacg_resource_pool *rpool;
	struct rdmacg_device *device;
	char *options = strstrip(buf);
	int *new_limits;
	unsigned long enables = 0;
	int i = 0, ret = 0;

	/* extract the device name first */
	dev_name = strsep(&options, " ");
	if (!dev_name) {
		ret = -EINVAL;
		goto err;
	}

	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
	if (!new_limits) {
		ret = -ENOMEM;
		goto err;
	}

	/* fills new_limits[] and one bit in 'enables' per resource seen */
	ret = rdmacg_parse_limits(options, new_limits, &enables);
	if (ret)
		goto parse_err;

	/* acquire lock to synchronize with hot plug devices */
	mutex_lock(&rdmacg_mutex);

	device = rdmacg_get_device_locked(dev_name);
	if (!device) {
		ret = -ENODEV;
		goto dev_err;
	}

	/* creates the pool on first use; may return ERR_PTR(-ENOMEM) */
	rpool = get_cg_rpool_locked(cg, device);
	if (IS_ERR(rpool)) {
		ret = PTR_ERR(rpool);
		goto dev_err;
	}

	/* now set the new limits of the rpool */
	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
		set_resource_limit(rpool, i, new_limits[i]);

	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}

dev_err:
	mutex_unlock(&rdmacg_mutex);

parse_err:
	kfree(new_limits);

err:
	/* report the whole write as consumed on success */
	return ret ?: nbytes;
}
493
494
static void print_rpool_values(struct seq_file *sf,
495
struct rdmacg_resource_pool *rpool)
496
{
497
enum rdmacg_file_type sf_type;
498
int i;
499
u32 value;
500
501
sf_type = seq_cft(sf)->private;
502
503
for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
504
seq_puts(sf, rdmacg_resource_names[i]);
505
seq_putc(sf, '=');
506
if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
507
if (rpool)
508
value = rpool->resources[i].max;
509
else
510
value = S32_MAX;
511
} else {
512
if (rpool)
513
value = rpool->resources[i].usage;
514
else
515
value = 0;
516
}
517
518
if (value == S32_MAX)
519
seq_puts(sf, RDMACG_MAX_STR);
520
else
521
seq_printf(sf, "%d", value);
522
seq_putc(sf, ' ');
523
}
524
}
525
526
static int rdmacg_resource_read(struct seq_file *sf, void *v)
527
{
528
struct rdmacg_device *device;
529
struct rdmacg_resource_pool *rpool;
530
struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
531
532
mutex_lock(&rdmacg_mutex);
533
534
list_for_each_entry(device, &rdmacg_devices, dev_node) {
535
seq_printf(sf, "%s ", device->name);
536
537
rpool = find_cg_rpool_locked(cg, device);
538
print_rpool_values(sf, rpool);
539
540
seq_putc(sf, '\n');
541
}
542
543
mutex_unlock(&rdmacg_mutex);
544
return 0;
545
}
546
547
/* cgroup interface files exposed by the rdma controller. */
static struct cftype rdmacg_files[] = {
	{
		/* writable limits file */
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		/* read-only current-usage file */
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
563
564
static struct cgroup_subsys_state *
565
rdmacg_css_alloc(struct cgroup_subsys_state *parent)
566
{
567
struct rdma_cgroup *cg;
568
569
cg = kzalloc(sizeof(*cg), GFP_KERNEL);
570
if (!cg)
571
return ERR_PTR(-ENOMEM);
572
573
INIT_LIST_HEAD(&cg->rpools);
574
return &cg->css;
575
}
576
577
/* css_free callback: release the rdma cgroup allocated in css_alloc. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
583
584
/**
 * rdmacg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away and responsible
 * for shooting down all rdmacg associated with @css. As part of that it
 * marks all the resource pool entries to max value, so that when resources are
 * uncharged, associated resource pool can be freed as well.
 */
static void rdmacg_css_offline(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);
	struct rdmacg_resource_pool *rpool;

	mutex_lock(&rdmacg_mutex);

	/*
	 * With every limit at "max", the final uncharge of each pool will
	 * satisfy the free condition in uncharge_cg_locked().
	 */
	list_for_each_entry(rpool, &cg->rpools, cg_node)
		set_all_resource_max_limit(rpool);

	mutex_unlock(&rdmacg_mutex);
}
605
606
/* Controller registration; the same cftypes serve cgroup v1 and v2. */
struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};
613
614