GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/memalloc.c

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 *		CPU_0 buckets
 * 16 32 64 96 128 196 256 512 1024 2048 4096
 * ...
 *		CPU_N buckets
 * 16 32 64 96 128 196 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain a
 * struct llist_node.
*/
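
/* Illustrative usage sketch (not upstream code): a map-like user of this
 * allocator would set up one allocator from process context and then
 * allocate/free from any context with migration disabled, along the lines of:
 *
 *	struct bpf_mem_alloc ma;
 *	void *obj;
 *
 *	if (bpf_mem_alloc_init(&ma, 64, false))	// one fixed 64-byte bucket per cpu
 *		return -ENOMEM;
 *	obj = bpf_mem_cache_alloc(&ma);		// safe in any context, incl. NMI
 *	if (obj)
 *		bpf_mem_cache_free(&ma, obj);	// goes into this cpu's bucket
 *	bpf_mem_alloc_destroy(&ma);		// process context only
 */
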
#define LLIST_NODE_SZ sizeof(struct llist_node)

#define BPF_MEM_ALLOC_SIZE_MAX 4096

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}
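
/* A few worked examples of the mapping above (illustrative only), using the
 * sizes[] table defined below:
 *
 *	bpf_mem_cache_idx(24)  == size_index[2] - 1  == 3	-> 32-byte bucket
 *	bpf_mem_cache_idx(100) == size_index[12] - 1 == 5	-> 128-byte bucket
 *	bpf_mem_cache_idx(300) == fls(299) - 2       == 7	-> 512-byte bucket
 */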

#define NUM_CACHES 11

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_llist from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};
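
/* Rough layout of one unit (illustrative sketch derived from unit_alloc(),
 * unit_free() and __alloc() below): every unit is prefixed by an 8-byte
 * header that is a struct llist_node while the unit sits on a free list,
 * and holds the owning cache pointer while the unit is allocated, e.g.:
 *
 *	void *ptr = bpf_mem_cache_alloc(ma);	// ptr = unit + LLIST_NODE_SZ
 *	struct bpf_mem_cache *c = *(void **)(ptr - LLIST_NODE_SZ);
 *	// c is the bucket that handed out 'ptr'; bpf_mem_free() relies on this
 *
 * For percpu units the word right after the header stores the __percpu
 * pointer set up by __alloc().
 */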

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
		void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu)
		free_percpu(((void __percpu **)obj)[1]);

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing a new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload, just ask RCU to hurry up. There could be many
		 * objects in the free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For the typical bpf map case that uses bpf_mem_cache_alloc and a single
 * bucket, the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used, the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using the heuristic below.
 * An initialized, but unused bpf allocator (not a bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * The typical case will be between 11K and 116K, closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potentially large and
 * unnecessary memory consumption, set low_mark = 1 and high_mark = 3,
 * resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}
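
/* Worked example of the heuristic above (illustrative only): for a unit_size
 * of 512 the else-branch gives
 *
 *	low_watermark  = max(32 * 256 / 512, 1)    = 16
 *	high_watermark = max(96 * 256 / 512, 3)    = 48
 *	batch          = max((48 - 16) / 4 * 3, 1) = 24
 *
 * and the per-cpu freelist total quoted in the comment works out to
 * 64*(16+32+64+96+128+196+256) + 32*512 + 16*1024 + 8*2048 + 4*4096
 * = 50432 + 65536 = 115968 bytes, i.e. the "~ 116 Kbyte" figure.
 */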

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * the 1st run of a bpf prog won't be doing more than 4 map_update_elem
	 * from an irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}

/* When size != 0 allocate one bpf_mem_cache for each cpu.
 * This is the typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is the bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
	struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}
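
/* Usage sketch for the size == 0 flavour (illustrative, not upstream code):
 * callers that do not know their allocation sizes up front initialize with
 * size == 0 and pass the size to every allocation, e.g.:
 *
 *	struct bpf_mem_alloc ma;
 *	void *p;
 *
 *	if (bpf_mem_alloc_init(&ma, 0, false))
 *		return -ENOMEM;
 *	p = bpf_mem_alloc(&ma, 200);	// 200 + 8 byte header -> 256-byte bucket
 *	if (p)
 *		bpf_mem_free(&ma, p);
 *	bpf_mem_alloc_destroy(&ma);
 */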

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of the map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_dfl_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruption.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_llist access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before another task preempts the current task.
	 */
	local_irq_restore(flags);

	return llnode;
}

/* Though the 'ptr' object could have been allocated on a different cpu,
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember the bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add the object to the
		 * atomic llist. free_bulk() will drain it. Though free_llist_extra
		 * is a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by a bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when the current
	 * task is preempted by a task which does unit_alloc(), unit_alloc() may
	 * return NULL unexpectedly because the irq work is already pending but
	 * cannot be triggered and free_llist cannot be refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}
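
/* Usage sketch (illustrative, not upstream code): a caller that knows it runs
 * in sleepable/process context can opt into the kmalloc fallback above by
 * passing GFP_KERNEL, so a temporarily empty free_llist does not fail the
 * allocation:
 *
 *	void *obj = bpf_mem_cache_alloc_flags(ma, GFP_KERNEL);
 *
 *	if (!obj)
 *		return -ENOMEM;
 *	...
 *	bpf_mem_cache_free(ma, obj);	// object carries its cache pointer header
 */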

int bpf_mem_alloc_check_size(bool percpu, size_t size)
{
	/* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
	if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
	    (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
		return -E2BIG;

	return 0;
}