GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/devmap.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 */

/* The devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane: bpf
 * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
 * will invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side, xchg() is used to modify the
 * netdev_map array. Then because the datapath does a lookup into the netdev_map
 * array (read-only) from an RCU critical section, we use call_rcu() to wait for
 * an rcu grace period before freeing the old data structures. This ensures the
 * datapath always has a valid copy. However, the datapath does a "flush"
 * operation that pushes any pending packets in the driver outside the RCU
 * critical section. Each bpf_dtab_netdev tracks these pending operations using
 * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
 * this list is empty, indicating outstanding flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search for net devices in the map structure that
 * contain a reference to the net device and remove them. This is a two step
 * process: (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
 * check to see if the ifindex is the same as the net_device being removed.
 * When removing the dev, a cmpxchg() is used to ensure the correct dev is
 * removed; in the case of a concurrent update or delete operation it is
 * possible that the initially referenced dev is no longer in the map. As the
 * notifier hook walks the map, we know that new dev references cannot be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 *
 * The devmap_hash type is a map type which interprets keys as ifindexes and
 * indexes these using a hashmap. This allows maps that use ifindex as key to be
 * densely packed instead of having holes in the lookup array for unused
 * ifindexes. The setup and packet enqueue/send code is shared between the two
 * types of devmap; only the lookup and insertion is different.
 */
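
/* A minimal usage sketch (illustrative only; the map and variable names below
 * are assumptions, not part of this file): an XDP program selects an egress
 * device with the bpf_redirect_map() helper, and user space fills the map via
 * the bpf(2) update path handled further down in this file:
 *
 *	// BPF side, assuming a BPF_MAP_TYPE_DEVMAP named tx_ports
 *	return bpf_redirect_map(&tx_ports, key, 0);
 *
 *	// user space, e.g. via libbpf
 *	struct bpf_devmap_val val = { .ifindex = egress_ifindex };
 *	bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
 */
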
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>

#define DEV_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

struct xdp_dev_bulk_queue {
	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
	struct list_head flush_node;
	struct net_device *dev;
	struct net_device *dev_rx;
	struct bpf_prog *xdp_prog;
	unsigned int count;
};

struct bpf_dtab_netdev {
	struct net_device *dev; /* must be first member, due to tracepoint */
	struct hlist_node index_hlist;
	struct bpf_prog *xdp_prog;
	struct rcu_head rcu;
	unsigned int idx;
	struct bpf_devmap_val val;
};

struct bpf_dtab {
	struct bpf_map map;
	struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
	struct list_head list;

	/* these are only used for DEVMAP_HASH type maps */
	struct hlist_head *dev_index_head;
	spinlock_t index_lock;
	unsigned int items;
	u32 n_buckets;
};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static struct hlist_head *dev_map_create_hash(unsigned int entries,
					      int numa_node)
{
	int i;
	struct hlist_head *hash;

	hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
	if (hash != NULL)
		for (i = 0; i < entries; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
						    int idx)
{
	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}

static int dev_map_alloc_check(union bpf_attr *attr)
{
	u32 valsize = attr->value_size;

	/* check sanity of attributes. 2 value sizes supported:
	 * 4 bytes: ifindex
	 * 8 bytes: ifindex + prog fd
	 */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
		return -EINVAL;

	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		/* Hash table size must be power of 2; roundup_pow_of_two()
		 * can overflow into UB on 32-bit arches
		 */
		if (attr->max_entries > 1UL << 31)
			return -EINVAL;
	}

	return 0;
}
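
/* For reference, the UAPI value layout validated above is roughly the
 * following (see struct bpf_devmap_val in include/uapi/linux/bpf.h):
 *
 *	struct bpf_devmap_val {
 *		__u32 ifindex;     // device index to redirect to
 *		union {
 *			int   fd;  // prog fd on map write
 *			__u32 id;  // prog id on map read
 *		} bpf_prog;
 *	};
 */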

static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
	 * verifier prevents writes from the BPF side
	 */
	attr->map_flags |= BPF_F_RDONLY_PROG;
	bpf_map_init_from_attr(&dtab->map, attr);

	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		/* Hash table size must be power of 2 */
		dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
		dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
							   dtab->map.numa_node);
		if (!dtab->dev_index_head)
			return -ENOMEM;

		spin_lock_init(&dtab->index_lock);
	} else {
		dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
						      sizeof(struct bpf_dtab_netdev *),
						      dtab->map.numa_node);
		if (!dtab->netdev_map)
			return -ENOMEM;
	}

	return 0;
}

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
	struct bpf_dtab *dtab;
	int err;

	dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
	if (!dtab)
		return ERR_PTR(-ENOMEM);

	err = dev_map_init_map(dtab, attr);
	if (err) {
		bpf_map_area_free(dtab);
		return ERR_PTR(err);
	}

	spin_lock(&dev_map_lock);
	list_add_tail_rcu(&dtab->list, &dev_map_list);
	spin_unlock(&dev_map_lock);

	return &dtab->map;
}

static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. The following synchronize_rcu() guarantees
	 * that both rcu read critical sections complete and waits for
	 * preempt-disable regions (NAPI being the relevant context here) so we
	 * are certain there will be no further reads against the netdev_map and
	 * all flush operations are complete. Flush operations can only be done
	 * from NAPI context for this reason.
	 */

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	/* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
	 * during the NAPI callback and cleared after the XDP redirect. There is
	 * no explicit RCU read section which protects bpf_redirect_info->map,
	 * but local_bh_disable() also marks the beginning of an RCU section.
	 * This makes the complete softirq callback RCU protected. Thus, after
	 * the following synchronize_rcu() there is no bpf_redirect_info->map ==
	 * map assignment.
	 */
	synchronize_rcu();

	/* Make sure prior __dev_map_entry_free() calls have completed. */
	rcu_barrier();

	if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		for (i = 0; i < dtab->n_buckets; i++) {
			struct bpf_dtab_netdev *dev;
			struct hlist_head *head;
			struct hlist_node *next;

			head = dev_map_index_hash(dtab, i);

			hlist_for_each_entry_safe(dev, next, head, index_hlist) {
				hlist_del_rcu(&dev->index_hlist);
				if (dev->xdp_prog)
					bpf_prog_put(dev->xdp_prog);
				dev_put(dev->dev);
				kfree(dev);
			}
		}

		bpf_map_area_free(dtab->dev_index_head);
	} else {
		for (i = 0; i < dtab->map.max_entries; i++) {
			struct bpf_dtab_netdev *dev;

			dev = rcu_dereference_raw(dtab->netdev_map[i]);
			if (!dev)
				continue;

			if (dev->xdp_prog)
				bpf_prog_put(dev->xdp_prog);
			dev_put(dev->dev);
			kfree(dev);
		}

		bpf_map_area_free(dtab->netdev_map);
	}

	bpf_map_area_free(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= dtab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == dtab->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct hlist_head *head = dev_map_index_hash(dtab, key);
	struct bpf_dtab_netdev *dev;

	hlist_for_each_entry_rcu(dev, head, index_hlist,
				 lockdep_is_held(&dtab->index_lock))
		if (dev->idx == key)
			return dev;

	return NULL;
}

static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
				     void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 idx, *next = next_key;
	struct bpf_dtab_netdev *dev, *next_dev;
	struct hlist_head *head;
	int i = 0;

	if (!key)
		goto find_first;

	idx = *(u32 *)key;

	dev = __dev_map_hash_lookup_elem(map, idx);
	if (!dev)
		goto find_first;

	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
				    struct bpf_dtab_netdev, index_hlist);

	if (next_dev) {
		*next = next_dev->idx;
		return 0;
	}

	i = idx & (dtab->n_buckets - 1);
	i++;

find_first:
	for (; i < dtab->n_buckets; i++) {
		head = dev_map_index_hash(dtab, i);

		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					    struct bpf_dtab_netdev,
					    index_hlist);
		if (next_dev) {
			*next = next_dev->idx;
			return 0;
		}
	}

	return -ENOENT;
}

static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
				struct xdp_frame **frames, int n,
				struct net_device *tx_dev,
				struct net_device *rx_dev)
{
	struct xdp_txq_info txq = { .dev = tx_dev };
	struct xdp_rxq_info rxq = { .dev = rx_dev };
	struct xdp_buff xdp;
	int i, nframes = 0;

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		u32 act;
		int err;

		xdp_convert_frame_to_buff(xdpf, &xdp);
		xdp.txq = &txq;
		xdp.rxq = &rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);
		switch (act) {
		case XDP_PASS:
			err = xdp_update_frame_from_buff(&xdp, xdpf);
			if (unlikely(err < 0))
				xdp_return_frame_rx_napi(xdpf);
			else
				frames[nframes++] = xdpf;
			break;
		default:
			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(tx_dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
			xdp_return_frame_rx_napi(xdpf);
			break;
		}
	}
	return nframes; /* sent frames count */
}

static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
	struct net_device *dev = bq->dev;
	unsigned int cnt = bq->count;
	int sent = 0, err = 0;
	int to_send = cnt;
	int i;

	if (unlikely(!cnt))
		return;

	for (i = 0; i < cnt; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
	}

	if (bq->xdp_prog) {
		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
		if (!to_send)
			goto out;
	}

	sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
	if (sent < 0) {
		/* If ndo_xdp_xmit fails with an errno, no frames have
		 * been xmit'ed.
		 */
		err = sent;
		sent = 0;
	}

	/* If not all frames have been transmitted, it is our
	 * responsibility to free them
	 */
	for (i = sent; unlikely(i < to_send); i++)
		xdp_return_frame_rx_napi(bq->q[i]);

out:
	bq->count = 0;
	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}

/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
 * driver before returning from its napi->poll() routine. See the comment above
 * xdp_do_flush() in filter.c.
 */
void __dev_flush(struct list_head *flush_list)
{
	struct xdp_dev_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		bq_xmit_all(bq, XDP_XMIT_FLUSH);
		bq->dev_rx = NULL;
		bq->xdp_prog = NULL;
		__list_del_clearprev(&bq->flush_node);
	}
}
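
/* Roughly, the redirect fast path ties this file together as follows (sketch):
 *
 *	XDP program returns bpf_redirect_map(devmap, key, 0)
 *	  -> xdp_do_redirect() in net/core/filter.c -> dev_map_enqueue()
 *	    -> bq_enqueue() appends to the per-CPU xdp_dev_bulk_queue
 *	driver finishes its NAPI poll -> xdp_do_flush()
 *	  -> __dev_flush() -> bq_xmit_all() -> ndo_xdp_xmit()
 */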

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *obj;

	if (key >= map->max_entries)
		return NULL;

	obj = rcu_dereference_check(dtab->netdev_map[key],
				    rcu_read_lock_bh_held());
	return obj;
}

/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
 * variable access, and map elements stick around. See comment above
 * xdp_do_flush() in filter.c.
 */
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);

	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
		bq_xmit_all(bq, 0);

	/* Ingress dev_rx will be the same for all xdp_frame's in the
	 * bulk_queue, because bq is stored per-CPU and must be flushed
	 * at the end of the net_device driver's NAPI function.
	 *
	 * Do the same with xdp_prog and flush_list since these fields
	 * are only ever modified together.
	 */
	if (!bq->dev_rx) {
		struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();

		bq->dev_rx = dev_rx;
		bq->xdp_prog = xdp_prog;
		list_add(&bq->flush_node, flush_list);
	}

	bq->q[bq->count++] = xdpf;
}

static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
				struct net_device *dev_rx,
				struct bpf_prog *xdp_prog)
{
	int err;

	if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
		return -EOPNOTSUPP;

	if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
		     xdp_frame_has_frags(xdpf)))
		return -EOPNOTSUPP;

	err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
	if (unlikely(err))
		return err;

	bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
	return 0;
}

static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{
	struct xdp_txq_info txq = { .dev = dst->dev };
	struct xdp_buff xdp;
	u32 act;

	if (!dst->xdp_prog)
		return XDP_PASS;

	__skb_pull(skb, skb->mac_len);
	xdp.txq = &txq;

	act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
	switch (act) {
	case XDP_PASS:
		__skb_push(skb, skb->mac_len);
		break;
	default:
		bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(dst->dev, dst->xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		kfree_skb(skb);
		break;
	}

	return act;
}

int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	struct net_device *dev = dst->dev;

	return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}

static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
	if (!obj)
		return false;

	if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
		return false;

	if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
		     xdp_frame_has_frags(xdpf)))
		return false;

	if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
		return false;

	return true;
}

static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
				 struct net_device *dev_rx,
				 struct xdp_frame *xdpf)
{
	struct xdp_frame *nxdpf;

	nxdpf = xdpf_clone(xdpf);
	if (!nxdpf)
		return -ENOMEM;

	bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);

	return 0;
}

static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
{
	while (num_excluded--) {
		if (ifindex == excluded[num_excluded])
			return true;
	}
	return false;
}

/* Get ifindex of each upper device. 'indexes' must be able to hold at
 * least MAX_NEST_DEV elements.
 * Returns the number of ifindexes added.
 */
static int get_upper_ifindexes(struct net_device *dev, int *indexes)
{
	struct net_device *upper;
	struct list_head *iter;
	int n = 0;

	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
		indexes[n++] = upper->ifindex;
	}
	return n;
}

int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
			  struct bpf_map *map, bool exclude_ingress)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dst, *last_dst = NULL;
	int excluded_devices[1+MAX_NEST_DEV];
	struct hlist_head *head;
	int num_excluded = 0;
	unsigned int i;
	int err;

	if (exclude_ingress) {
		num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
		excluded_devices[num_excluded++] = dev_rx->ifindex;
	}

	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
		for (i = 0; i < map->max_entries; i++) {
			dst = rcu_dereference_check(dtab->netdev_map[i],
						    rcu_read_lock_bh_held());
			if (!is_valid_dst(dst, xdpf))
				continue;

			if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
				continue;

			/* we only need n-1 clones; last_dst enqueued below */
			if (!last_dst) {
				last_dst = dst;
				continue;
			}

			err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
			if (err)
				return err;

			last_dst = dst;
		}
	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
		for (i = 0; i < dtab->n_buckets; i++) {
			head = dev_map_index_hash(dtab, i);
			hlist_for_each_entry_rcu(dst, head, index_hlist,
						 lockdep_is_held(&dtab->index_lock)) {
				if (!is_valid_dst(dst, xdpf))
					continue;

				if (is_ifindex_excluded(excluded_devices, num_excluded,
							dst->dev->ifindex))
					continue;

				/* we only need n-1 clones; last_dst enqueued below */
				if (!last_dst) {
					last_dst = dst;
					continue;
				}

				err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
				if (err)
					return err;

				last_dst = dst;
			}
		}
	}

	/* consume the last copy of the frame */
	if (last_dst)
		bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
	else
		xdp_return_frame_rx_napi(xdpf); /* dtab is empty */

	return 0;
}

int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
			     const struct bpf_prog *xdp_prog)
{
	int err;

	err = xdp_ok_fwd_dev(dst->dev, skb->len);
	if (unlikely(err))
		return err;

	/* Redirect has already succeeded semantically at this point, so we just
	 * return 0 even if packet is dropped. Helper below takes care of
	 * freeing skb.
	 */
	if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
		return 0;

	skb->dev = dst->dev;
	generic_xdp_tx(skb, xdp_prog);

	return 0;
}

static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
				  struct sk_buff *skb,
				  const struct bpf_prog *xdp_prog)
{
	struct sk_buff *nskb;
	int err;

	nskb = skb_clone(skb, GFP_ATOMIC);
	if (!nskb)
		return -ENOMEM;

	err = dev_map_generic_redirect(dst, nskb, xdp_prog);
	if (unlikely(err)) {
		consume_skb(nskb);
		return err;
	}

	return 0;
}

int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
			   const struct bpf_prog *xdp_prog,
			   struct bpf_map *map, bool exclude_ingress)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dst, *last_dst = NULL;
	int excluded_devices[1+MAX_NEST_DEV];
	struct hlist_head *head;
	struct hlist_node *next;
	int num_excluded = 0;
	unsigned int i;
	int err;

	if (exclude_ingress) {
		num_excluded = get_upper_ifindexes(dev, excluded_devices);
		excluded_devices[num_excluded++] = dev->ifindex;
	}

	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
		for (i = 0; i < map->max_entries; i++) {
			dst = rcu_dereference_check(dtab->netdev_map[i],
						    rcu_read_lock_bh_held());
			if (!dst)
				continue;

			if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
				continue;

			/* we only need n-1 clones; last_dst enqueued below */
			if (!last_dst) {
				last_dst = dst;
				continue;
			}

			err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
			if (err)
				return err;

			last_dst = dst;
		}
	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
		for (i = 0; i < dtab->n_buckets; i++) {
			head = dev_map_index_hash(dtab, i);
			hlist_for_each_entry_safe(dst, next, head, index_hlist) {
				if (is_ifindex_excluded(excluded_devices, num_excluded,
							dst->dev->ifindex))
					continue;

				/* we only need n-1 clones; last_dst enqueued below */
				if (!last_dst) {
					last_dst = dst;
					continue;
				}

				err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
				if (err)
					return err;

				last_dst = dst;
			}
		}
	}

	/* consume the first skb and return */
	if (last_dst)
		return dev_map_generic_redirect(last_dst, skb, xdp_prog);

	/* dtab is empty */
	consume_skb(skb);
	return 0;
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);

	return obj ? &obj->val : NULL;
}

static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
								 *(u32 *)key);
	return obj ? &obj->val : NULL;
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *dev;

	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	if (dev->xdp_prog)
		bpf_prog_put(dev->xdp_prog);
	dev_put(dev->dev);
	kfree(dev);
}

static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	u32 k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
	if (old_dev) {
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
		atomic_dec((atomic_t *)&dtab->items);
	}
	return 0;
}

static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	u32 k = *(u32 *)key;
	unsigned long flags;
	int ret = -ENOENT;

	spin_lock_irqsave(&dtab->index_lock, flags);

	old_dev = __dev_map_hash_lookup_elem(map, k);
	if (old_dev) {
		dtab->items--;
		hlist_del_init_rcu(&old_dev->index_hlist);
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
		ret = 0;
	}
	spin_unlock_irqrestore(&dtab->index_lock, flags);

	return ret;
}

static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
						    struct bpf_dtab *dtab,
						    struct bpf_devmap_val *val,
						    unsigned int idx)
{
	struct bpf_prog *prog = NULL;
	struct bpf_dtab_netdev *dev;

	dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
				   GFP_NOWAIT,
				   dtab->map.numa_node);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	dev->dev = dev_get_by_index(net, val->ifindex);
	if (!dev->dev)
		goto err_out;

	if (val->bpf_prog.fd > 0) {
		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
					     BPF_PROG_TYPE_XDP, false);
		if (IS_ERR(prog))
			goto err_put_dev;
		if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
		    !bpf_prog_map_compatible(&dtab->map, prog))
			goto err_put_prog;
	}

	dev->idx = idx;
	if (prog) {
		dev->xdp_prog = prog;
		dev->val.bpf_prog.id = prog->aux->id;
	} else {
		dev->xdp_prog = NULL;
		dev->val.bpf_prog.id = 0;
	}
	dev->val.ifindex = val->ifindex;

	return dev;
err_put_prog:
	bpf_prog_put(prog);
err_put_dev:
	dev_put(dev->dev);
err_out:
	kfree(dev);
	return ERR_PTR(-EINVAL);
}

static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
				  void *key, void *value, u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev, *old_dev;
	struct bpf_devmap_val val = {};
	u32 i = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	/* already verified value_size <= sizeof val */
	memcpy(&val, value, map->value_size);

	if (!val.ifindex) {
		dev = NULL;
		/* can not specify fd if ifindex is 0 */
		if (val.bpf_prog.fd > 0)
			return -EINVAL;
	} else {
		dev = __dev_map_alloc_node(net, dtab, &val, i);
		if (IS_ERR(dev))
			return PTR_ERR(dev);
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed,
	 * remembering that the driver side flush operation will happen before
	 * the net device is removed.
	 */
	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	else
		atomic_inc((atomic_t *)&dtab->items);

	return 0;
}

static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	return __dev_map_update_elem(current->nsproxy->net_ns,
				     map, key, value, map_flags);
}

static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
				       void *key, void *value, u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev, *old_dev;
	struct bpf_devmap_val val = {};
	u32 idx = *(u32 *)key;
	unsigned long flags;
	int err = -EEXIST;

	/* already verified value_size <= sizeof val */
	memcpy(&val, value, map->value_size);

	if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
		return -EINVAL;

	spin_lock_irqsave(&dtab->index_lock, flags);

	old_dev = __dev_map_hash_lookup_elem(map, idx);
	if (old_dev && (map_flags & BPF_NOEXIST))
		goto out_err;

	dev = __dev_map_alloc_node(net, dtab, &val, idx);
	if (IS_ERR(dev)) {
		err = PTR_ERR(dev);
		goto out_err;
	}

	if (old_dev) {
		hlist_del_rcu(&old_dev->index_hlist);
	} else {
		if (dtab->items >= dtab->map.max_entries) {
			spin_unlock_irqrestore(&dtab->index_lock, flags);
			call_rcu(&dev->rcu, __dev_map_entry_free);
			return -E2BIG;
		}
		dtab->items++;
	}

	hlist_add_head_rcu(&dev->index_hlist,
			   dev_map_index_hash(dtab, idx));
	spin_unlock_irqrestore(&dtab->index_lock, flags);

	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;

out_err:
	spin_unlock_irqrestore(&dtab->index_lock, flags);
	return err;
}

static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{
	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
					  map, key, value, map_flags);
}

static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
	return __bpf_xdp_redirect_map(map, ifindex, flags,
				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
				      __dev_map_lookup_elem);
}

static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
	return __bpf_xdp_redirect_map(map, ifindex, flags,
				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
				      __dev_map_hash_lookup_elem);
}

static u64 dev_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u64 usage = sizeof(struct bpf_dtab);

	if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
		usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
	else
		usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
	usage += atomic_read((atomic_t *)&dtab->items) *
			 (u64)sizeof(struct bpf_dtab_netdev);
	return usage;
}

BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = dev_map_alloc_check,
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = dev_map_mem_usage,
	.map_btf_id = &dev_map_btf_ids[0],
	.map_redirect = dev_map_redirect,
};

const struct bpf_map_ops dev_map_hash_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = dev_map_alloc_check,
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_hash_get_next_key,
	.map_lookup_elem = dev_map_hash_lookup_elem,
	.map_update_elem = dev_map_hash_update_elem,
	.map_delete_elem = dev_map_hash_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = dev_map_mem_usage,
	.map_btf_id = &dev_map_btf_ids[0],
	.map_redirect = dev_hash_map_redirect,
};

static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
				       struct net_device *netdev)
{
	unsigned long flags;
	u32 i;

	spin_lock_irqsave(&dtab->index_lock, flags);
	for (i = 0; i < dtab->n_buckets; i++) {
		struct bpf_dtab_netdev *dev;
		struct hlist_head *head;
		struct hlist_node *next;

		head = dev_map_index_hash(dtab, i);

		hlist_for_each_entry_safe(dev, next, head, index_hlist) {
			if (netdev != dev->dev)
				continue;

			dtab->items--;
			hlist_del_rcu(&dev->index_hlist);
			call_rcu(&dev->rcu, __dev_map_entry_free);
		}
	}
	spin_unlock_irqrestore(&dtab->index_lock, flags);
}

static int dev_map_notification(struct notifier_block *notifier,
				ulong event, void *ptr)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
	struct bpf_dtab *dtab;
	int i, cpu;

	switch (event) {
	case NETDEV_REGISTER:
		if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
			break;

		/* will be freed in free_netdev() */
		netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
		if (!netdev->xdp_bulkq)
			return NOTIFY_BAD;

		for_each_possible_cpu(cpu)
			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
		break;
	case NETDEV_UNREGISTER:
		/* This rcu_read_lock/unlock pair is needed because
		 * dev_map_list is an RCU list AND to ensure a delete
		 * operation does not free a netdev_map entry while we
		 * are comparing it against the netdev being unregistered.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(dtab, &dev_map_list, list) {
			if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
				dev_map_hash_remove_netdev(dtab, netdev);
				continue;
			}

			for (i = 0; i < dtab->map.max_entries; i++) {
				struct bpf_dtab_netdev *dev, *odev;

				dev = rcu_dereference(dtab->netdev_map[i]);
				if (!dev || netdev != dev->dev)
					continue;
				odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
				if (dev == odev) {
					call_rcu(&dev->rcu,
						 __dev_map_entry_free);
					atomic_dec((atomic_t *)&dtab->items);
				}
			}
		}
		rcu_read_unlock();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
	.notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
		     offsetof(struct _bpf_dtab_netdev, dev));
	register_netdevice_notifier(&dev_map_notifier);

	return 0;
}

subsys_initcall(dev_map_init);