GitHub Repository: torvalds/linux
Path: blob/master/kernel/bpf/hashtab.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/rcupdate_wait.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>
#include <asm/rqspinlock.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch
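
/*
 * For illustration: with _name = htab, BATCH_OPS(htab) above expands to the
 * designated initializers
 *	.map_lookup_batch = htab_map_lookup_batch,
 *	.map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
 *	.map_update_batch = generic_map_update_batch,
 *	.map_delete_batch = generic_map_delete_batch
 * which is how the bpf_map_ops tables built from this file pick up the batch
 * helpers defined below.
 */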

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits on where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior invocation
 * from one of these contexts has completed. sys_bpf() uses the same mechanism
 * by pinning the task to the current CPU and incrementing the recursion
 * protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour, these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use a raw spinlock for a preallocated hash map on an RT
 * kernel, because there was no memory allocation within the lock held
 * sections. However, now that the hash map has been fully converted to use
 * bpf_mem_alloc, memory allocation for a non-preallocated hash map is
 * non-synchronous, so it is safe to always use a raw spinlock for the bucket
 * lock.
 */
struct bucket {
	struct hlist_nulls_head head;
	rqspinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)

struct bpf_htab {
	struct bpf_map map;
	struct bpf_mem_alloc ma;
	struct bpf_mem_alloc pcpu_ma;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	/* the number of elements in a non-preallocated hashtable is kept
	 * in either pcount or count
	 */
	struct percpu_counter pcount;
	atomic_t count;
	bool use_percpu_counter;
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		/* pointer to per-cpu pointer */
		void *ptr_to_pptr;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[] __aligned(8);
};
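
/*
 * Layout sketch (illustrative, non-percpu case): an element is laid out as
 *	[struct htab_elem][key, padded to a multiple of 8][value]
 * so htab_elem_value() below is simply l->key + round_up(key_size, 8), and
 * htab_map_alloc() computes elem_size the same way. For percpu maps the value
 * slot instead holds a pointer to the per-cpu storage.
 */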

static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned int i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
		cond_resched();
	}
}

static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
{
	unsigned long flags;
	int ret;

	ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
	if (ret)
		return ret;
	*pflags = flags;
	return 0;
}

static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
{
	raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
}
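
/*
 * Usage sketch (illustrative only): unlike a plain spin_lock_irqsave(),
 * htab_lock_bucket() can fail, because the resilient queued spinlock gives
 * up with an error instead of spinning forever when it detects deadlock or
 * timeout. Every caller below therefore follows the pattern
 *
 *	ret = htab_lock_bucket(b, &flags);
 *	if (ret)
 *		return ret;
 *	... operate on b->head under the lock ...
 *	htab_unlock_bucket(b, flags);
 *
 * and propagates the error to its own caller.
 */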

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	       htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline bool is_fd_htab(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
}

static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
{
	return l->key + round_up(key_size, 8);
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)htab_elem_value(l, key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)htab_elem_value(l, key_size);
}

static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)htab_elem_value(l, map->key_size);
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}

/* Both percpu and fd htab support in-place update, so no need for
 * extra elem. LRU itself can remove the least used element, so
 * there is no need for an extra elem during map_update.
 */
static bool htab_has_extra_elems(struct bpf_htab *htab)
{
	return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}

static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
{
	if (btf_record_has_field(htab->map.record, BPF_TIMER))
		bpf_obj_free_timer(htab->map.record,
				   htab_elem_value(elem, htab->map.key_size));
	if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
		bpf_obj_free_workqueue(htab->map.record,
				       htab_elem_value(elem, htab->map.key_size));
	if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
		bpf_obj_free_task_work(htab->map.record,
				       htab_elem_value(elem, htab->map.key_size));
}

static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		htab_free_internal_structs(htab, elem);
		cond_resched();
	}
}

static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (IS_ERR_OR_NULL(htab->map.record))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();
	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (htab_is_percpu(htab)) {
			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
			int cpu;

			for_each_possible_cpu(cpu) {
				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
				cond_resched();
			}
		} else {
			bpf_obj_free_fields(htab->map.record,
					    htab_elem_value(elem, htab->map.key_size));
			cond_resched();
		}
		cond_resched();
	}
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code paths of
 * bpf_lru_pop_free(), which is called by prealloc_lru_pop(),
 * will acquire lru_lock first, followed by bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, lock acquisition of
 * bucket_lock followed by lru_lock is not allowed. In such cases,
 * bucket_lock needs to be released before acquiring lru_lock.
 */
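/*
 * Order sketch (illustrative): the only legal nesting is
 *
 *	lru_lock
 *	  -> bucket_lock	(inside bpf_lru_list.c, e.g. via prealloc_lru_pop())
 *
 * so code in this file that already holds a bucket lock and needs the LRU
 * (bpf_lru_push_free() and friends) always drops the bucket lock first, as
 * the update, delete and batch paths below do.
 */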
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		bpf_map_inc_elem_count(&htab->map);
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;
	/* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
	if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err;

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >= 1 element.
		 * while we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2; roundup_pow_of_two() can overflow
	 * into UB on 32-bit arches, so check that first
	 */
	err = -E2BIG;
	if (htab->map.max_entries > 1UL << 31)
		goto free_htab;

	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	/* check for u32 overflow */
	if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	err = bpf_map_init_elem_count(&htab->map);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_elem_count;

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_u32();

	htab_init_buckets(htab);

	/* compute_batch_value() computes batch value as num_online_cpus() * 2
	 * and __percpu_counter_compare() needs
	 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
	 * for percpu_counter to be faster than atomic_t. In practice the average bpf
	 * hash map size is 10k, which means that a system with 64 cpus will fill
	 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
	 * define our own batch count as 32, so a 10k hash map can be filled up to 80%:
	 * 10k - 8k > 32 _batch_ * 64 _cpus_
	 * and __percpu_counter_compare() will still be fast. At that point hash map
	 * collisions will dominate its performance anyway. Assume that a hash map
	 * filled to 50+% isn't going to be O(1), and use the following formula to
	 * choose between percpu_counter and atomic_t.
	 */
#define PERCPU_COUNTER_BATCH 32
	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
		htab->use_percpu_counter = true;
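
	/* Worked example (illustrative): with max_entries = 10000 and 64
	 * online CPUs, 10000 / 2 = 5000 > 64 * 32 = 2048, so the percpu
	 * counter is chosen; a small map of, say, 1000 entries
	 * (500 <= 2048) falls back to atomic_t.
	 */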

	if (htab->use_percpu_counter) {
		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
		if (err)
			goto free_map_locked;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (htab_has_extra_elems(htab)) {
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	} else {
		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
		if (err)
			goto free_map_locked;
		if (percpu) {
			err = bpf_mem_alloc_init(&htab->pcpu_ma,
						 round_up(htab->map.value_size, 8), true);
			if (err)
				goto free_map_locked;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_map_locked:
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
free_elem_count:
	bpf_map_free_elem_count(&htab->map);
free_htab:
	bpf_map_area_free(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	if (likely(key_len % 4 == 0))
		return jhash2(key, key_len / 4, hashrnd);
	return jhash(key, key_len, hashrnd);
}
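
/*
 * Note (illustrative): most hash map keys are a multiple of 4 bytes, e.g. a
 * 16-byte key hashes via jhash2(key, 4, hashrnd), so the word-wise jhash2()
 * fast path is the common case; only odd-sized keys take the byte-wise
 * jhash().
 */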

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without the bucket lock. it will repeat the loop in
 * the unlikely event that elements moved from one bucket into another
 * while the linked list was being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}

/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return htab_elem_value(l, map->key_size);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return htab_elem_value(l, map->key_size);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static int htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static void check_and_free_fields(struct bpf_htab *htab,
				  struct htab_elem *elem)
{
	if (IS_ERR_OR_NULL(htab->map.record))
		return;

	if (htab_is_percpu(htab)) {
		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
		int cpu;

		for_each_possible_cpu(cpu)
			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
	} else {
		void *map_value = htab_elem_value(elem, htab->map.key_size);

		bpf_obj_free_fields(htab->map.record, map_value);
	}
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;
	int ret;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	ret = htab_lock_bucket(b, &flags);
	if (ret)
		return false;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			bpf_map_dec_elem_count(&htab->map);
			break;
		}

	htab_unlock_bucket(b, flags);

	if (l == tgt_l)
		check_and_free_fields(htab, l);
	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	check_and_free_fields(htab, l);

	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
	bpf_mem_cache_free(&htab->ma, l);
}

static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;
	void *ptr;

	if (map->ops->map_fd_put_ptr) {
		ptr = fd_htab_map_get_ptr(map, l);
		map->ops->map_fd_put_ptr(map, ptr, true);
	}
}

static bool is_map_full(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
						PERCPU_COUNTER_BATCH) >= 0;
	return atomic_read(&htab->count) >= htab->map.max_entries;
}

static void inc_elem_count(struct bpf_htab *htab)
{
	bpf_map_inc_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
	else
		atomic_inc(&htab->count);
}

static void dec_elem_count(struct bpf_htab *htab)
{
	bpf_map_dec_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
	else
		atomic_dec(&htab->count);
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	htab_put_fd_value(htab, l);

	if (htab_is_prealloc(htab)) {
		bpf_map_dec_elem_count(&htab->map);
		check_and_free_fields(htab, l);
		pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		dec_elem_count(htab);
		htab_elem_free(htab, l);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
			off += size;
		}
	}
}

static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for other cpus. Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (!onallcpus) {
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
			else /* Since elem is preallocated, we cannot touch special fields */
				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}

static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return is_fd_htab(htab) && BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
			bpf_map_inc_elem_count(&htab->map);
		}
	} else {
		if (is_map_full(htab))
			if (!old_elem)
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				return ERR_PTR(-E2BIG);
		inc_elem_count(htab);
		l_new = bpf_mem_cache_alloc(&htab->ma);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);

			if (!ptr) {
				bpf_mem_cache_free(&htab->ma, l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
			l_new->ptr_to_pptr = ptr;
			pptr = *(void __percpu **)ptr;
		}

		pcpu_init_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(htab_elem_value(l_new, key_size), value, size);
	} else {
		copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	dec_elem_count(htab);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
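
/*
 * Semantics sketch (illustrative, from the caller's point of view): with
 * BPF_ANY the update always proceeds, BPF_NOEXIST only creates (EEXIST if the
 * key is already present) and BPF_EXIST only replaces (ENOENT if it is not),
 * e.g. bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST) from userspace fails
 * with EEXIST on a duplicate insert.
 */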

/* Called from syscall or from eBPF program */
static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      htab_elem_value(l_old, key_size),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	ret = htab_lock_bucket(b, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      htab_elem_value(l_old, key_size),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);

		/* l_old has already been stashed in htab->extra_elems, free
		 * its special fields before it is available for reuse.
		 */
		if (htab_is_prealloc(htab))
			check_and_free_fields(htab, l_old);
	}
	htab_unlock_bucket(b, flags);
	if (l_old && !htab_is_prealloc(htab))
		free_htab_elem(htab, l_old);
	return 0;
err:
	htab_unlock_bucket(b, flags);
	return ret;
}

static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
	check_and_free_fields(htab, elem);
	bpf_map_dec_elem_count(&htab->map);
	bpf_lru_push_free(&htab->lru, &elem->lru_node);
}

static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);

	ret = htab_lock_bucket(b, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	htab_unlock_bucket(b, flags);

err_lock_bucket:
	if (ret)
		htab_lru_push_free(htab, l_new);
	else if (l_old)
		htab_lru_push_free(htab, l_old);

	return ret;
}

static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
					  void *value, u64 map_flags,
					  bool percpu, bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old;
	struct hlist_nulls_head *head;
	void *old_map_ptr = NULL;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(b, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* Update value in-place */
		if (percpu) {
			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
					value, onallcpus);
		} else {
			void **inner_map_pptr = htab_elem_value(l_old, key_size);

			old_map_ptr = *inner_map_pptr;
			WRITE_ONCE(*inner_map_pptr, *(void **)value);
		}
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, percpu, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
err:
	htab_unlock_bucket(b, flags);
	if (old_map_ptr)
		map->ops->map_fd_put_ptr(map, old_map_ptr, true);
	return ret;
}

static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					      void *value, u64 map_flags,
					      bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	ret = htab_lock_bucket(b, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	htab_unlock_bucket(b, flags);
err_lock_bucket:
	if (l_new) {
		bpf_map_dec_elem_count(&htab->map);
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	}
	return ret;
}

static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					void *value, u64 map_flags)
{
	return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
}

static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					    void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(b, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);
	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(b, flags);

	if (l)
		free_htab_elem(htab, l);
	return ret;
}

static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(b, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(b, flags);
	if (l)
		htab_lru_push_free(htab, l);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	/* It's called from a worker thread and migration has been disabled,
	 * therefore, it is OK to invoke bpf_mem_cache_free() directly.
	 */
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
		cond_resched();
	}
}

static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
{
	int i;

	rcu_read_lock();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry(l, n, head, hash_node) {
			/* We only free timer on uref dropping to zero */
			htab_free_internal_structs(htab, l);
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}

static void htab_map_free_internal_structs(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* We only free timer and workqueue on uref dropping to zero */
	if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
		if (!htab_is_prealloc(htab))
			htab_free_malloced_internal_structs(htab);
		else
			htab_free_prealloced_internal_structs(htab);
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
	 * underneath and is responsible for waiting for callbacks to finish
	 * during bpf_mem_alloc_destroy().
	 */
	if (!htab_is_prealloc(htab)) {
		delete_all_elements(htab);
	} else {
		htab_free_prealloced_fields(htab);
		prealloc_destroy(htab);
	}

	bpf_map_free_elem_count(map);
	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	bpf_map_area_free(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_putc(m, '\n');

	rcu_read_unlock();
}

static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					     void *value, bool is_lru_map,
					     bool is_percpu, u64 flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	unsigned long bflags;
	struct htab_elem *l;
	u32 hash, key_size;
	struct bucket *b;
	int ret;

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(b, &bflags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);
	if (!l) {
		ret = -ENOENT;
		goto out_unlock;
	}

	if (is_percpu) {
		u32 roundup_value_size = round_up(map->value_size, 8);
		void __percpu *pptr;
		int off = 0, cpu;

		pptr = htab_elem_get_ptr(l, key_size);
		for_each_possible_cpu(cpu) {
			copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
			check_and_init_map_value(&htab->map, value + off);
			off += roundup_value_size;
		}
	} else {
		void *src = htab_elem_value(l, map->key_size);

		if (flags & BPF_F_LOCK)
			copy_map_value_locked(map, value, src, true);
		else
			copy_map_value(map, value, src);
		/* Zeroing special fields in the temp buffer */
		check_and_init_map_value(map, value);
	}
	hlist_nulls_del_rcu(&l->hash_node);

out_unlock:
	htab_unlock_bucket(b, bflags);

	if (l) {
		if (is_lru_map)
			htab_lru_push_free(htab, l);
		else
			free_htab_elem(htab, l);
	}

	return ret;
}

static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
						 flags);
}

static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
						  void *key, void *value,
						  u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
						 flags);
}

static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
					       void *value, u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
						 flags);
}

static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
						      void *key, void *value,
						      u64 flags)
{
	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
						 flags);
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size, map_id;
	u32 bucket_cnt, total, key_size, value_size;
	struct htab_elem *node_to_free = NULL;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags = 0;
	bool locked = false;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have up to 5 entries.
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

again:
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(b, &flags);
		if (ret) {
			rcu_read_unlock();
			bpf_enable_instrumentation();
			goto after_loop;
		}
	}

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(b, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(b, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* Next block is only safe to run if you have grabbed the lock */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
				off += size;
			}
		} else {
			value = htab_elem_value(l, key_size);
			if (is_fd_htab(htab)) {
				struct bpf_map **inner_map = value;

				/* Actual value is the id of the inner map */
				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
				value = &map_id;
			}

			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 *
			 * For htab of maps, htab_put_fd_value() in
			 * free_htab_elem() may acquire a spinlock with bucket
			 * lock being held and it violates the lock rule, so
			 * invoke free_htab_elem() after unlock as well.
			 */
			l->batch_flink = node_to_free;
			node_to_free = l;
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	htab_unlock_bucket(b, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		if (is_lru_map)
			htab_lru_push_free(htab, l);
		else
			free_htab_elem(htab, l);
	}

next_batch:
	/* If we are not copying data, we can go to next bucket and avoid
	 * unlocking the rcu.
	 */
	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
		batch++;
		goto again_nocopy;
	}

	rcu_read_unlock();
	bpf_enable_instrumentation();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
		ret = -EFAULT;
		goto after_loop;
	}

	total += bucket_cnt;
	batch++;
	if (batch >= htab->n_buckets) {
		ret = -ENOENT;
		goto after_loop;
	}
	goto again;

after_loop:
	if (ret == -EFAULT)
		goto out;

	/* copy # of entries and next batch */
	ubatch = u64_to_user_ptr(attr->batch.out_batch);
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
	    put_user(total, &uattr->batch.count))
		ret = -EFAULT;

out:
	kvfree(keys);
	kvfree(values);
	return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					const union bpf_attr *attr,
					union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
		      union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					    const union bpf_attr *attr,
					    union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
				     const union bpf_attr *attr,
				     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
}
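
/* State kept per bpf_iter instance walking a hash map: bucket_id and
 * skip_elems record where the previous read stopped so the seq_file
 * can resume, and percpu_value_buf (non-NULL only for per-cpu maps)
 * is scratch space for flattening per-cpu values.
 */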

struct bpf_iter_seq_hash_map_info {
	struct bpf_map *map;
	struct bpf_htab *htab;
	void *percpu_value_buf; // non-zero means percpu hash
	u32 bucket_id;
	u32 skip_elems;
};
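
/* Find the element 'skip_elems' positions into bucket 'bucket_id',
 * continuing from prev_elem when it is still valid. On success the
 * element is returned with rcu_read_lock() held; the matching unlock
 * happens either here, when moving past an exhausted bucket, or in
 * bpf_hash_map_seq_stop().
 */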

static struct htab_elem *
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
			   struct htab_elem *prev_elem)
{
	const struct bpf_htab *htab = info->htab;
	u32 skip_elems = info->skip_elems;
	u32 bucket_id = info->bucket_id;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	struct bucket *b;
	u32 i, count;

	if (bucket_id >= htab->n_buckets)
		return NULL;

	/* try to find next elem in the same bucket */
	if (prev_elem) {
		/* no update/deletion on this bucket, prev_elem should be still valid
		 * and we won't skip elements.
		 */
		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
		if (elem)
			return elem;

		/* not found, unlock and go to the next bucket */
		b = &htab->buckets[bucket_id++];
		rcu_read_unlock();
		skip_elems = 0;
	}

	for (i = bucket_id; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();

		count = 0;
		head = &b->head;
		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
			if (count >= skip_elems) {
				info->bucket_id = i;
				info->skip_elems = count;
				return elem;
			}
			count++;
		}

		rcu_read_unlock();
		skip_elems = 0;
	}

	info->bucket_id = i;
	info->skip_elems = 0;
	return NULL;
}

static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	struct htab_elem *elem;

	elem = bpf_hash_map_seq_find_next(info, NULL);
	if (!elem)
		return NULL;

	if (*pos == 0)
		++*pos;
	return elem;
}

static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;

	++*pos;
	++info->skip_elems;
	return bpf_hash_map_seq_find_next(info, v);
}
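
/* Run the iterator program for one element. A NULL elem tells the
 * program that the walk is complete (ctx.key/ctx.value stay NULL).
 * For per-cpu maps the values of all possible CPUs are flattened into
 * percpu_value_buf, 8-byte aligned per CPU, before the program runs.
 */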

static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	int ret = 0, off = 0, cpu;
	u32 roundup_value_size;
	struct bpf_prog *prog;
	void __percpu *pptr;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, elem == NULL);
	if (prog) {
		ctx.meta = &meta;
		ctx.map = info->map;
		if (elem) {
			ctx.key = elem->key;
			if (!info->percpu_value_buf) {
				ctx.value = htab_elem_value(elem, map->key_size);
			} else {
				roundup_value_size = round_up(map->value_size, 8);
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
					off += roundup_value_size;
				}
				ctx.value = info->percpu_value_buf;
			}
		}
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}
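
/* seq_file callbacks: show() handles a regular element; stop() is called
 * with v == NULL at the end of the walk, in which case the program gets
 * one final invocation with a NULL element, otherwise it only drops the
 * rcu_read_lock() still held by bpf_hash_map_seq_find_next().
 */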

static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_hash_map_seq_show(seq, v);
}

static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_hash_map_seq_show(seq, NULL);
	else
		rcu_read_unlock();
}
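
/* Iterator setup: for per-cpu maps preallocate a buffer large enough for
 * round_up(value_size, 8) bytes per possible CPU, and take a uref on the
 * map so it cannot be freed while the iterator is open.
 */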

static int bpf_iter_init_hash_map(void *priv_data,
				  struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	bpf_map_inc_with_uref(map);
	seq_info->map = map;
	seq_info->htab = container_of(map, struct bpf_htab, map);
	return 0;
}

static void bpf_iter_fini_hash_map(void *priv_data)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;

	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_hash_map_seq_ops = {
	.start = bpf_hash_map_seq_start,
	.next = bpf_hash_map_seq_next,
	.stop = bpf_hash_map_seq_stop,
	.show = bpf_hash_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_hash_map_seq_ops,
	.init_seq_private = bpf_iter_init_hash_map,
	.fini_seq_private = bpf_iter_fini_hash_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
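
/* Walk every element and invoke callback_fn(map, key, value, ctx, 0)
 * under RCU; a non-zero return stops the walk early. The function
 * returns the number of elements visited. This backs the
 * bpf_for_each_map_elem() helper for hash maps, which is why migration
 * must already be disabled (cant_migrate()): the per-cpu value handed
 * to the callback is the one this CPU's bpf_map_lookup_elem() would see.
 */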

static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
				   void *callback_ctx, u64 flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	int i, num_elems = 0;
	void __percpu *pptr;
	struct bucket *b;
	void *key, *val;
	bool is_percpu;
	u64 ret = 0;

	cant_migrate();

	if (flags != 0)
		return -EINVAL;

	is_percpu = htab_is_percpu(htab);

	/* migration has been disabled, so percpu value prepared here will be
	 * the same as the one seen by the bpf program with
	 * bpf_map_lookup_elem().
	 */
	for (i = 0; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();
		head = &b->head;
		hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
			key = elem->key;
			if (is_percpu) {
				/* current cpu value for percpu map */
				pptr = htab_elem_get_ptr(elem, map->key_size);
				val = this_cpu_ptr(pptr);
			} else {
				val = htab_elem_value(elem, map->key_size);
			}
			num_elems++;
			ret = callback_fn((u64)(long)map, (u64)(long)key,
					  (u64)(long)val, (u64)(long)callback_ctx, 0);
			/* return value: 0 - continue, 1 - stop and return */
			if (ret) {
				rcu_read_unlock();
				goto out;
			}
		}
		rcu_read_unlock();
	}
out:
	return num_elems;
}
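
/* Rough memory accounting reported to userspace. For preallocated maps
 * usage covers the bucket array and max_entries full elements (plus one
 * extra element per possible CPU when the map keeps extra_elems);
 * non-preallocated maps count only the currently allocated elements plus
 * allocator list overhead. Per-cpu maps additionally account
 * round_up(value_size, 8) * num_possible_cpus() per element.
 */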

static u64 htab_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 value_size = round_up(htab->map.value_size, 8);
	bool prealloc = htab_is_prealloc(htab);
	bool percpu = htab_is_percpu(htab);
	bool lru = htab_is_lru(htab);
	u64 num_entries;
	u64 usage = sizeof(struct bpf_htab);

	usage += sizeof(struct bucket) * htab->n_buckets;
	usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
	if (prealloc) {
		num_entries = map->max_entries;
		if (htab_has_extra_elems(htab))
			num_entries += num_possible_cpus();

		usage += htab->elem_size * num_entries;

		if (percpu)
			usage += value_size * num_possible_cpus() * num_entries;
		else if (!lru)
			usage += sizeof(struct htab_elem *) * num_possible_cpus();
	} else {
#define LLIST_NODE_SZ sizeof(struct llist_node)

		num_entries = htab->use_percpu_counter ?
			      percpu_counter_sum(&htab->pcount) :
			      atomic_read(&htab->count);
		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
		if (percpu) {
			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
			usage += value_size * num_possible_cpus() * num_entries;
		}
	}
	return usage;
}

BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_internal_structs,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_internal_structs,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}
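
/* The inline lookup emitted below mirrors htab_percpu_map_lookup_elem():
 * call __htab_map_lookup_elem(), skip the rest when it returns NULL,
 * advance R0 past the key to the stored per-cpu pointer, load it and
 * turn it into this CPU's address with BPF_MOV64_PERCPU_REG. It is only
 * used when the JIT supports per-cpu instructions.
 */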

/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	if (!bpf_jit_supports_percpu_insn())
		return -EOPNOTSUPP;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
				offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);

	return insn - insn_buf;
}

static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l)
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	}

	return NULL;
}
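
/* Syscall-side lookup of a per-cpu hash element: the caller's buffer must
 * hold round_up(value_size, 8) bytes for every possible CPU; the values
 * are copied out back to back in CPU order.
 */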

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = htab_map_update_elem_in_place(map, key, value, map_flags,
						    true, true);
	rcu_read_unlock();

	return ret;
}

static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_putc(m, '\n');
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

const struct bpf_map_ops htab_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_gen_lookup = htab_percpu_map_gen_lookup,
	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};
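
/* "fd" hash maps (hash-of-maps) store a pointer internally but expose a
 * 32-bit value to userspace: an fd on update, a map id on lookup. Hence
 * the value_size == sizeof(u32) restriction below, and the need to drop
 * the stored references explicitly before freeing the table.
 */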

static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(map, ptr, false);
		}
	}

	htab_map_free(map);
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* Only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;

	ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	/* The htab bucket lock is always held during update operations in fd
	 * htab map, and the following rcu_read_lock() is only used to avoid
	 * the WARN_ON_ONCE in htab_map_update_elem_in_place().
	 */
	rcu_read_lock();
	ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
	rcu_read_unlock();
	if (ret)
		map->ops->map_fd_put_ptr(map, ptr, false);

	return ret;
}
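
/* Hash-of-maps allocation: the inner map referenced by attr->inner_map_fd
 * only provides a template (inner_map_meta) used to check that maps
 * inserted later are compatible; the template itself is freed together
 * with the outer map in htab_of_map_free().
 */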

static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static int htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
};