CoCalc -- ring

GitHub Repository: torvalds/linux
Path: blob/master/kernel/events/ring_buffer.c
²⁹²⁶⁵ views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
 * Performance events ring-buffer code:
4
 *
5
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
8
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9
 */
10

11
#include <linux/perf_event.h>
12
#include <linux/vmalloc.h>
13
#include <linux/slab.h>
14
#include <linux/circ_buf.h>
15
#include <linux/poll.h>
16
#include <linux/nospec.h>
17

18
#include "internal.h"
19

20
static void perf_output_wakeup(struct perf_output_handle *handle)
21
{
22
	atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM);
23

24
	handle->event->pending_wakeup = 1;
25

26
	if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
27
		handle->event->pending_kill = POLL_IN;
28

29
	irq_work_queue(&handle->event->pending_irq);
30
}
31

32
/*
33
 * We need to ensure a later event_id doesn't publish a head when a former
34
 * event isn't done writing. However since we need to deal with NMIs we
35
 * cannot fully serialize things.
36
 *
37
 * We only publish the head (and generate a wakeup) when the outer-most
38
 * event completes.
39
 */
40
static void perf_output_get_handle(struct perf_output_handle *handle)
41
{
42
	struct perf_buffer *rb = handle->rb;
43

44
	preempt_disable();
45

46
	/*
47
	 * Avoid an explicit LOAD/STORE such that architectures with memops
48
	 * can use them.
49
	 */
50
	(*(volatile unsigned int *)&rb->nest)++;
51
	handle->wakeup = local_read(&rb->wakeup);
52
}
53

54
static void perf_output_put_handle(struct perf_output_handle *handle)
55
{
56
	struct perf_buffer *rb = handle->rb;
57
	unsigned long head;
58
	unsigned int nest;
59

60
	/*
61
	 * If this isn't the outermost nesting, we don't have to update
62
	 * @rb->user_page->data_head.
63
	 */
64
	nest = READ_ONCE(rb->nest);
65
	if (nest > 1) {
66
		WRITE_ONCE(rb->nest, nest - 1);
67
		goto out;
68
	}
69

70
again:
71
	/*
72
	 * In order to avoid publishing a head value that goes backwards,
73
	 * we must ensure the load of @rb->head happens after we've
74
	 * incremented @rb->nest.
75
	 *
76
	 * Otherwise we can observe a @rb->head value before one published
77
	 * by an IRQ/NMI happening between the load and the increment.
78
	 */
79
	barrier();
80
	head = local_read(&rb->head);
81

82
	/*
83
	 * IRQ/NMI can happen here and advance @rb->head, causing our
84
	 * load above to be stale.
85
	 */
86

87
	/*
88
	 * Since the mmap() consumer (userspace) can run on a different CPU:
89
	 *
90
	 *   kernel				user
91
	 *
92
	 *   if (LOAD ->data_tail) {		LOAD ->data_head
93
	 *			(A)		smp_rmb()	(C)
94
	 *	STORE $data			LOAD $data
95
	 *	smp_wmb()	(B)		smp_mb()	(D)
96
	 *	STORE ->data_head		STORE ->data_tail
97
	 *   }
98
	 *
99
	 * Where A pairs with D, and B pairs with C.
100
	 *
101
	 * In our case (A) is a control dependency that separates the load of
102
	 * the ->data_tail and the stores of $data. In case ->data_tail
103
	 * indicates there is no room in the buffer to store $data we do not.
104
	 *
105
	 * D needs to be a full barrier since it separates the data READ
106
	 * from the tail WRITE.
107
	 *
108
	 * For B a WMB is sufficient since it separates two WRITEs, and for C
109
	 * an RMB is sufficient since it separates two READs.
110
	 *
111
	 * See perf_output_begin().
112
	 */
113
	smp_wmb(); /* B, matches C */
114
	WRITE_ONCE(rb->user_page->data_head, head);
115

116
	/*
117
	 * We must publish the head before decrementing the nest count,
118
	 * otherwise an IRQ/NMI can publish a more recent head value and our
119
	 * write will (temporarily) publish a stale value.
120
	 */
121
	barrier();
122
	WRITE_ONCE(rb->nest, 0);
123

124
	/*
125
	 * Ensure we decrement @rb->nest before we validate the @rb->head.
126
	 * Otherwise we cannot be sure we caught the 'last' nested update.
127
	 */
128
	barrier();
129
	if (unlikely(head != local_read(&rb->head))) {
130
		WRITE_ONCE(rb->nest, 1);
131
		goto again;
132
	}
133

134
	if (handle->wakeup != local_read(&rb->wakeup))
135
		perf_output_wakeup(handle);
136

137
out:
138
	preempt_enable();
139
}
140

141
/*
 * Report whether at least @size bytes are free in a buffer of @data_size
 * bytes. For a backward-written buffer the roles of @head and @tail passed
 * to CIRC_SPACE() are swapped.
 */
static __always_inline bool
ring_buffer_has_space(unsigned long head, unsigned long tail,
		      unsigned long data_size, unsigned int size,
		      bool backward)
{
	return (backward ? CIRC_SPACE(tail, head, data_size)
			 : CIRC_SPACE(head, tail, data_size)) >= size;
}
151

152
static __always_inline int
153
__perf_output_begin(struct perf_output_handle *handle,
154
		    struct perf_sample_data *data,
155
		    struct perf_event *event, unsigned int size,
156
		    bool backward)
157
{
158
	struct perf_buffer *rb;
159
	unsigned long tail, offset, head;
160
	int have_lost, page_shift;
161
	struct {
162
		struct perf_event_header header;
163
		u64			 id;
164
		u64			 lost;
165
	} lost_event;
166

167
	rcu_read_lock();
168
	/*
169
	 * For inherited events we send all the output towards the parent.
170
	 */
171
	if (event->parent)
172
		event = event->parent;
173

174
	rb = rcu_dereference(event->rb);
175
	if (unlikely(!rb))
176
		goto out;
177

178
	if (unlikely(rb->paused)) {
179
		if (rb->nr_pages) {
180
			local_inc(&rb->lost);
181
			atomic64_inc(&event->lost_samples);
182
		}
183
		goto out;
184
	}
185

186
	handle->rb    = rb;
187
	handle->event = event;
188
	handle->flags = 0;
189

190
	have_lost = local_read(&rb->lost);
191
	if (unlikely(have_lost)) {
192
		size += sizeof(lost_event);
193
		if (event->attr.sample_id_all)
194
			size += event->id_header_size;
195
	}
196

197
	perf_output_get_handle(handle);
198

199
	offset = local_read(&rb->head);
200
	do {
201
		head = offset;
202
		tail = READ_ONCE(rb->user_page->data_tail);
203
		if (!rb->overwrite) {
204
			if (unlikely(!ring_buffer_has_space(head, tail,
205
							    perf_data_size(rb),
206
							    size, backward)))
207
				goto fail;
208
		}
209

210
		/*
211
		 * The above forms a control dependency barrier separating the
212
		 * @tail load above from the data stores below. Since the @tail
213
		 * load is required to compute the branch to fail below.
214
		 *
215
		 * A, matches D; the full memory barrier userspace SHOULD issue
216
		 * after reading the data and before storing the new tail
217
		 * position.
218
		 *
219
		 * See perf_output_put_handle().
220
		 */
221

222
		if (!backward)
223
			head += size;
224
		else
225
			head -= size;
226
	} while (!local_try_cmpxchg(&rb->head, &offset, head));
227

228
	if (backward) {
229
		offset = head;
230
		head = (u64)(-head);
231
	}
232

233
	/*
234
	 * We rely on the implied barrier() by local_cmpxchg() to ensure
235
	 * none of the data stores below can be lifted up by the compiler.
236
	 */
237

238
	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
239
		local_add(rb->watermark, &rb->wakeup);
240

241
	page_shift = PAGE_SHIFT + page_order(rb);
242

243
	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
244
	offset &= (1UL << page_shift) - 1;
245
	handle->addr = rb->data_pages[handle->page] + offset;
246
	handle->size = (1UL << page_shift) - offset;
247

248
	if (unlikely(have_lost)) {
249
		lost_event.header.size = sizeof(lost_event);
250
		lost_event.header.type = PERF_RECORD_LOST;
251
		lost_event.header.misc = 0;
252
		lost_event.id          = event->id;
253
		lost_event.lost        = local_xchg(&rb->lost, 0);
254

255
		/* XXX mostly redundant; @data is already fully initializes */
256
		perf_event_header__init_id(&lost_event.header, data, event);
257
		perf_output_put(handle, lost_event);
258
		perf_event__output_id_sample(event, handle, data);
259
	}
260

261
	return 0;
262

263
fail:
264
	local_inc(&rb->lost);
265
	atomic64_inc(&event->lost_samples);
266
	perf_output_put_handle(handle);
267
out:
268
	rcu_read_unlock();
269

270
	return -ENOSPC;
271
}
272

273
/*
 * Begin an output transaction of @size bytes for @event, always writing in
 * the forward direction. Return value is that of __perf_output_begin().
 */
int perf_output_begin_forward(struct perf_output_handle *handle,
			      struct perf_sample_data *data,
			      struct perf_event *event, unsigned int size)
{
	const bool backward = false;

	return __perf_output_begin(handle, data, event, size, backward);
}
279

280
/*
 * Begin an output transaction of @size bytes for @event, always writing in
 * the backward direction. Return value is that of __perf_output_begin().
 */
int perf_output_begin_backward(struct perf_output_handle *handle,
			       struct perf_sample_data *data,
			       struct perf_event *event, unsigned int size)
{
	const bool backward = true;

	return __perf_output_begin(handle, data, event, size, backward);
}
286

287
/*
 * Begin an output transaction of @size bytes for @event, picking the write
 * direction from is_write_backward(event). Return value is that of
 * __perf_output_begin().
 */
int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_sample_data *data,
		      struct perf_event *event, unsigned int size)
{
	bool backward = unlikely(is_write_backward(event));

	return __perf_output_begin(handle, data, event, size, backward);
}
295

296
/*
 * Out-of-line wrapper around __output_copy(): copy @len bytes from @buf to
 * the output described by @handle. The result of __output_copy() is passed
 * through unchanged; perf_output_copy_aux() treats a non-zero value as an
 * uncopied remainder.
 */
unsigned int perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
{
	unsigned int left = __output_copy(handle, buf, len);

	return left;
}
301

302
/*
 * Out-of-line wrapper around __output_skip(), called with a NULL source
 * buffer for @len bytes. The result of __output_skip() is passed through
 * unchanged.
 */
unsigned int perf_output_skip(struct perf_output_handle *handle,
			      unsigned int len)
{
	unsigned int left = __output_skip(handle, NULL, len);

	return left;
}
307

308
/*
 * Finish an output transaction: close the write section through
 * perf_output_put_handle() and then drop the RCU read lock that
 * __perf_output_begin() leaves held when it returns 0.
 */
void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}
313

314
static void
315
ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
316
{
317
	long max_size = perf_data_size(rb);
318

319
	if (watermark)
320
		rb->watermark = min(max_size, watermark);
321

322
	if (!rb->watermark)
323
		rb->watermark = max_size / 2;
324

325
	if (flags & RING_BUFFER_WRITABLE)
326
		rb->overwrite = 0;
327
	else
328
		rb->overwrite = 1;
329

330
	refcount_set(&rb->refcount, 1);
331

332
	INIT_LIST_HEAD(&rb->event_list);
333
	spin_lock_init(&rb->event_lock);
334

335
	/*
336
	 * perf_output_begin() only checks rb->paused, therefore
337
	 * rb->paused must be true if we have no pages for output.
338
	 */
339
	if (!rb->nr_pages)
340
		rb->paused = 1;
341

342
	mutex_init(&rb->aux_mutex);
343
}
344

345
/*
 * OR @flags into handle->aux_flags.
 *
 * PERF_AUX_FLAG_OVERWRITE is decided by perf_aux_output_end() and must not
 * be passed in; doing so triggers a one-time warning and leaves the handle
 * untouched.
 */
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
{
	if (!WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
		handle->aux_flags |= flags;
}
EXPORT_SYMBOL_GPL(perf_aux_output_flag);
357

358
/*
359
 * This is called before hardware starts writing to the AUX area to
360
 * obtain an output handle and make sure there's room in the buffer.
361
 * When the capture completes, call perf_aux_output_end() to commit
362
 * the recorded data to the buffer.
363
 *
364
 * The ordering is similar to that of perf_output_{begin,end}, with
365
 * the exception of (B), which should be taken care of by the pmu
366
 * driver, since ordering rules will differ depending on hardware.
367
 *
368
 * Call this from pmu::start(); see the comment in perf_aux_output_end()
369
 * about its use in pmu callbacks. Both can also be called from the PMI
370
 * handler if needed.
371
 */
372
void *perf_aux_output_begin(struct perf_output_handle *handle,
373
			    struct perf_event *event)
374
{
375
	struct perf_event *output_event = event;
376
	unsigned long aux_head, aux_tail;
377
	struct perf_buffer *rb;
378
	unsigned int nest;
379

380
	if (output_event->parent)
381
		output_event = output_event->parent;
382

383
	/*
384
	 * Since this will typically be open across pmu::add/pmu::del, we
385
	 * grab ring_buffer's refcount instead of holding rcu read lock
386
	 * to make sure it doesn't disappear under us.
387
	 */
388
	rb = ring_buffer_get(output_event);
389
	if (!rb)
390
		return NULL;
391

392
	if (!rb_has_aux(rb))
393
		goto err;
394

395
	/*
396
	 * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
397
	 * about to get freed, so we leave immediately.
398
	 *
399
	 * Checking rb::aux_mmap_count and rb::refcount has to be done in
400
	 * the same order, see perf_mmap_close. Otherwise we end up freeing
401
	 * aux pages in this path, which is a bug, because in_atomic().
402
	 */
403
	if (!refcount_read(&rb->aux_mmap_count))
404
		goto err;
405

406
	if (!refcount_inc_not_zero(&rb->aux_refcount))
407
		goto err;
408

409
	nest = READ_ONCE(rb->aux_nest);
410
	/*
411
	 * Nesting is not supported for AUX area, make sure nested
412
	 * writers are caught early
413
	 */
414
	if (WARN_ON_ONCE(nest))
415
		goto err_put;
416

417
	WRITE_ONCE(rb->aux_nest, nest + 1);
418

419
	aux_head = rb->aux_head;
420

421
	handle->rb = rb;
422
	handle->event = event;
423
	handle->head = aux_head;
424
	handle->size = 0;
425
	handle->aux_flags = 0;
426

427
	/*
428
	 * In overwrite mode, AUX data stores do not depend on aux_tail,
429
	 * therefore (A) control dependency barrier does not exist. The
430
	 * (B) <-> (C) ordering is still observed by the pmu driver.
431
	 */
432
	if (!rb->aux_overwrite) {
433
		aux_tail = READ_ONCE(rb->user_page->aux_tail);
434
		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
435
		if (aux_head - aux_tail < perf_aux_size(rb))
436
			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
437

438
		/*
439
		 * handle->size computation depends on aux_tail load; this forms a
440
		 * control dependency barrier separating aux_tail load from aux data
441
		 * store that will be enabled on successful return
442
		 */
443
		if (!handle->size) { /* A, matches D */
444
			perf_event_disable_inatomic(handle->event);
445
			perf_output_wakeup(handle);
446
			WRITE_ONCE(rb->aux_nest, 0);
447
			goto err_put;
448
		}
449
	}
450

451
	return handle->rb->aux_priv;
452

453
err_put:
454
	/* can't be last */
455
	rb_free_aux(rb);
456

457
err:
458
	ring_buffer_put(rb);
459
	handle->event = NULL;
460

461
	return NULL;
462
}
463
EXPORT_SYMBOL_GPL(perf_aux_output_begin);
464

465
static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
466
{
467
	if (rb->aux_overwrite)
468
		return false;
469

470
	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
471
		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
472
		return true;
473
	}
474

475
	return false;
476
}
477

478
/*
479
 * Commit the data written by hardware into the ring buffer by adjusting
480
 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
481
 * pmu driver's responsibility to observe ordering rules of the hardware,
482
 * so that all the data is externally visible before this is called.
483
 *
484
 * Note: this has to be called from pmu::stop() callback, as the assumption
485
 * of the AUX buffer management code is that after pmu::stop(), the AUX
486
 * transaction must be stopped and therefore drop the AUX reference count.
487
 */
488
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
489
{
490
	bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
491
	struct perf_buffer *rb = handle->rb;
492
	unsigned long aux_head;
493

494
	/* in overwrite mode, driver provides aux_head via handle */
495
	if (rb->aux_overwrite) {
496
		handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
497

498
		aux_head = handle->head;
499
		rb->aux_head = aux_head;
500
	} else {
501
		handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
502

503
		aux_head = rb->aux_head;
504
		rb->aux_head += size;
505
	}
506

507
	/*
508
	 * Only send RECORD_AUX if we have something useful to communicate
509
	 *
510
	 * Note: the OVERWRITE records by themselves are not considered
511
	 * useful, as they don't communicate any *new* information,
512
	 * aside from the short-lived offset, that becomes history at
513
	 * the next event sched-in and therefore isn't useful.
514
	 * The userspace that needs to copy out AUX data in overwrite
515
	 * mode should know to use user_page::aux_head for the actual
516
	 * offset. So, from now on we don't output AUX records that
517
	 * have *only* OVERWRITE flag set.
518
	 */
519
	if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
520
		perf_event_aux_event(handle->event, aux_head, size,
521
				     handle->aux_flags);
522

523
	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
524
	if (rb_need_aux_wakeup(rb))
525
		wakeup = true;
526

527
	if (wakeup) {
528
		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
529
			perf_event_disable_inatomic(handle->event);
530
		perf_output_wakeup(handle);
531
	}
532

533
	handle->event = NULL;
534

535
	WRITE_ONCE(rb->aux_nest, 0);
536
	/* can't be last */
537
	rb_free_aux(rb);
538
	ring_buffer_put(rb);
539
}
540
EXPORT_SYMBOL_GPL(perf_aux_output_end);
541

542
/*
543
 * Skip over a given number of bytes in the AUX buffer, due to, for example,
544
 * hardware's alignment constraints.
545
 */
546
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
547
{
548
	struct perf_buffer *rb = handle->rb;
549

550
	if (size > handle->size)
551
		return -ENOSPC;
552

553
	rb->aux_head += size;
554

555
	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
556
	if (rb_need_aux_wakeup(rb)) {
557
		perf_output_wakeup(handle);
558
		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
559
	}
560

561
	handle->head = rb->aux_head;
562
	handle->size -= size;
563

564
	return 0;
565
}
566
EXPORT_SYMBOL_GPL(perf_aux_output_skip);
567

568
void *perf_get_aux(struct perf_output_handle *handle)
569
{
570
	/* this is only valid between perf_aux_output_begin and *_end */
571
	if (!handle->event)
572
		return NULL;
573

574
	return handle->rb->aux_priv;
575
}
576
EXPORT_SYMBOL_GPL(perf_get_aux);
577

578
/*
579
 * Copy out AUX data from an AUX handle.
580
 */
581
long perf_output_copy_aux(struct perf_output_handle *aux_handle,
582
			  struct perf_output_handle *handle,
583
			  unsigned long from, unsigned long to)
584
{
585
	struct perf_buffer *rb = aux_handle->rb;
586
	unsigned long tocopy, remainder, len = 0;
587
	void *addr;
588

589
	from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
590
	to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
591

592
	do {
593
		tocopy = PAGE_SIZE - offset_in_page(from);
594
		if (to > from)
595
			tocopy = min(tocopy, to - from);
596
		if (!tocopy)
597
			break;
598

599
		addr = rb->aux_pages[from >> PAGE_SHIFT];
600
		addr += offset_in_page(from);
601

602
		remainder = perf_output_copy(handle, addr, tocopy);
603
		if (remainder)
604
			return -EFAULT;
605

606
		len += tocopy;
607
		from += tocopy;
608
		from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
609
	} while (to != from);
610

611
	return len;
612
}
613

614
#define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
615

616
static struct page *rb_alloc_aux_page(int node, int order)
617
{
618
	struct page *page;
619

620
	if (order > MAX_PAGE_ORDER)
621
		order = MAX_PAGE_ORDER;
622

623
	do {
624
		page = alloc_pages_node(node, PERF_AUX_GFP, order);
625
	} while (!page && order--);
626

627
	if (page && order) {
628
		/*
629
		 * Communicate the allocation size to the driver:
630
		 * if we managed to secure a high-order allocation,
631
		 * set its first page's private to this order;
632
		 * !PagePrivate(page) means it's just a normal page.
633
		 */
634
		split_page(page, order);
635
		SetPagePrivate(page);
636
		set_page_private(page, order);
637
	}
638

639
	return page;
640
}
641

642
static void rb_free_aux_page(struct perf_buffer *rb, int idx)
643
{
644
	struct page *page = virt_to_page(rb->aux_pages[idx]);
645

646
	ClearPagePrivate(page);
647
	__free_page(page);
648
}
649

650
static void __rb_free_aux(struct perf_buffer *rb)
651
{
652
	int pg;
653

654
	/*
655
	 * Should never happen, the last reference should be dropped from
656
	 * perf_mmap_close() path, which first stops aux transactions (which
657
	 * in turn are the atomic holders of aux_refcount) and then does the
658
	 * last rb_free_aux().
659
	 */
660
	WARN_ON_ONCE(in_atomic());
661

662
	if (rb->aux_priv) {
663
		rb->free_aux(rb->aux_priv);
664
		rb->free_aux = NULL;
665
		rb->aux_priv = NULL;
666
	}
667

668
	if (rb->aux_nr_pages) {
669
		for (pg = 0; pg < rb->aux_nr_pages; pg++)
670
			rb_free_aux_page(rb, pg);
671

672
		kfree(rb->aux_pages);
673
		rb->aux_nr_pages = 0;
674
	}
675
}
676

677
int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
678
		 pgoff_t pgoff, int nr_pages, long watermark, int flags)
679
{
680
	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
681
	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
682
	bool use_contiguous_pages = event->pmu->capabilities & (
683
		PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
684
	/*
685
	 * Initialize max_order to 0 for page allocation. This allocates single
686
	 * pages to minimize memory fragmentation. This is overridden if the
687
	 * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
688
	 */
689
	int max_order = 0;
690
	int ret = -ENOMEM;
691

692
	if (!has_aux(event))
693
		return -EOPNOTSUPP;
694

695
	if (nr_pages <= 0)
696
		return -EINVAL;
697

698
	if (!overwrite) {
699
		/*
700
		 * Watermark defaults to half the buffer, to aid PMU drivers
701
		 * in double buffering.
702
		 */
703
		if (!watermark)
704
			watermark = min_t(unsigned long,
705
					  U32_MAX,
706
					  (unsigned long)nr_pages << (PAGE_SHIFT - 1));
707

708
		/*
709
		 * If using contiguous pages, use aux_watermark as the basis
710
		 * for chunking to help PMU drivers honor the watermark.
711
		 */
712
		if (use_contiguous_pages)
713
			max_order = get_order(watermark);
714
	} else {
715
		/*
716
		 * If using contiguous pages, we need to start with the
717
		 * max_order that fits in nr_pages, not the other way around,
718
		 * hence ilog2() and not get_order.
719
		 */
720
		if (use_contiguous_pages)
721
			max_order = ilog2(nr_pages);
722
		watermark = 0;
723
	}
724

725
	/*
726
	 * kcalloc_node() is unable to allocate buffer if the size is larger
727
	 * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
728
	 */
729
	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
730
		return -ENOMEM;
731
	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
732
				     node);
733
	if (!rb->aux_pages)
734
		return -ENOMEM;
735

736
	rb->free_aux = event->pmu->free_aux;
737
	for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
738
		struct page *page;
739
		int last, order;
740

741
		order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
742
		page = rb_alloc_aux_page(node, order);
743
		if (!page)
744
			goto out;
745

746
		for (last = rb->aux_nr_pages + (1 << page_private(page));
747
		     last > rb->aux_nr_pages; rb->aux_nr_pages++)
748
			rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
749
	}
750

751
	/*
752
	 * In overwrite mode, PMUs that don't support SG may not handle more
753
	 * than one contiguous allocation, since they rely on PMI to do double
754
	 * buffering. In this case, the entire buffer has to be one contiguous
755
	 * chunk.
756
	 */
757
	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
758
	    overwrite) {
759
		struct page *page = virt_to_page(rb->aux_pages[0]);
760

761
		if (page_private(page) != max_order)
762
			goto out;
763
	}
764

765
	rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
766
					     overwrite);
767
	if (!rb->aux_priv)
768
		goto out;
769

770
	ret = 0;
771

772
	/*
773
	 * aux_pages (and pmu driver's private data, aux_priv) will be
774
	 * referenced in both producer's and consumer's contexts, thus
775
	 * we keep a refcount here to make sure either of the two can
776
	 * reference them safely.
777
	 */
778
	refcount_set(&rb->aux_refcount, 1);
779

780
	rb->aux_overwrite = overwrite;
781
	rb->aux_watermark = watermark;
782

783
out:
784
	if (!ret)
785
		rb->aux_pgoff = pgoff;
786
	else
787
		__rb_free_aux(rb);
788

789
	return ret;
790
}
791

792
void rb_free_aux(struct perf_buffer *rb)
793
{
794
	if (refcount_dec_and_test(&rb->aux_refcount))
795
		__rb_free_aux(rb);
796
}
797

798
#ifndef CONFIG_PERF_USE_VMALLOC
799

800
/*
801
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
802
 */
803

804
static struct page *
805
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
806
{
807
	if (pgoff > rb->nr_pages)
808
		return NULL;
809

810
	if (pgoff == 0)
811
		return virt_to_page(rb->user_page);
812

813
	return virt_to_page(rb->data_pages[pgoff - 1]);
814
}
815

816
static void *perf_mmap_alloc_page(int cpu)
817
{
818
	struct page *page;
819
	int node;
820

821
	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
822
	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
823
	if (!page)
824
		return NULL;
825

826
	return page_address(page);
827
}
828

829
/* Free the single page whose kernel address is @addr. */
static void perf_mmap_free_page(void *addr)
{
	__free_page(virt_to_page(addr));
}
835

836
struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
837
{
838
	struct perf_buffer *rb;
839
	unsigned long size;
840
	int i, node;
841

842
	size = sizeof(struct perf_buffer);
843
	size += nr_pages * sizeof(void *);
844

845
	if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
846
		goto fail;
847

848
	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
849
	rb = kzalloc_node(size, GFP_KERNEL, node);
850
	if (!rb)
851
		goto fail;
852

853
	rb->user_page = perf_mmap_alloc_page(cpu);
854
	if (!rb->user_page)
855
		goto fail_user_page;
856

857
	for (i = 0; i < nr_pages; i++) {
858
		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
859
		if (!rb->data_pages[i])
860
			goto fail_data_pages;
861
	}
862

863
	rb->nr_pages = nr_pages;
864

865
	ring_buffer_init(rb, watermark, flags);
866

867
	return rb;
868

869
fail_data_pages:
870
	for (i--; i >= 0; i--)
871
		perf_mmap_free_page(rb->data_pages[i]);
872

873
	perf_mmap_free_page(rb->user_page);
874

875
fail_user_page:
876
	kfree(rb);
877

878
fail:
879
	return NULL;
880
}
881

882
void rb_free(struct perf_buffer *rb)
883
{
884
	int i;
885

886
	perf_mmap_free_page(rb->user_page);
887
	for (i = 0; i < rb->nr_pages; i++)
888
		perf_mmap_free_page(rb->data_pages[i]);
889
	kfree(rb);
890
}
891

892
#else
893
static struct page *
894
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
895
{
896
	/* The '>' counts in the user page. */
897
	if (pgoff > data_page_nr(rb))
898
		return NULL;
899

900
	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
901
}
902

903
static void rb_free_work(struct work_struct *work)
904
{
905
	struct perf_buffer *rb;
906

907
	rb = container_of(work, struct perf_buffer, work);
908

909
	vfree(rb->user_page);
910
	kfree(rb);
911
}
912

913
void rb_free(struct perf_buffer *rb)
914
{
915
	schedule_work(&rb->work);
916
}
917

918
struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
919
{
920
	struct perf_buffer *rb;
921
	unsigned long size;
922
	void *all_buf;
923
	int node;
924

925
	size = sizeof(struct perf_buffer);
926
	size += sizeof(void *);
927

928
	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
929
	rb = kzalloc_node(size, GFP_KERNEL, node);
930
	if (!rb)
931
		goto fail;
932

933
	INIT_WORK(&rb->work, rb_free_work);
934

935
	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
936
	if (!all_buf)
937
		goto fail_all_buf;
938

939
	rb->user_page = all_buf;
940
	rb->data_pages[0] = all_buf + PAGE_SIZE;
941
	if (nr_pages) {
942
		rb->nr_pages = 1;
943
		rb->page_order = ilog2(nr_pages);
944
	}
945

946
	ring_buffer_init(rb, watermark, flags);
947

948
	return rb;
949

950
fail_all_buf:
951
	kfree(rb);
952

953
fail:
954
	return NULL;
955
}
956

957
#endif
958

959
struct page *
960
perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
961
{
962
	if (rb->aux_nr_pages) {
963
		/* above AUX space */
964
		if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
965
			return NULL;
966

967
		/* AUX space */
968
		if (pgoff >= rb->aux_pgoff) {
969
			int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
970
			return virt_to_page(rb->aux_pages[aux_pgoff]);
971
		}
972
	}
973

974
	return __perf_mmap_to_page(rb, pgoff);
975
}
976

977
Product

Resources

Company