// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Perf interface to expose Dispatch Trace Log counters.
 *
 * Copyright (C) 2024 Kajol Jain, IBM Corporation
 */

#ifdef CONFIG_PPC_SPLPAR
#define pr_fmt(fmt) "vpa_dtl: " fmt

#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <asm/dtl.h>
#include <asm/plpar_wrappers.h>

#define EVENT(_name, _code) enum { _name = _code }

/*
 * Based on the Power Architecture Platform Reference (PAPR) documentation,
 * Table 14.14, Per Virtual Processor Area, the Dispatch Trace Log (DTL)
 * Enable Mask below is used to select the corresponding virtual processor
 * dispatch and preempt traces:
 * DTL_CEDE(0x1): Trace voluntary (OS initiated) virtual
 *                processor waits
 * DTL_PREEMPT(0x2): Trace time slice preempts
 * DTL_FAULT(0x4): Trace virtual partition memory page
 *                 faults
 * DTL_ALL(0x7): Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT)
 *
 * Event codes are based on the Dispatch Trace Log Enable Mask.
 */
EVENT(DTL_CEDE, 0x1);
EVENT(DTL_PREEMPT, 0x2);
EVENT(DTL_FAULT, 0x4);
EVENT(DTL_ALL, 0x7);

GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE);
GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT);
GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT);
GENERIC_EVENT_ATTR(dtl_all, DTL_ALL);

PMU_FORMAT_ATTR(event, "config:0-7");

static struct attribute *events_attr[] = {
	GENERIC_EVENT_PTR(DTL_CEDE),
	GENERIC_EVENT_PTR(DTL_PREEMPT),
	GENERIC_EVENT_PTR(DTL_FAULT),
	GENERIC_EVENT_PTR(DTL_ALL),
	NULL
};

static struct attribute_group event_group = {
	.name = "events",
	.attrs = events_attr,
};

static struct attribute *format_attrs[] = {
	&format_attr_event.attr,
	NULL,
};

static const struct attribute_group format_group = {
	.name = "format",
	.attrs = format_attrs,
};

static const struct attribute_group *attr_groups[] = {
	&format_group,
	&event_group,
	NULL,
};

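/*
 * The event and format groups above appear under
 * /sys/bus/event_source/devices/vpa_dtl/ once the PMU is registered.
 * Illustrative usage from userspace (the symbolic event names map to
 * the raw codes defined above, so the two forms below are equivalent):
 *
 *	perf record -a -e vpa_dtl/dtl_all/ -- sleep 1
 *	perf record -a -e vpa_dtl/event=0x7/ -- sleep 1
 */
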
struct vpa_dtl {
	struct dtl_entry *buf;
	u64 last_idx;
};

struct vpa_pmu_ctx {
	struct perf_output_handle handle;
};

struct vpa_pmu_buf {
	int nr_pages;
	bool snapshot;
	u64 *base;
	u64 size;
	u64 head;
	u64 head_size;
	/* boot timebase and frequency need to be saved only once */
	int boottb_freq_saved;
	u64 threshold;
	bool full;
};

/*
 * To correlate each DTL entry with other events across CPUs,
 * we need to map the timebase from "struct dtl_entry", which phyp
 * provides, to the boot timebase. This also needs the timebase frequency.
 * The formula is: ((timebase from DTL entry - boot timebase) / frequency)
 *
 * To match the size of "struct dtl_entry" and ease post-processing,
 * the structure is padded by 24 bytes.
 */
struct boottb_freq {
	u64 boot_tb;
	u64 tb_freq;
	u64 timebase;
	u64 padded[3];
};

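/*
 * Worked example of the conversion above (illustrative numbers only):
 * with boot_tb = 1000000 and tb_freq = 512000000 (a 512 MHz timebase),
 * a DTL entry timebase of 257000000 corresponds to
 * (257000000 - 1000000) / 512000000 = 0.5 seconds after boot.
 */
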
static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx);
static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu);

/* variable to capture the reference count for the active dtl threads */
static int dtl_global_refc;
static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock);

/*
 * Capture DTL data in the AUX buffer
 */
static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf,
				struct vpa_dtl *dtl, int index)
{
	struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base;

	/*
	 * Check whether there is enough space to hold the DTL data.
	 * If not, copy only the entries that fit in the remaining
	 * space and mark the buffer as full.
	 */
	if (buf->head + *n_entries >= buf->threshold) {
		*n_entries = buf->threshold - buf->head;
		buf->full = true;
	}

	/*
	 * Copy to the AUX buffer from the per-thread address
	 */
	memcpy(aux_copy_buf + buf->head, &dtl->buf[index], *n_entries * sizeof(struct dtl_entry));

	if (buf->full) {
		/*
		 * Reset the private aux head to zero when the buffer is
		 * full so that the next data is copied to the beginning
		 * of the buffer.
		 */
		buf->head = 0;
		return;
	}

	buf->head += *n_entries;
}

/*
 * Function to dump the dispatch trace log buffer data to the
 * perf data.
 *
 * perf_aux_output_begin: This function is called before writing
 * to the AUX area. It returns the pointer to the AUX area private
 * structure, i.e. "struct vpa_pmu_buf" here, which is set up in the
 * setup_aux() function. The function obtains the output handle (used
 * in perf_aux_output_end). When the capture completes in
 * vpa_dtl_capture_aux(), perf_aux_output_end() is called to commit
 * the recorded data.
 *
 * perf_aux_output_end: This function commits data by adjusting the
 * aux_head of "struct perf_buffer". The aux_tail is moved on the perf
 * tools side when writing the data from the AUX buffer to the
 * perf.data file on disk.
 *
 * Here in the private aux structure, we maintain a head to know where
 * to copy data next time in the PMU driver. vpa_pmu_buf->head is moved
 * to maintain the AUX head for the PMU driver. It is the responsibility
 * of the PMU driver to make sure data is copied between
 * perf_aux_output_begin and perf_aux_output_end.
 *
 * After data is copied in the vpa_dtl_capture_aux() function,
 * perf_aux_output_end() is called to move the aux->head of
 * "struct perf_buffer" to indicate the size of data in the AUX buffer.
 * This posts a PERF_RECORD_AUX record into the perf buffer. Data is
 * written to disk only when the allocated buffer is full.
 *
 * With this approach, all the DTL data is present as-is in perf.data.
 * The data is post-processed on the perf tools side when running
 * perf report/perf script, which avoids the time taken to create
 * samples in kernel space.
 */
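/*
 * Condensed sketch of the flow described above (not literal code;
 * error handling omitted):
 *
 *	aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event);
 *	vpa_dtl_capture_aux(&n, aux_buf, dtl, i);	// copy DTL entries
 *	perf_aux_output_end(&vpa_ctx->handle, bytes);	// posts PERF_RECORD_AUX
 */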
static void vpa_dtl_dump_sample_data(struct perf_event *event)
{
	u64 cur_idx, last_idx, i;
	u64 boot_tb;
	struct boottb_freq boottb_freq;

	/* actual number of entries read */
	long n_read = 0, read_size = 0;

	/* number of entries added to the dtl buffer */
	long n_req;

	struct vpa_pmu_ctx *vpa_ctx = this_cpu_ptr(&vpa_pmu_ctx);

	struct vpa_pmu_buf *aux_buf;

	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
	u64 size;

	cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx);
	last_idx = dtl->last_idx;

	if (last_idx + N_DISPATCH_LOG <= cur_idx)
		last_idx = cur_idx - N_DISPATCH_LOG + 1;

	n_req = cur_idx - last_idx;

	/* no new entry added to the buffer, return */
	if (n_req <= 0)
		return;

	dtl->last_idx = last_idx + n_req;
	boot_tb = get_boot_tb();

	i = last_idx % N_DISPATCH_LOG;

	aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event);
	if (!aux_buf) {
		pr_debug("returning. no aux\n");
		return;
	}

	if (!aux_buf->boottb_freq_saved) {
		pr_debug("Copying boot tb to aux buffer: %lld\n", boot_tb);
		/* Save boot_tb to convert the raw timebase to a time relative to system boot */
		boottb_freq.boot_tb = boot_tb;
		/* Save tb_ticks_per_sec to convert timebase to seconds */
		boottb_freq.tb_freq = tb_ticks_per_sec;
		boottb_freq.timebase = 0;
		memcpy(aux_buf->base, &boottb_freq, sizeof(boottb_freq));
		aux_buf->head += 1;
		aux_buf->boottb_freq_saved = 1;
		n_read += 1;
	}

	/* read the tail of the buffer if we've wrapped */
	if (i + n_req > N_DISPATCH_LOG) {
		read_size = N_DISPATCH_LOG - i;
		vpa_dtl_capture_aux(&read_size, aux_buf, dtl, i);
		n_req -= read_size;
		n_read += read_size;
		i = 0;
		if (aux_buf->full) {
			size = (n_read * sizeof(struct dtl_entry));
			if ((size + aux_buf->head_size) > aux_buf->size) {
				size = aux_buf->size - aux_buf->head_size;
				perf_aux_output_end(&vpa_ctx->handle, size);
				aux_buf->head = 0;
				aux_buf->head_size = 0;
			} else {
				aux_buf->head_size += (n_read * sizeof(struct dtl_entry));
				perf_aux_output_end(&vpa_ctx->handle, n_read * sizeof(struct dtl_entry));
			}
			goto out;
		}
	}

	/* .. and now the head */
	vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);

	size = ((n_req + n_read) * sizeof(struct dtl_entry));
	if ((size + aux_buf->head_size) > aux_buf->size) {
		size = aux_buf->size - aux_buf->head_size;
		perf_aux_output_end(&vpa_ctx->handle, size);
		aux_buf->head = 0;
		aux_buf->head_size = 0;
	} else {
		aux_buf->head_size += ((n_req + n_read) * sizeof(struct dtl_entry));
		/* Move the aux->head to indicate the size of data in the aux buffer */
		perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
	}
out:
	aux_buf->full = false;
}

/*
 * The VPA Dispatch Trace Log counters do not interrupt on overflow.
 * Therefore, the kernel polls the counters from an hrtimer to avoid
 * missing an overflow. The timer interval is based on the sample_period
 * count provided by the user, with a minimum interval of 1 millisecond.
 */
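/*
 * Illustrative arithmetic (example values): a sample_period of 5000000
 * is treated as 5000000 ns, i.e. a 5 ms poll interval; any period below
 * NSEC_PER_MSEC (1000000 ns) is clamped up to 1 ms by max_t() below.
 */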
static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct perf_event *event;
	u64 period;

	event = container_of(hrtimer, struct perf_event, hw.hrtimer);

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return HRTIMER_NORESTART;

	vpa_dtl_dump_sample_data(event);
	period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return HRTIMER_RESTART;
}

static void vpa_dtl_start_hrtimer(struct perf_event *event)
{
	u64 period;
	struct hw_perf_event *hwc = &event->hw;

	period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period);
	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
}

static void vpa_dtl_stop_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	hrtimer_cancel(&hwc->hrtimer);
}

static void vpa_dtl_reset_global_refc(struct perf_event *event)
{
	spin_lock(&dtl_global_lock);
	dtl_global_refc--;
	if (dtl_global_refc <= 0) {
		dtl_global_refc = 0;
		up_write(&dtl_access_lock);
	}
	spin_unlock(&dtl_global_lock);
}

static int vpa_dtl_mem_alloc(int cpu)
{
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu);
	struct dtl_entry *buf = NULL;

	/* Check for the dispatch trace log buffer cache */
	if (!dtl_cache)
		return -ENOMEM;

	/* GFP_ATOMIC: this is called under dtl_global_lock, so it must not sleep */
	buf = kmem_cache_alloc_node(dtl_cache, GFP_ATOMIC, cpu_to_node(cpu));
	if (!buf) {
		pr_warn("buffer allocation failed for cpu %d\n", cpu);
		return -ENOMEM;
	}
	dtl->buf = buf;
	return 0;
}

static int vpa_dtl_event_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	/* test the event attr type for PMU enumeration */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	if (!perfmon_capable())
		return -EACCES;

	/* Return if this is a counting event */
	if (!is_sampling_event(event))
		return -EOPNOTSUPP;

	/* no branch sampling */
	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	/* Reject invalid event codes */
	switch (event->attr.config) {
	case DTL_LOG_CEDE:
	case DTL_LOG_PREEMPT:
	case DTL_LOG_FAULT:
	case DTL_LOG_ALL:
		break;
	default:
		return -EINVAL;
	}

	spin_lock(&dtl_global_lock);

	/*
	 * To ensure there are no other conflicting dtl users
	 * (example: /proc/powerpc/vcpudispatch_stats or debugfs dtl),
	 * the code below tries to take the dtl_access_lock.
	 * The dtl_access_lock is a rwlock defined in dtl.h, which is used
	 * to ensure there are no conflicting dtl users.
	 * The vpa_dtl pmu tries to take the write access lock and also
	 * checks dtl_global_refc, to make sure that the dtl_access_lock
	 * is held by the vpa_dtl pmu interface.
	 */
	if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) {
		spin_unlock(&dtl_global_lock);
		return -EBUSY;
	}

	/* Allocate dtl buffer memory */
	if (vpa_dtl_mem_alloc(event->cpu)) {
		/*
		 * Drop dtl_access_lock on the failure path if we took it
		 * above and there are no other active vpa_dtl users.
		 */
		if (dtl_global_refc == 0)
			up_write(&dtl_access_lock);
		spin_unlock(&dtl_global_lock);
		return -ENOMEM;
	}

	/*
	 * Increment the number of active vpa_dtl pmu threads. The
	 * dtl_global_refc is used to keep count of cpu threads that
	 * are currently capturing dtl data using the vpa_dtl pmu interface.
	 */
	dtl_global_refc++;

	spin_unlock(&dtl_global_lock);

	hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
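	/*
	 * For example (illustrative values): sample_freq = 1000 Hz gives
	 * sample_period = NSEC_PER_SEC / 1000 = 1000000 ns, so the hrtimer
	 * fires every 1 ms, the minimum interval enforced by the handler.
	 */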
	if (event->attr.freq) {
		long freq = event->attr.sample_freq;

		event->attr.sample_period = NSEC_PER_SEC / freq;
		hwc->sample_period = event->attr.sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
		hwc->last_period = hwc->sample_period;
		event->attr.freq = 0;
	}

	event->destroy = vpa_dtl_reset_global_refc;
	return 0;
}

static int vpa_dtl_event_add(struct perf_event *event, int flags)
{
	int ret, hwcpu;
	unsigned long addr;
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);

	/*
	 * Register our dtl buffer with the hypervisor. The
	 * HV expects the buffer size to be passed in the second
	 * word of the buffer. Refer to section '14.11.3.2. H_REGISTER_VPA'
	 * of PAPR for more information.
	 */
	((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
	dtl->last_idx = 0;

	hwcpu = get_hard_smp_processor_id(event->cpu);
	addr = __pa(dtl->buf);

	ret = register_dtl(hwcpu, addr);
	if (ret) {
		pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
			event->cpu, hwcpu, ret);
		return ret;
	}

	/* set our initial buffer indices */
	lppaca_of(event->cpu).dtl_idx = 0;

	/*
	 * Ensure that our updates to the lppaca fields have
	 * occurred before we actually enable the logging
	 */
	smp_wmb();

	/* enable event logging */
	lppaca_of(event->cpu).dtl_enable_mask = event->attr.config;

	vpa_dtl_start_hrtimer(event);

	return 0;
}

static void vpa_dtl_event_del(struct perf_event *event, int flags)
{
	int hwcpu = get_hard_smp_processor_id(event->cpu);
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);

	vpa_dtl_stop_hrtimer(event);
	unregister_dtl(hwcpu);
	kmem_cache_free(dtl_cache, dtl->buf);
	dtl->buf = NULL;
	lppaca_of(event->cpu).dtl_enable_mask = 0x0;
}

/*
 * This function definition is intentionally empty, as
 * vpa_dtl_dump_sample_data() is used to parse and dump the dispatch
 * trace log data into the perf data.
 */
static void vpa_dtl_event_read(struct perf_event *event)
{
}

/*
 * Set up pmu-private data structures for an AUX area.
 * **pages contains the aux buffer allocated for this event
 * for the corresponding cpu. rb_alloc_aux() uses "alloc_pages_node"
 * and returns the pointer to each page address. Map these pages to
 * contiguous space using vmap and use that as the base address.
 *
 * The aux private data structure, i.e. "struct vpa_pmu_buf", mainly
 * saves:
 * - buf->base: aux buffer base address
 * - buf->head: offset from the base address where data will be written
 * - buf->size: size of the allocated memory
 */
static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
			       int nr_pages, bool snapshot)
{
	int i, cpu = event->cpu;
	struct vpa_pmu_buf *buf __free(kfree) = NULL;
	struct page **pglist __free(kfree) = NULL;

	/* We need at least one page for this to work. */
	if (!nr_pages)
		return NULL;

	if (cpu == -1)
		cpu = raw_smp_processor_id();

	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
	if (!buf)
		return NULL;

	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
	if (!pglist)
		return NULL;

	for (i = 0; i < nr_pages; ++i)
		pglist[i] = virt_to_page(pages[i]);

	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!buf->base)
		return NULL;

	buf->nr_pages = nr_pages;
	buf->snapshot = false;

	buf->size = nr_pages << PAGE_SHIFT;
	buf->head = 0;
	buf->head_size = 0;
	buf->boottb_freq_saved = 0;
	buf->threshold = ((buf->size - 32) / sizeof(struct dtl_entry));
	return no_free_ptr(buf);
}

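/*
 * Illustrative sizing for vpa_dtl_setup_aux() (example values, 4K pages
 * and a 48-byte struct dtl_entry assumed): nr_pages = 16 gives
 * buf->size = 16 << 12 = 65536 bytes and
 * buf->threshold = (65536 - 32) / 48 = 1364 entries.
 */
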
/*
 * Free pmu-private AUX data structures
 */
static void vpa_dtl_free_aux(void *aux)
{
	struct vpa_pmu_buf *buf = aux;

	vunmap(buf->base);
	kfree(buf);
}

static struct pmu vpa_dtl_pmu = {
	.task_ctx_nr = perf_invalid_context,

	.name = "vpa_dtl",
	.attr_groups = attr_groups,
	.event_init = vpa_dtl_event_init,
	.add = vpa_dtl_event_add,
	.del = vpa_dtl_event_del,
	.read = vpa_dtl_event_read,
	.setup_aux = vpa_dtl_setup_aux,
	.free_aux = vpa_dtl_free_aux,
	.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
};

static int vpa_dtl_init(void)
{
	int r;

	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
		pr_debug("not a shared virtualized system, not enabling\n");
		return -ENODEV;
	}

	/* This driver is intended only for the L1 host. */
	if (is_kvm_guest()) {
		pr_debug("Only supported for an L1 host system\n");
		return -ENODEV;
	}

	r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1);
	if (r)
		return r;

	return 0;
}

device_initcall(vpa_dtl_init);
#endif /* CONFIG_PPC_SPLPAR */