CoCalc -- monitor.c

GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/cpu/resctrl/monitor.c
²⁹²⁷¹ views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
 * Resource Director Technology(RDT)
4
 * - Monitoring code
5
 *
6
 * Copyright (C) 2017 Intel Corporation
7
 *
8
 * Author:
9
 *    Vikas Shivappa <[email protected]>
10
 *
11
 * This replaces the cqm.c based on perf but we reuse a lot of
12
 * code and datastructures originally from Peter Zijlstra and Matt Fleming.
13
 *
14
 * More information about RDT can be found in the Intel (R) x86 Architecture
15
 * Software Developer Manual June 2016, volume 3, section 17.17.
16
 */
17

18
#define pr_fmt(fmt)	"resctrl: " fmt
19

20
#include <linux/cpu.h>
21
#include <linux/resctrl.h>
22

23
#include <asm/cpu_device_id.h>
24
#include <asm/msr.h>
25

26
#include "internal.h"
27

28
/*
29
 * Global boolean for rdt_monitor which is true if any
30
 * resource monitoring is enabled.
31
 */
32
bool rdt_mon_capable;
33

34
/*
 * Convert a floating-point correction factor into a fixed-point integer
 * scaled by 2^20 (1048576). Adding 0.5 before the cast rounds to nearest.
 */
#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))
35

36
/*
 * Number of Sub-NUMA Cluster (SNC) nodes per L3 cache. The default of 1
 * means SNC is not enabled; rdt_get_mon_l3_config() overwrites it with the
 * result of snc_get_config().
 */
static int snc_nodes_per_l3_cache = 1;
37

38
/*
39
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
40
 * If rmid > rmid threshold, MBM total and local values should be multiplied
41
 * by the correction factor.
42
 *
43
 * The original table is modified for better code:
44
 *
45
 * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
46
 *    for the case.
47
 * 2. MBM total and local correction table indexed by core counter which is
48
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
49
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
50
 *    to calculate corrected value by shifting:
51
 *    corrected_value = (original_value * correction_factor) >> 20
52
 */
53
static const struct mbm_correction_factor_table {
54
	u32 rmidthreshold;
55
	u64 cf;
56
} mbm_cf_table[] __initconst = {
57
	{7,	CF(1.000000)},
58
	{15,	CF(1.000000)},
59
	{15,	CF(0.969650)},
60
	{31,	CF(1.000000)},
61
	{31,	CF(1.066667)},
62
	{31,	CF(0.969650)},
63
	{47,	CF(1.142857)},
64
	{63,	CF(1.000000)},
65
	{63,	CF(1.185115)},
66
	{63,	CF(1.066553)},
67
	{79,	CF(1.454545)},
68
	{95,	CF(1.000000)},
69
	{95,	CF(1.230769)},
70
	{95,	CF(1.142857)},
71
	{95,	CF(1.066667)},
72
	{127,	CF(1.000000)},
73
	{127,	CF(1.254863)},
74
	{127,	CF(1.185255)},
75
	{151,	CF(1.000000)},
76
	{127,	CF(1.066667)},
77
	{167,	CF(1.000000)},
78
	{159,	CF(1.454334)},
79
	{183,	CF(1.000000)},
80
	{127,	CF(0.969744)},
81
	{191,	CF(1.280246)},
82
	{191,	CF(1.230921)},
83
	{215,	CF(1.000000)},
84
	{191,	CF(1.143118)},
85
};
86

87
/*
 * RMIDs greater than this threshold have the MBM correction factor applied
 * by get_corrected_mbm_count(). The UINT_MAX default means no RMID is
 * corrected until intel_rdt_mbm_apply_quirk() installs a table entry.
 */
static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
88

89
/*
 * MBM correction factor selected by intel_rdt_mbm_apply_quirk(), stored as
 * a fixed-point value scaled by 2^20.
 */
static u64 mbm_cf __read_mostly;
90

91
/*
 * Apply the MBM correction factor to @val when @rmid is above the quirk
 * threshold, otherwise return @val unchanged.
 *
 * @rmid: RMID the count belongs to.
 * @val:  Accumulated chunk count. Taken as u64 to match the caller's u64
 *        counter and this function's return type; an 'unsigned long'
 *        parameter would truncate both the count and the corrected product
 *        on builds where 'unsigned long' is 32 bits wide.
 *
 * Return: the (possibly corrected) chunk count.
 */
static inline u64 get_corrected_mbm_count(u32 rmid, u64 val)
{
	/* mbm_cf is a fixed-point factor scaled by 2^20, hence the shift. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}
99

100
/*
101
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
102
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
103
 * needed. The physical RMID is the same as the logical RMID.
104
 *
105
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
106
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
107
 * Resource Director Technology Architecture Specification" for a full
108
 * description of RMID sharing mode).
109
 *
110
 * In RMID sharing mode there are fewer "logical RMID" values available
111
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
112
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
113
 * each SNC node.
114
 *
115
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
116
 *
117
 * Data is collected independently on each SNC node and can be retrieved
118
 * using the "physical RMID" value computed by this function and loaded
119
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
120
 *
121
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
122
 * cache.  So a "physical RMID" may be read from any CPU that shares
123
 * the L3 cache with the desired SNC node, not just from a CPU in
124
 * the specific SNC node.
125
 */
126
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
127
{
128
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
129

130
	if (snc_nodes_per_l3_cache == 1)
131
		return lrmid;
132

133
	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
134
}
135

136
/*
 * Read the counter for physical RMID @prmid and event @eventid.
 *
 * Return: 0 with the raw IA32_QM_CTR value stored in @val, -EIO when the
 * Error bit is set, or -EINVAL when the Unavailable bit is set. @val is
 * left untouched on failure.
 */
static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 ctr;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, ctr);

	/* The Error bit takes precedence over the Unavailable bit. */
	if (ctr & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return (ctr & RMID_VAL_ERROR) ? -EIO : -EINVAL;

	*val = ctr;

	return 0;
}
159

160
static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
161
						 u32 rmid,
162
						 enum resctrl_event_id eventid)
163
{
164
	struct arch_mbm_state *state;
165

166
	if (!resctrl_is_mbm_event(eventid))
167
		return NULL;
168

169
	state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];
170

171
	return state ? &state[rmid] : NULL;
172
}
173

174
/*
 * Reset the software MBM state kept for @rmid and @eventid in domain @d.
 * Does nothing when no MBM state exists for the event.
 */
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (!am)
		return;

	memset(am, 0, sizeof(*am));

	/*
	 * Record any initial, non-zero count value. If the read fails,
	 * prev_msr keeps the zero written by the memset() above.
	 */
	__rmid_read_phys(logical_rmid_to_physical_rmid(cpu, rmid), eventid,
			 &am->prev_msr);
}
192

193
/*
194
 * Assumes that hardware counters are also reset and thus that there is
195
 * no need to record initial non-zero counts.
196
 */
197
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
198
{
199
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
200
	enum resctrl_event_id eventid;
201
	int idx;
202

203
	for_each_mbm_event_id(eventid) {
204
		if (!resctrl_is_mon_event_enabled(eventid))
205
			continue;
206
		idx = MBM_STATE_IDX(eventid);
207
		memset(hw_dom->arch_mbm_states[idx], 0,
208
		       sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
209
	}
210
}
211

212
/*
 * Number of chunks counted between two reads of a @width-bit counter,
 * allowing for the counter having wrapped between the reads.
 */
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 unused_bits = 64 - width;
	u64 delta;

	/*
	 * Shifting the difference left discards the bits above the counter
	 * width; shifting back right yields the delta modulo 2^width.
	 */
	delta = (cur_msr - prev_msr) << unused_bits;

	return delta >> unused_bits;
}
219

220
static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
221
			     u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
222
{
223
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
224
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
225
	struct arch_mbm_state *am;
226
	u64 chunks;
227

228
	am = get_arch_mbm_state(hw_dom, rmid, eventid);
229
	if (am) {
230
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
231
						 hw_res->mbm_width);
232
		chunks = get_corrected_mbm_count(rmid, am->chunks);
233
		am->prev_msr = msr_val;
234
	} else {
235
		chunks = msr_val;
236
	}
237

238
	return chunks * hw_res->mon_scale;
239
}
240

241
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
242
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
243
			   u64 *val, void *ignored)
244
{
245
	int cpu = cpumask_any(&d->hdr.cpu_mask);
246
	u64 msr_val;
247
	u32 prmid;
248
	int ret;
249

250
	resctrl_arch_rmid_read_context_check();
251

252
	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
253
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
254
	if (ret)
255
		return ret;
256

257
	*val = get_corrected_val(r, d, rmid, eventid, msr_val);
258

259
	return 0;
260
}
261

262
/*
 * Read the contents of the counter identified by @cntr_id.
 *
 * Return: 0 with the raw QM_CTR value stored in @val, -EIO when
 * RMID_VAL_ERROR is set, or -EINVAL when RMID_VAL_UNAVAIL is set. @val is
 * left untouched on failure.
 */
static int __cntr_id_read(u32 cntr_id, u64 *val)
{
	u64 msr_val;

	/*
	 * QM_EVTSEL Register definition:
	 * =======================================================
	 * Bits    Mnemonic        Description
	 * =======================================================
	 * 63:44   --              Reserved
	 * 43:32   RMID            RMID or counter ID in ABMC mode
	 *                         when reading an MBM event
	 * 31      ExtendedEvtID   Extended Event Identifier
	 * 30:8    --              Reserved
	 * 7:0     EvtID           Event Identifier
	 * =======================================================
	 * The contents of a specific counter can be read by setting the
	 * following fields in QM_EVTSEL.ExtendedEvtID(=1) and
	 * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID
	 * to the desired counter ID. Reading the QM_CTR then returns the
	 * contents of the specified counter. The RMID_VAL_ERROR bit is set
	 * if the counter configuration is invalid, or if an invalid counter
	 * ID is set in the QM_EVTSEL.RMID field.  The RMID_VAL_UNAVAIL bit
	 * is set if the counter data is unavailable.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
	/* Use the same 64-bit MSR read accessor as __rmid_read_phys(). */
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}
298

299
/*
 * Reset the software MBM state kept for @rmid and @eventid in domain @d,
 * using counter @cntr_id as the new baseline. Does nothing when no MBM
 * state exists for the event.
 */
void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid, int cntr_id,
			     enum resctrl_event_id eventid)
{
	struct arch_mbm_state *am;

	am = get_arch_mbm_state(resctrl_to_arch_mon_dom(d), rmid, eventid);
	if (!am)
		return;

	memset(am, 0, sizeof(*am));

	/*
	 * Record any initial, non-zero count value. If the read fails,
	 * prev_msr keeps the zero written by the memset() above.
	 */
	__cntr_id_read(cntr_id, &am->prev_msr);
}
314

315
/*
 * Read counter @cntr_id, which tracks @eventid for @rmid in domain @d.
 *
 * Return: 0 with the corrected and scaled value stored in @val, or the
 * error from __cntr_id_read(), in which case @val is not written.
 */
int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, int cntr_id,
			   enum resctrl_event_id eventid, u64 *val)
{
	u64 raw;
	int err = __cntr_id_read(cntr_id, &raw);

	if (!err)
		*val = get_corrected_val(r, d, rmid, eventid, raw);

	return err;
}
330

331
/*
332
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
333
 * which indicates that RMIDs are configured in legacy mode.
334
 * This mode is incompatible with Linux resctrl semantics
335
 * as RMIDs are partitioned between SNC nodes, which requires
336
 * a user to know which RMID is allocated to a task.
337
 * Clearing bit 0 reconfigures the RMID counters for use
338
 * in RMID sharing mode. This mode is better for Linux.
339
 * The RMID space is divided between all SNC nodes with the
340
 * RMIDs renumbered to start from zero in each node when
341
 * counting operations from tasks. Code to read the counters
342
 * must adjust RMID counter numbers based on SNC node. See
343
 * logical_rmid_to_physical_rmid() for code that does this.
344
 */
345
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
346
{
347
	if (snc_nodes_per_l3_cache > 1)
348
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
349
}
350

351
/* CPU models that support MSR_RMID_SNC_CONFIG */
352
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X,		0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X,	0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X,	0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X,	0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X,	0),
	{}	/* empty terminating entry */
};
360

361
/*
362
 * There isn't a simple hardware bit that indicates whether a CPU is running
363
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
364
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
365
 * the same NUMA node as CPU0.
366
 * It is not possible to accurately determine SNC state if the system is
367
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
368
 * to L3 caches. It will be OK if system is booted with hyperthreading
369
 * disabled (since this doesn't affect the ratio).
370
 */
371
static __init int snc_get_config(void)
372
{
373
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
374
	const cpumask_t *node0_cpumask;
375
	int cpus_per_node, cpus_per_l3;
376
	int ret;
377

378
	if (!x86_match_cpu(snc_cpu_ids) || !ci)
379
		return 1;
380

381
	cpus_read_lock();
382
	if (num_online_cpus() != num_present_cpus())
383
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
384
	cpus_read_unlock();
385

386
	node0_cpumask = cpumask_of_node(cpu_to_node(0));
387

388
	cpus_per_node = cpumask_weight(node0_cpumask);
389
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
390

391
	if (!cpus_per_node || !cpus_per_l3)
392
		return 1;
393

394
	ret = cpus_per_l3 / cpus_per_node;
395

396
	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
397
	switch (ret) {
398
	case 1:
399
		break;
400
	case 2 ... 4:
401
	case 6:
402
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
403
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
404
		break;
405
	default:
406
		pr_warn("Ignore improbable SNC node count %d\n", ret);
407
		ret = 1;
408
		break;
409
	}
410

411
	return ret;
412
}
413

414
/*
 * Fill in the L3 monitoring parameters of @r from boot CPU data and CPUID,
 * and mark the resource as monitoring capable.
 *
 * Return: always 0.
 */
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int width_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	unsigned int rmid_share;
	u32 eax, ebx, ecx, edx;

	snc_nodes_per_l3_cache = snc_get_config();

	/* The occupancy scale and the RMID count are divided between SNC nodes. */
	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;

	/* Start from the base counter width and widen it by a sane offset. */
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
	if (width_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");
	else if (width_offset > 0)
		hw_res->mbm_width += width_offset;

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	rmid_share = resctrl_rmid_realloc_limit / r->mon.num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(rmid_share);

	if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	if (rdt_cpu_has(X86_FEATURE_ABMC)) {
		cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
		/* The low 16 bits of EBX hold the counter count minus one. */
		r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
		r->mon.mbm_cntr_assignable = true;
		hw_res->mbm_cntr_assign_enabled = true;
	}

	r->mon_capable = true;

	return 0;
}
466

467
/*
 * Select the MBM correction factor and RMID threshold that match this
 * CPU's RMID count. When the table has no matching entry, the defaults are
 * kept and no correction is applied.
 */
void __init intel_rdt_mbm_apply_quirk(void)
{
	const struct mbm_correction_factor_table *entry;
	int cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;

	/*
	 * NOTE(review): presumably ARRAY_SIZE() is unsigned here, so a
	 * negative cf_index is rejected by this comparison as well - confirm.
	 */
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	entry = &mbm_cf_table[cf_index];
	mbm_cf_rmidthreshold = entry->rmidthreshold;
	mbm_cf = entry->cf;
}
480

481
static void resctrl_abmc_set_one_amd(void *arg)
482
{
483
	bool *enable = arg;
484

485
	if (*enable)
486
		msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
487
	else
488
		msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
489
}
490

491
/*
492
 * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
493
 * associated with all monitor domains.
494
 */
495
static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
{
	struct rdt_mon_domain *dom;

	lockdep_assert_cpus_held();

	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
		/* Update the MSR on every CPU of the domain and wait for completion. */
		on_each_cpu_mask(&dom->hdr.cpu_mask, resctrl_abmc_set_one_amd,
				 &enable, 1);
		/* Clear the MBM state kept for this domain after the switch. */
		resctrl_arch_reset_rmid_all(r, dom);
	}
}
507

508
/*
 * Switch counter assignment mode on or off for @r. Nothing is done when the
 * resource does not support assignable counters or is already in the
 * requested state.
 *
 * Return: always 0.
 */
int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

	if (!r->mon.mbm_cntr_assignable)
		return 0;

	if (hw_res->mbm_cntr_assign_enabled == enable)
		return 0;

	_resctrl_abmc_enable(r, enable);
	hw_res->mbm_cntr_assign_enabled = enable;

	return 0;
}
520

521
bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
522
{
523
	return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
524
}
525

526
static void resctrl_abmc_config_one_amd(void *info)
527
{
528
	union l3_qos_abmc_cfg *abmc_cfg = info;
529

530
	wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
531
}
532

533
/*
534
 * Send an IPI to the domain to assign the counter to RMID, event pair.
535
 */
536
void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
			      u32 cntr_id, bool assign)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	union l3_qos_abmc_cfg cfg = { 0 };
	struct arch_mbm_state *am;

	cfg.split.cfg_en = 1;
	cfg.split.cntr_id = cntr_id;
	cfg.split.bw_src = rmid;
	cfg.split.cntr_en = !!assign;
	/* The bandwidth type is only filled in when assigning a counter. */
	if (assign)
		cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);

	/* Run the MSR write on one CPU of the domain and wait for it. */
	smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &cfg, 1);

	/*
	 * The hardware counter is reset (because cfg_en == 1) so there is no
	 * need to record initial non-zero counts.
	 */
	am = get_arch_mbm_state(hw_dom, rmid, evtid);
	if (!am)
		return;

	memset(am, 0, sizeof(*am));
}
561

562
void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
563
{
564
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
565

566
	resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
567
}
568

569
Product

Resources

Company