GitHub Repository: torvalds/linux
Path: blob/master/tools/sched_ext/scx_qmap.bpf.c
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <[email protected]>
 * Copyright (c) 2022 David Vernet <[email protected]>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS	= 1000000000,
	SHARED_DSQ	= 0,
	HIGHPRI_DSQ	= 1,
	HIGHPRI_WEIGHT	= 8668,		/* this is what -20 maps to */
};
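
/*
 * Where 8668 comes from, assuming the default sched_prio_to_weight[] table:
 * nice -20 has load weight 88761 and nice 0 has 1024, and sched_ext scales
 * task weights so that nice 0 == 100, i.e. 88761 * 100 / 1024 == 8668.
 */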

char _license[] SEC("license") = "GPL";

const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
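
/*
 * The const volatile globals above live in the skeleton's .rodata and are
 * meant to be filled in by the userspace loader before the program is
 * loaded. A minimal sketch of that side, assuming the libbpf skeleton
 * generated from this file (the companion scx_qmap.c loader does the
 * equivalent; the values here are illustrative only):
 *
 *	struct scx_qmap *skel = scx_qmap__open();
 *
 *	skel->rodata->slice_ns = 20 * 1000 * 1000;
 *	skel->rodata->dsp_batch = 8;
 *	scx_qmap__load(skel);
 */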

u64 nr_highpri_queued;
u32 test_error_cnt;

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};
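
/*
 * Note on queue_arr: inner maps referenced from a BPF_MAP_TYPE_ARRAY_OF_MAPS
 * must have matching definitions, which is why the five FIFOs (and
 * dump_store) are all declared from the single struct qmap template above.
 * A lookup yields a handle to the inner queue map:
 *
 *	void *fifo = bpf_map_lookup_elem(&queue_arr, &idx);
 *
 *	if (fifo)
 *		bpf_map_push_elem(fifo, &pid, 0);
 */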

/*
 * If enabled, the CPU performance target is set according to the queue index,
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool	force_local;	/* Dispatch directly to local_dsq */
	bool	highpri;
	u64	core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64	dsp_idx;	/* dispatch index */
	u64	dsp_cnt;	/* remaining count */
	u32	avg_weight;
	u32	cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (p->nr_cpus_allowed == 1 ||
	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
		scx_bpf_error("task_ctx lookup failed");
		return NULL;
	}
	return tctx;
}

s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return -ESRCH;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}
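
/*
 * For example, with the default load-weight scaling (nice 0 == weight 100):
 * a nice 19 task (weight ~1) maps to queue0, a nice 0 task (weight 100) to
 * queue2 and a nice -20 task (weight 8668) to queue4.
 */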

void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * If qmap_select_cpu() is telling us to, or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}
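
/*
 * Note that the FIFOs above hold bare PIDs rather than task pointers, so a
 * queued task may exit before it's dispatched. qmap_dispatch() copes by
 * skipping PIDs for which bpf_task_from_pid() no longer resolves to a task.
 */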

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() can be called both from ops.dispatch()
 * and from BPF programs that don't hold an rq lock. As a demonstration, this
 * function is called from both qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			__COMPAT_scx_bpf_dsq_move_set_slice(
				BPF_FOR_EACH_ITER, slice_ns * 2);
			__COMPAT_scx_bpf_dsq_move_set_vtime(
				BPF_FOR_EACH_ITER, highpri_seq++);
			__COMPAT_scx_bpf_dsq_move_vtime(
				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | cpu,
					      SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}
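
/*
 * Note that dispatch_highpri() reports success only when a task was moved
 * into this CPU's local DSQ. Tasks expedited to other CPUs are counted but
 * don't stop the caller from continuing to dispatch for this CPU.
 */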

void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd, which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}
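		/* dsp_cnt is now 1, 2, 4, 8 or 16 for queue0 through queue4. */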

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
		}

		tctx->core_sched_seq =
			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx) {
		scx_bpf_error("task_ctx lookup failed");
		return 0;
	}

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As the queue index increments, the priority doubles. The queue with
	 * index 3 is dispatched twice as frequently as index 2. Reflect the
	 * difference by scaling qdists accordingly. Note that the shift amount
	 * needs to be flipped depending on the sign to avoid flipping the
	 * priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}
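
/*
 * Worked example: a queue2 task 3 slots behind its queue's head has a scaled
 * qdist of 3 << (4 - 2) == 12, while a queue4 task 8 slots behind has
 * 8 << 0 == 8. Per the comment above, the lower value wins, so the queue4
 * task is treated as older/higher priority even though its raw distance is
 * larger - consistent with queue4 being dispatched 4x as often as queue2.
 */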

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode roughly the same ordering
 * as the regular scheduling path. Use the priority-scaled distances from the
 * head of the queues to compare the two tasks, which should be consistent with
 * the dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
		__sync_fetch_and_add(&nr_reenqueued, cnt);
}

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked at any time and there is no way to
		 * iterate the queue non-destructively. Pop into dump_store,
		 * then restore afterwards. If racing against new enqueues,
		 * ordering can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}
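
/*
 * Example trace_pipe output on a 16-CPU machine with CPUs 8-15 offlined
 * ('O' == online, 'X' == possible but offline, '|' after every 8 CPUs):
 *
 *	CPUS: |OOOOOOOO|XXXXXXXX|
 */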

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}
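
/*
 * Example of the scaling above: a little core with cap 512 running at its
 * full relative performance (cur == SCX_CPUPERF_ONE) contributes 512 to
 * cur_sum, the same as a full-capacity core running at half performance.
 * cpuperf_avg thus weighs each CPU's current level by its capacity.
 */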

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}
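
/*
 * The monitoring timer is self-rearming: qmap_init() below arms it once and
 * monitor_timerfn() restarts it at the end of every invocation, yielding the
 * once-a-second monitoring loop.
 */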

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		return ret;

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);

	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu = (void *)qmap_select_cpu,
	       .enqueue = (void *)qmap_enqueue,
	       .dequeue = (void *)qmap_dequeue,
	       .dispatch = (void *)qmap_dispatch,
	       .tick = (void *)qmap_tick,
	       .core_sched_before = (void *)qmap_core_sched_before,
	       .cpu_release = (void *)qmap_cpu_release,
	       .init_task = (void *)qmap_init_task,
	       .dump = (void *)qmap_dump,
	       .dump_cpu = (void *)qmap_dump_cpu,
	       .dump_task = (void *)qmap_dump_task,
	       .cgroup_init = (void *)qmap_cgroup_init,
	       .cgroup_set_weight = (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
	       .cpu_online = (void *)qmap_cpu_online,
	       .cpu_offline = (void *)qmap_cpu_offline,
	       .init = (void *)qmap_init,
	       .exit = (void *)qmap_exit,
	       .timeout_ms = 5000U,
	       .name = "qmap");