xref: /linux-6.15/include/linux/sched/ext.h (revision fa48e8d2)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
9 #ifndef _LINUX_SCHED_EXT_H
10 #define _LINUX_SCHED_EXT_H
11 
12 #ifdef CONFIG_SCHED_CLASS_EXT
13 
14 #include <linux/llist.h>
15 #include <linux/rhashtable-types.h>
16 
17 enum scx_public_consts {
18 	SCX_OPS_NAME_LEN	= 128,
19 
20 	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
21 	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
22 };
23 
/*
 * DSQ (dispatch queue) IDs are 64bit of the format:
 *
 *   Bits: [63] [62 ..  0]
 *         [ B] [   ID   ]
 *
 *    B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
 *   ID: 63 bit ID
 *
 * Built-in IDs:
 *
 *   Bits: [63] [62] [61..32] [31 ..  0]
 *         [ 1] [ L] [   R  ] [    V   ]
 *
 *    1: 1 for built-in DSQs.
 *    L: 1 for LOCAL_ON DSQ IDs, 0 for others
 *    R: not described here and not set by any constant below; presumably
 *       reserved - confirm before assigning meaning to these bits.
 *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
 *
 * SCX_DSQ_LOCAL_CPU_MASK covers exactly the 32 V bits, so masking a LOCAL_ON
 * ID with it yields the CPU number. The values do not fit in an int; the enum
 * relies on the compiler accepting 64bit enumerators.
 */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,

	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};

53 /*
54  * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
55  * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
56  * buffer between the scheduler core and the BPF scheduler. See the
57  * documentation for more details.
58  */
59 struct scx_dispatch_q {
60 	raw_spinlock_t		lock;
61 	struct list_head	list;	/* tasks in dispatch order */
62 	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
63 	u32			nr;
64 	u64			id;
65 	struct rhash_head	hash_node;
66 	struct llist_node	free_node;
67 	struct rcu_head		rcu;
68 };
69 
/*
 * Bits for sched_ext_entity.flags.
 *
 * Bits 0-3 are independent boolean flags. A SCX_TASK_STATE_BITS wide field
 * starting at bit SCX_TASK_STATE_SHIFT carries an enum scx_task_state value;
 * extract it with SCX_TASK_STATE_MASK. Bit 31, the top bit of the u32 flags
 * word, marks an entry that is an iteration cursor rather than a task.
 */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	SCX_TASK_BAL_KEEP	= 1 << 1, /* balance decided to keep current */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
	SCX_TASK_STATE_BITS	= 2,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
};

/*
 * Task lifecycle state, stored in the SCX_TASK_STATE_MASK bits of
 * sched_ext_entity.flags. The four states (0-3) fit the two bits reserved by
 * SCX_TASK_STATE_BITS; SCX_TASK_NR_STATES is a count, not a state.
 */
enum scx_task_state {
	SCX_TASK_NONE,		/* ops.init_task() not called yet */
	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */

	SCX_TASK_NR_STATES,
};

/* Bits for scx_dsq_node.flags, i.e. sched_ext_entity.dsq_node.flags */
enum scx_ent_dsq_flags {
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
};

/*
 * Mask bits for sched_ext_entity.kf_mask. Not all kfuncs can be called from
 * everywhere and the following bits track which kfunc sets are currently
 * allowed for %current. This simple per-task tracking works because SCX ops
 * nest in a limited way. BPF will likely implement a way to allow and disallow
 * kfuncs depending on the calling context which will replace this manual
 * mechanism. See scx_kf_allow().
 *
 * __SCX_KF_RQ_LOCKED is the union of every mask except SCX_KF_SLEEPABLE.
 * __SCX_KF_TERMINAL groups ENQUEUE, SELECT_CPU and REST; going by the nesting
 * notes below these are presumably the contexts nothing else nests inside -
 * confirm against scx_kf_allow().
 */
enum scx_kf_mask {
	SCX_KF_UNLOCKED		= 0,	  /* not sleepable, not rq locked */
	/* all non-sleepables may be nested inside SLEEPABLE */
	SCX_KF_SLEEPABLE	= 1 << 0, /* sleepable init operations */
	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
	SCX_KF_CPU_RELEASE	= 1 << 1, /* ops.cpu_release() */
	/* ops.dequeue (in REST) may be nested inside DISPATCH */
	SCX_KF_DISPATCH		= 1 << 2, /* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1 << 3, /* ops.enqueue() and ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 1 << 4, /* ops.select_cpu() */
	SCX_KF_REST		= 1 << 5, /* other rq-locked operations */

	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};

124 struct scx_dsq_node {
125 	struct list_head	list;		/* dispatch order */
126 	struct rb_node		priq;		/* p->scx.dsq_vtime order */
127 	u32			flags;		/* SCX_TASK_DSQ_* flags */
128 };
129 
130 /*
131  * The following is embedded in task_struct and contains all fields necessary
132  * for a task to be scheduled by SCX.
133  */
134 struct sched_ext_entity {
135 	struct scx_dispatch_q	*dsq;
136 	struct scx_dsq_node	dsq_node;	/* protected by dsq lock */
137 	u32			flags;		/* protected by rq lock */
138 	u32			weight;
139 	s32			sticky_cpu;
140 	s32			holding_cpu;
141 	u32			kf_mask;	/* see scx_kf_mask above */
142 	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
143 	atomic_long_t		ops_state;
144 
145 	struct list_head	runnable_node;	/* rq->scx.runnable_list */
146 	unsigned long		runnable_at;
147 
148 #ifdef CONFIG_SCHED_CORE
149 	u64			core_sched_at;	/* see scx_prio_less() */
150 #endif
151 	u64			ddsp_dsq_id;
152 	u64			ddsp_enq_flags;
153 
154 	/* BPF scheduler modifiable fields */
155 
156 	/*
157 	 * Runtime budget in nsecs. This is usually set through
158 	 * scx_bpf_dispatch() but can also be modified directly by the BPF
159 	 * scheduler. Automatically decreased by SCX as the task executes. On
160 	 * depletion, a scheduling event is triggered.
161 	 *
162 	 * This value is cleared to zero if the task is preempted by
163 	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
164 	 * task ran. Use p->se.sum_exec_runtime instead.
165 	 */
166 	u64			slice;
167 
168 	/*
169 	 * Used to order tasks when dispatching to the vtime-ordered priority
170 	 * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
171 	 * but can also be modified directly by the BPF scheduler. Modifying it
172 	 * while a task is queued on a dsq may mangle the ordering and is not
173 	 * recommended.
174 	 */
175 	u64			dsq_vtime;
176 
177 	/*
178 	 * If set, reject future sched_setscheduler(2) calls updating the policy
179 	 * to %SCHED_EXT with -%EACCES.
180 	 *
181 	 * If set from ops.init_task() and the task's policy is already
182 	 * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
183 	 * or by inhering the parent's policy during fork, the task's policy is
184 	 * rejected and forcefully reverted to %SCHED_NORMAL. The number of
185 	 * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected.
186 	 */
187 	bool			disallow;	/* reject switching into SCX */
188 
189 	/* cold fields */
190 	/* must be the last field, see init_scx_entity() */
191 	struct list_head	tasks_node;
192 };
193 
/* Declared only when CONFIG_SCHED_CLASS_EXT is set; the definition is not in this header. */
void sched_ext_free(struct task_struct *p);
/*
 * Declared only when CONFIG_SCHED_CLASS_EXT is set; the definition is not in
 * this header. @log_lvl is presumably a printk log-level prefix - confirm.
 */
void print_scx_info(const char *log_lvl, struct task_struct *p);

197 #else	/* !CONFIG_SCHED_CLASS_EXT */
198 
/* No-op stand-in so callers need no #ifdef when CONFIG_SCHED_CLASS_EXT is off. */
static inline void sched_ext_free(struct task_struct *p) {}
/* No-op stand-in so callers need no #ifdef when CONFIG_SCHED_CLASS_EXT is off. */
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}

202 #endif	/* CONFIG_SCHED_CLASS_EXT */
203 #endif	/* _LINUX_SCHED_EXT_H */
204