1a7a9fc54STejun Heo /* SPDX-License-Identifier: GPL-2.0 */
2f0e1a064STejun Heo /*
3fa48e8d2STejun Heo * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
4fa48e8d2STejun Heo *
5f0e1a064STejun Heo * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
6f0e1a064STejun Heo * Copyright (c) 2022 Tejun Heo <[email protected]>
7f0e1a064STejun Heo * Copyright (c) 2022 David Vernet <[email protected]>
8f0e1a064STejun Heo */
9a7a9fc54STejun Heo #ifndef _LINUX_SCHED_EXT_H
10a7a9fc54STejun Heo #define _LINUX_SCHED_EXT_H
11a7a9fc54STejun Heo
12a7a9fc54STejun Heo #ifdef CONFIG_SCHED_CLASS_EXT
13f0e1a064STejun Heo
14f0e1a064STejun Heo #include <linux/llist.h>
15f0e1a064STejun Heo #include <linux/rhashtable-types.h>
16f0e1a064STejun Heo
/*
 * Constants shared between the scheduler core and BPF schedulers.
 */
enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,			/* size of the ops (BPF scheduler) name buffer */

	/* default time slice handed to a task on dispatch, in nsecs */
	SCX_SLICE_DFL		= 20 * 1000000,		/* 20ms */
	/* sentinel slice meaning "run until preempted" */
	SCX_SLICE_INF		= U64_MAX,		/* infinite, implies nohz */
};
23f0e1a064STejun Heo
24f0e1a064STejun Heo /*
25f0e1a064STejun Heo * DSQ (dispatch queue) IDs are 64bit of the format:
26f0e1a064STejun Heo *
27f0e1a064STejun Heo * Bits: [63] [62 .. 0]
28f0e1a064STejun Heo * [ B] [ ID ]
29f0e1a064STejun Heo *
30f0e1a064STejun Heo * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
31f0e1a064STejun Heo * ID: 63 bit ID
32f0e1a064STejun Heo *
33f0e1a064STejun Heo * Built-in IDs:
34f0e1a064STejun Heo *
35f0e1a064STejun Heo * Bits: [63] [62] [61..32] [31 .. 0]
36f0e1a064STejun Heo * [ 1] [ L] [ R ] [ V ]
37f0e1a064STejun Heo *
 * 1: 1 for built-in DSQs.
 * L: 1 for LOCAL_ON DSQ IDs, 0 for others
 * R: Reserved bits, currently unused by any defined DSQ ID.
 * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
41f0e1a064STejun Heo */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,	/* bit 63: built-in vs user-created DSQ */
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,	/* bit 62: LOCAL_ON encoding, see below */

	/* built-in DSQ IDs, per the format documented above */
	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	/* OR in a CPU number (masked by SCX_DSQ_LOCAL_CPU_MASK) to target that CPU's local DSQ */
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,	/* V field: CPU number for LOCAL_ON IDs */
};
52f0e1a064STejun Heo
53f0e1a064STejun Heo /*
5406e51be3STejun Heo * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
5506e51be3STejun Heo * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
5606e51be3STejun Heo * buffer between the scheduler core and the BPF scheduler. See the
5706e51be3STejun Heo * documentation for more details.
58f0e1a064STejun Heo */
/*
 * A dispatch queue. Tasks hang off @list in FIFO order and/or @priq ordered by
 * p->scx.dsq_vtime. Field order is layout-sensitive; do not reorder.
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;		/* protects the queue contents */
	struct list_head	list;		/* tasks in dispatch order */
	struct rb_root		priq;		/* used to order by p->scx.dsq_vtime */
	u32			nr;		/* number of queued tasks */
	u32			seq;		/* used by BPF iter */
	u64			id;		/* DSQ ID, see scx_dsq_id_flags */
	struct rhash_head	hash_node;	/* presumably links user DSQs into a global rhashtable -- confirm in ext.c */
	struct llist_node	free_node;	/* presumably for deferred freeing via an llist -- confirm in ext.c */
	struct rcu_head		rcu;		/* RCU-deferred destruction */
};
70f0e1a064STejun Heo
71f0e1a064STejun Heo /* scx_entity.flags */
/* scx_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	/* NOTE(review): bit 1 is unused -- looks like a historical gap; confirm before reusing */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
	SCX_TASK_STATE_BITS	= 2,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	/* NOTE(review): 1 << 31 lands in the sign bit of int; kernel code relies on this working */
	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
};
83f0e1a064STejun Heo
84f0e1a064STejun Heo /* scx_entity.flags & SCX_TASK_STATE_MASK */
/*
 * Per-task init/enable state machine, carried in
 * scx_entity.flags & SCX_TASK_STATE_MASK. Values must fit in
 * SCX_TASK_STATE_BITS bits, so SCX_TASK_NR_STATES must stay <= 4.
 */
enum scx_task_state {
	SCX_TASK_NONE,		/* ops.init_task() not called yet */
	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */

	SCX_TASK_NR_STATES,
};
93f0e1a064STejun Heo
9406e51be3STejun Heo /* scx_entity.dsq_flags */
9506e51be3STejun Heo enum scx_ent_dsq_flags {
9606e51be3STejun Heo SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */
9706e51be3STejun Heo };
9806e51be3STejun Heo
99f0e1a064STejun Heo /*
100f0e1a064STejun Heo * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
101f0e1a064STejun Heo * everywhere and the following bits track which kfunc sets are currently
102f0e1a064STejun Heo * allowed for %current. This simple per-task tracking works because SCX ops
103f0e1a064STejun Heo * nest in a limited way. BPF will likely implement a way to allow and disallow
104f0e1a064STejun Heo * kfuncs depending on the calling context which will replace this manual
105f0e1a064STejun Heo * mechanism. See scx_kf_allow().
106f0e1a064STejun Heo */
enum scx_kf_mask {
	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
	/* ops.dequeue (in REST) may be nested inside DISPATCH */
	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */

	/* all contexts that run with the rq lock held */
	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	/* contexts which may not nest further ops calls -- see SCX_CALL_OP_TASK() */
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};
121f0e1a064STejun Heo
/* scx_dsq_list_node.flags */
enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,	/* node is a BPF iterator cursor, not a task */

	/* high 16 bits can be for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};
1286462dd53STejun Heo
/*
 * Link node on a DSQ's FIFO list. Embedded both in tasks (sched_ext_entity)
 * and in iterator cursors, distinguished by SCX_DSQ_LNODE_ITER_CURSOR.
 */
struct scx_dsq_list_node {
	struct list_head	node;
	u32			flags;	/* see scx_dsq_lnode_flags */
	u32			priv;	/* can be used by iter cursor */
};
13406e51be3STejun Heo
135f0e1a064STejun Heo /*
136f0e1a064STejun Heo * The following is embedded in task_struct and contains all fields necessary
137f0e1a064STejun Heo * for a task to be scheduled by SCX.
138f0e1a064STejun Heo */
struct sched_ext_entity {
	struct scx_dispatch_q	*dsq;		/* DSQ the task is currently on, if any */
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;	/* snapshot of dsq->seq -- presumably taken on queueing for BPF iteration; confirm */
	u32			dsq_flags;	/* see scx_ent_dsq_flags, protected by DSQ lock */
	u32			flags;		/* see scx_ent_flags, protected by rq lock */
	u32			weight;		/* load weight -- presumably derived from the task's priority; confirm in ext.c */
	s32			sticky_cpu;	/* semantics defined in ext.c */
	s32			holding_cpu;	/* semantics defined in ext.c */
	s32			selected_cpu;	/* NOTE(review): name suggests the CPU picked by ops.select_cpu() -- confirm */
	u32			kf_mask;	/* see scx_kf_mask above */
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
	atomic_long_t		ops_state;	/* opaque ops-path state word -- semantics in ext.c */

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;	/* presumably jiffies when the task became runnable; confirm */

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif
	/* ddsp_*: direct-dispatch parameters, presumably staged between ops
	 * calls -- confirm usage in ext.c */
	u64			ddsp_dsq_id;
	u64			ddsp_enq_flags;

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dispatch() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
	 * but can also be modified directly by the BPF scheduler. Modifying it
	 * while a task is queued on a dsq may mangle the ordering and is not
	 * recommended.
	 */
	u64			dsq_vtime;

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the policy
	 * to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task's policy is rejected and forcefully
	 * reverted to %SCHED_NORMAL. The number of such events are reported
	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
	 * during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;	/* presumably tracks source cgroup during migration; confirm */
#endif
	struct list_head	tasks_node;	/* presumably links into a global task list; confirm in ext.c */
};
205f0e1a064STejun Heo
/* Free per-task SCX state -- presumably called during task teardown; confirm call site. */
void sched_ext_free(struct task_struct *p);
/* Print SCX debug info for @p prefixed with @log_lvl -- confirm callers for exact format. */
void print_scx_info(const char *log_lvl, struct task_struct *p);
/* Notify SCX of a soft lockup; @dur_s is presumably the duration in seconds -- confirm. */
void scx_softlockup(u32 dur_s);
209f0e1a064STejun Heo
210a7a9fc54STejun Heo #else /* !CONFIG_SCHED_CLASS_EXT */
211a7a9fc54STejun Heo
/* No-op stubs so callers don't need to ifdef on CONFIG_SCHED_CLASS_EXT. */
static inline void sched_ext_free(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
215a7a9fc54STejun Heo
216a7a9fc54STejun Heo #endif /* CONFIG_SCHED_CLASS_EXT */
217a7a9fc54STejun Heo #endif /* _LINUX_SCHED_EXT_H */
218