1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #ifndef _LINUX_SCHED_H 3 #define _LINUX_SCHED_H 4 5 /* 6 * Define 'struct task_struct' and provide the main scheduler 7 * APIs (schedule(), wakeup variants, etc.) 8 */ 9 10 #include <uapi/linux/sched.h> 11 12 #include <asm/current.h> 13 14 #include <linux/pid.h> 15 #include <linux/sem.h> 16 #include <linux/shm.h> 17 #include <linux/kmsan_types.h> 18 #include <linux/mutex.h> 19 #include <linux/plist.h> 20 #include <linux/hrtimer.h> 21 #include <linux/irqflags.h> 22 #include <linux/seccomp.h> 23 #include <linux/nodemask.h> 24 #include <linux/rcupdate.h> 25 #include <linux/refcount.h> 26 #include <linux/resource.h> 27 #include <linux/latencytop.h> 28 #include <linux/sched/prio.h> 29 #include <linux/sched/types.h> 30 #include <linux/signal_types.h> 31 #include <linux/syscall_user_dispatch.h> 32 #include <linux/mm_types_task.h> 33 #include <linux/task_io_accounting.h> 34 #include <linux/posix-timers.h> 35 #include <linux/rseq.h> 36 #include <linux/seqlock.h> 37 #include <linux/kcsan.h> 38 #include <linux/rv.h> 39 #include <linux/livepatch_sched.h> 40 #include <asm/kmap_size.h> 41 42 /* task_struct member predeclarations (sorted alphabetically): */ 43 struct audit_context; 44 struct bio_list; 45 struct blk_plug; 46 struct bpf_local_storage; 47 struct bpf_run_ctx; 48 struct capture_control; 49 struct cfs_rq; 50 struct fs_struct; 51 struct futex_pi_state; 52 struct io_context; 53 struct io_uring_task; 54 struct mempolicy; 55 struct nameidata; 56 struct nsproxy; 57 struct perf_event_context; 58 struct pid_namespace; 59 struct pipe_inode_info; 60 struct rcu_node; 61 struct reclaim_state; 62 struct robust_list_head; 63 struct root_domain; 64 struct rq; 65 struct sched_attr; 66 struct sched_dl_entity; 67 struct seq_file; 68 struct sighand_struct; 69 struct signal_struct; 70 struct task_delay_info; 71 struct task_group; 72 struct task_struct; 73 struct user_event_mm; 74 75 /* 76 * Task state bitmask. NOTE! 
These bits are also 77 * encoded in fs/proc/array.c: get_task_state(). 78 * 79 * We have two separate sets of flags: task->__state 80 * is about runnability, while task->exit_state are 81 * about the task exiting. Confusing, but this way 82 * modifying one set can't modify the other one by 83 * mistake. 84 */ 85 86 /* Used in tsk->__state: */ 87 #define TASK_RUNNING 0x00000000 88 #define TASK_INTERRUPTIBLE 0x00000001 89 #define TASK_UNINTERRUPTIBLE 0x00000002 90 #define __TASK_STOPPED 0x00000004 91 #define __TASK_TRACED 0x00000008 92 /* Used in tsk->exit_state: */ 93 #define EXIT_DEAD 0x00000010 94 #define EXIT_ZOMBIE 0x00000020 95 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) 96 /* Used in tsk->__state again: */ 97 #define TASK_PARKED 0x00000040 98 #define TASK_DEAD 0x00000080 99 #define TASK_WAKEKILL 0x00000100 100 #define TASK_WAKING 0x00000200 101 #define TASK_NOLOAD 0x00000400 102 #define TASK_NEW 0x00000800 103 #define TASK_RTLOCK_WAIT 0x00001000 104 #define TASK_FREEZABLE 0x00002000 105 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) 106 #define TASK_FROZEN 0x00008000 107 #define TASK_STATE_MAX 0x00010000 108 109 #define TASK_ANY (TASK_STATE_MAX-1) 110 111 /* 112 * DO NOT ADD ANY NEW USERS ! 
113 */ 114 #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) 115 116 /* Convenience macros for the sake of set_current_state: */ 117 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) 118 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) 119 #define TASK_TRACED __TASK_TRACED 120 121 #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) 122 123 /* Convenience macros for the sake of wake_up(): */ 124 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) 125 126 /* get_task_state(): */ 127 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ 128 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ 129 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ 130 TASK_PARKED) 131 132 #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) 133 134 #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) 135 #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) 136 #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) 137 138 /* 139 * Special states are those that do not use the normal wait-loop pattern. See 140 * the comment with set_special_state(). 
141 */ 142 #define is_special_task_state(state) \ 143 ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) 144 145 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 146 # define debug_normal_state_change(state_value) \ 147 do { \ 148 WARN_ON_ONCE(is_special_task_state(state_value)); \ 149 current->task_state_change = _THIS_IP_; \ 150 } while (0) 151 152 # define debug_special_state_change(state_value) \ 153 do { \ 154 WARN_ON_ONCE(!is_special_task_state(state_value)); \ 155 current->task_state_change = _THIS_IP_; \ 156 } while (0) 157 158 # define debug_rtlock_wait_set_state() \ 159 do { \ 160 current->saved_state_change = current->task_state_change;\ 161 current->task_state_change = _THIS_IP_; \ 162 } while (0) 163 164 # define debug_rtlock_wait_restore_state() \ 165 do { \ 166 current->task_state_change = current->saved_state_change;\ 167 } while (0) 168 169 #else 170 # define debug_normal_state_change(cond) do { } while (0) 171 # define debug_special_state_change(cond) do { } while (0) 172 # define debug_rtlock_wait_set_state() do { } while (0) 173 # define debug_rtlock_wait_restore_state() do { } while (0) 174 #endif 175 176 /* 177 * set_current_state() includes a barrier so that the write of current->__state 178 * is correctly serialised wrt the caller's subsequent test of whether to 179 * actually sleep: 180 * 181 * for (;;) { 182 * set_current_state(TASK_UNINTERRUPTIBLE); 183 * if (CONDITION) 184 * break; 185 * 186 * schedule(); 187 * } 188 * __set_current_state(TASK_RUNNING); 189 * 190 * If the caller does not need such serialisation (because, for instance, the 191 * CONDITION test and condition change and wakeup are under the same lock) then 192 * use __set_current_state(). 193 * 194 * The above is typically ordered against the wakeup, which does: 195 * 196 * CONDITION = 1; 197 * wake_up_state(p, TASK_UNINTERRUPTIBLE); 198 * 199 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before 200 * accessing p->__state. 
201 * 202 * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, 203 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a 204 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). 205 * 206 * However, with slightly different timing the wakeup TASK_RUNNING store can 207 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not 208 * a problem either because that will result in one extra go around the loop 209 * and our @cond test will save the day. 210 * 211 * Also see the comments of try_to_wake_up(). 212 */ 213 #define __set_current_state(state_value) \ 214 do { \ 215 debug_normal_state_change((state_value)); \ 216 WRITE_ONCE(current->__state, (state_value)); \ 217 } while (0) 218 219 #define set_current_state(state_value) \ 220 do { \ 221 debug_normal_state_change((state_value)); \ 222 smp_store_mb(current->__state, (state_value)); \ 223 } while (0) 224 225 /* 226 * set_special_state() should be used for those states when the blocking task 227 * can not use the regular condition based wait-loop. In that case we must 228 * serialize against wakeups such that any possible in-flight TASK_RUNNING 229 * stores will not collide with our state change. 230 */ 231 #define set_special_state(state_value) \ 232 do { \ 233 unsigned long flags; /* may shadow */ \ 234 \ 235 raw_spin_lock_irqsave(¤t->pi_lock, flags); \ 236 debug_special_state_change((state_value)); \ 237 WRITE_ONCE(current->__state, (state_value)); \ 238 raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ 239 } while (0) 240 241 /* 242 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks 243 * 244 * RT's spin/rwlock substitutions are state preserving. The state of the 245 * task when blocking on the lock is saved in task_struct::saved_state and 246 * restored after the lock has been acquired. These operations are 247 * serialized by task_struct::pi_lock against try_to_wake_up(). 
Any non RT 248 * lock related wakeups while the task is blocked on the lock are 249 * redirected to operate on task_struct::saved_state to ensure that these 250 * are not dropped. On restore task_struct::saved_state is set to 251 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. 252 * 253 * The lock operation looks like this: 254 * 255 * current_save_and_set_rtlock_wait_state(); 256 * for (;;) { 257 * if (try_lock()) 258 * break; 259 * raw_spin_unlock_irq(&lock->wait_lock); 260 * schedule_rtlock(); 261 * raw_spin_lock_irq(&lock->wait_lock); 262 * set_current_state(TASK_RTLOCK_WAIT); 263 * } 264 * current_restore_rtlock_saved_state(); 265 */ 266 #define current_save_and_set_rtlock_wait_state() \ 267 do { \ 268 lockdep_assert_irqs_disabled(); \ 269 raw_spin_lock(¤t->pi_lock); \ 270 current->saved_state = current->__state; \ 271 debug_rtlock_wait_set_state(); \ 272 WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ 273 raw_spin_unlock(¤t->pi_lock); \ 274 } while (0); 275 276 #define current_restore_rtlock_saved_state() \ 277 do { \ 278 lockdep_assert_irqs_disabled(); \ 279 raw_spin_lock(¤t->pi_lock); \ 280 debug_rtlock_wait_restore_state(); \ 281 WRITE_ONCE(current->__state, current->saved_state); \ 282 current->saved_state = TASK_RUNNING; \ 283 raw_spin_unlock(¤t->pi_lock); \ 284 } while (0); 285 286 #define get_current_state() READ_ONCE(current->__state) 287 288 /* 289 * Define the task command name length as enum, then it can be visible to 290 * BPF programs. 
 */
enum {
	TASK_COMM_LEN = 16,
};

extern void scheduler_tick(void);

/* Largest value representable in the 'long timeout' arguments below. */
#define	MAX_SCHEDULE_TIMEOUT		LONG_MAX

/*
 * Timeout-taking schedule variants.
 * NOTE(review): the suffixes presumably name the task state used while
 * waiting (interruptible, killable, ...) — confirm against the definitions,
 * which are not part of this header.
 */
extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
#ifdef CONFIG_PREEMPT_RT
 /* Used by the rtlock wait loop; only declared on PREEMPT_RT. */
 extern void schedule_rtlock(void);
#endif

/*
 * io_schedule_prepare() returns a token (which callers must not ignore, see
 * __must_check) that is later passed to io_schedule_finish().
 */
extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 *
 * The structure is empty when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is set.
 */
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	u64				utime;
	u64				stime;
	raw_spinlock_t			lock;
#endif
};

enum vtime_state {
	/* Task is sleeping or running in a CPU with VTIME inactive: */
	VTIME_INACTIVE = 0,
	/* Task is idle */
	VTIME_IDLE,
	/* Task runs in kernelspace in a CPU with VTIME active: */
	VTIME_SYS,
	/* Task runs in userspace in a CPU with VTIME active: */
	VTIME_USER,
	/* Task runs as guest in a CPU with VTIME active: */
	VTIME_GUEST,
};

/*
 * Per-task vtime bookkeeping (embedded in task_struct under
 * CONFIG_VIRT_CPU_ACCOUNTING_GEN).
 * NOTE(review): presumably @seqcount guards the remaining fields for
 * lockless readers — confirm against the users of this struct.
 */
struct vtime {
	seqcount_t		seqcount;
	unsigned long long	starttime;
	enum vtime_state	state;
	unsigned int		cpu;
	u64			utime;
	u64			stime;
	u64			gtime;
};

/*
 * Utilization clamp constraints.
 * @UCLAMP_MIN:	Minimum utilization
 * @UCLAMP_MAX:	Maximum utilization
 * @UCLAMP_CNT:	Utilization clamp constraints count
 *
 * UCLAMP_CNT is also used as the array length for the per-task clamp arrays
 * (task_struct::uclamp_req[] and task_struct::uclamp[]).
 */
enum uclamp_id {
	UCLAMP_MIN = 0,
	UCLAMP_MAX,
	UCLAMP_CNT
};

#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif

struct sched_param {
	int sched_priority;
};

/* All members are compiled out unless CONFIG_SCHED_INFO is set. */
struct sched_info {
#ifdef CONFIG_SCHED_INFO
	/* Cumulative counters: */

	/* # of times we have run on this CPU: */
	unsigned long			pcount;

	/* Time spent waiting on a runqueue: */
	unsigned long long		run_delay;

	/* Timestamps: */

	/* When did we last run on a CPU? */
	unsigned long long		last_arrival;

	/* When were we last queued to run? */
	unsigned long long		last_queued;

#endif /* CONFIG_SCHED_INFO */
};

/*
 * Integer metrics need fixed point arithmetic, e.g., sched/fair
 * has a few: load, load_avg, util_avg, freq, and capacity.
 *
 * We define a basic fixed point arithmetic range, and then formalize
 * all these metrics based on that basic range.
 *
 * With a shift of 10 the scale is 1L << 10, i.e. 1024.
 */
# define SCHED_FIXEDPOINT_SHIFT		10
# define SCHED_FIXEDPOINT_SCALE		(1L << SCHED_FIXEDPOINT_SHIFT)

/* Increase resolution of cpu_capacity calculations */
# define SCHED_CAPACITY_SHIFT		SCHED_FIXEDPOINT_SHIFT
# define SCHED_CAPACITY_SCALE		(1L << SCHED_CAPACITY_SHIFT)

/*
 * NOTE(review): presumably @inv_weight caches a precomputed inverse of
 * @weight to avoid divisions — confirm in kernel/sched/.
 */
struct load_weight {
	unsigned long			weight;
	u32				inv_weight;
};

/*
 * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *   load_avg = runnable% * scale_load_down(load)
 *
 * [runnable_avg definition]
 *
 *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
 *
 * where runnable% is the time ratio that a sched_entity is runnable and
 * running% the time ratio that a sched_entity is running.
 *
 * For cfs_rq, they are the aggregated values of all runnable and blocked
 * sched_entities.
 *
 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 * for computing those signals (see update_rq_clock_pelt())
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 * with the highest load (=88761), always runnable on a single cfs_rq,
 * and should not overflow as the number already hits PID_MAX_LIMIT.
 *
 * For all other cases (including 32-bit kernels), struct load_weight's
 * weight will overflow first before we do, because:
 *
 *    Max(load_avg) <= Max(load.weight)
 *
 * Then it is the load_weight's responsibility to consider overflow
 * issues.
 *
 * The structure is cacheline aligned (____cacheline_aligned).
 */
struct sched_avg {
	u64				last_update_time;
	u64				load_sum;
	u64				runnable_sum;
	u32				util_sum;
	u32				period_contrib;
	unsigned long			load_avg;
	unsigned long			runnable_avg;
	unsigned long			util_avg;
	unsigned int			util_est;
} ____cacheline_aligned;

/*
 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 * updates.
When a task is dequeued, its util_est should not be updated if its
 * util_avg has not been updated in the meantime.
 * This information is mapped into the MSB bit of util_est at dequeue time.
 * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
 * it is safe to use MSB.
 *
 * UTIL_AVG_UNCHANGED is bit 31 (0x80000000).
 */
#define UTIL_EST_WEIGHT_SHIFT		2
#define UTIL_AVG_UNCHANGED		0x80000000

/*
 * Scheduler statistics counters. Every member is compiled out unless
 * CONFIG_SCHEDSTATS is set; the struct itself stays cacheline aligned.
 */
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
	u64				wait_start;
	u64				wait_max;
	u64				wait_count;
	u64				wait_sum;
	u64				iowait_count;
	u64				iowait_sum;

	u64				sleep_start;
	u64				sleep_max;
	s64				sum_sleep_runtime;

	u64				block_start;
	u64				block_max;
	s64				sum_block_runtime;

	s64				exec_max;
	u64				slice_max;

	u64				nr_migrations_cold;
	u64				nr_failed_migrations_affine;
	u64				nr_failed_migrations_running;
	u64				nr_failed_migrations_hot;
	u64				nr_forced_migrations;

	u64				nr_wakeups;
	u64				nr_wakeups_sync;
	u64				nr_wakeups_migrate;
	u64				nr_wakeups_local;
	u64				nr_wakeups_remote;
	u64				nr_wakeups_affine;
	u64				nr_wakeups_affine_attempts;
	u64				nr_wakeups_passive;
	u64				nr_wakeups_idle;

#ifdef CONFIG_SCHED_CORE
	/* Only present with both CONFIG_SCHEDSTATS and CONFIG_SCHED_CORE. */
	u64				core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

/*
 * Scheduling entity embedded in task_struct as 'se'.
 * NOTE(review): presumably the fair-class (CFS) entity, given the cfs_rq
 * pointers below — confirm.
 */
struct sched_entity {
	/* For load-balancing: */
	struct load_weight		load;
	struct rb_node			run_node;
	u64				deadline;
	u64				min_vruntime;

	struct list_head		group_node;
	unsigned int			on_rq;

	u64				exec_start;
	u64				sum_exec_runtime;
	u64				prev_sum_exec_runtime;
	u64				vruntime;
	s64				vlag;
	u64				slice;

	u64				nr_migrations;

#ifdef CONFIG_FAIR_GROUP_SCHED
	int				depth;
	struct sched_entity		*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq			*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq			*my_q;
	/* cached value of my_q->h_nr_running */
	unsigned long			runnable_weight;
#endif

#ifdef CONFIG_SMP
	/*
	 * Per entity load average tracking.
	 *
	 * Put into separate cache line so it does not
	 * collide with read-mostly values above.
	 */
	struct sched_avg		avg;
#endif
};

/*
 * Scheduling entity embedded in task_struct as 'rt'. The member order may
 * be randomized at build time (__randomize_layout).
 */
struct sched_rt_entity {
	struct list_head		run_list;
	unsigned long			timeout;
	unsigned long			watchdog_stamp;
	unsigned int			time_slice;
	unsigned short			on_rq;
	unsigned short			on_list;

	struct sched_rt_entity		*back;
#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity		*parent;
	/* rq on which this entity is (to be) queued: */
	struct rt_rq			*rt_rq;
	/* rq "owned" by this entity/group: */
	struct rt_rq			*my_q;
#endif
} __randomize_layout;

/*
 * Callback types for sched_dl_entity::server_has_tasks and
 * sched_dl_entity::server_pick respectively.
 */
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);

struct sched_dl_entity {
	struct rb_node			rb_node;

	/*
	 * Original scheduling parameters. Copied here from sched_attr
	 * during sched_setattr(), they will remain the same until
	 * the next sched_setattr().
	 */
	u64				dl_runtime;	/* Maximum runtime for each instance	*/
	u64				dl_deadline;	/* Relative deadline of each instance	*/
	u64				dl_period;	/* Separation of two instances (period) */
	u64				dl_bw;		/* dl_runtime / dl_period		*/
	u64				dl_density;	/* dl_runtime / dl_deadline		*/

	/*
	 * Actual scheduling parameters. Initialized with the values above,
	 * they are continuously updated during task execution. Note that
	 * the remaining runtime could be < 0 in case we are in overrun.
	 */
	s64				runtime;	/* Remaining runtime for this instance	*/
	u64				deadline;	/* Absolute deadline for this instance	*/
	unsigned int			flags;		/* Specifying the scheduler behaviour	*/

	/*
	 * Some bool flags:
	 *
	 * @dl_throttled tells if we exhausted the runtime.
If so, the
	 * task has to wait for a replenishment to be performed at the
	 * next firing of dl_timer.
	 *
	 * @dl_yielded tells if task gave up the CPU before consuming
	 * all its available runtime during the last job.
	 *
	 * @dl_non_contending tells if the task is inactive while still
	 * contributing to the active utilization. In other words, it
	 * indicates if the inactive timer has been armed and its handler
	 * has not been executed yet. This flag is useful to avoid race
	 * conditions between the inactive timer handler and the wakeup
	 * code.
	 *
	 * @dl_overrun tells if the task asked to be informed about runtime
	 * overruns.
	 *
	 * @dl_server is not described above.
	 * NOTE(review): presumably it marks this entity as a DL server (see
	 * the @rq / @server_has_tasks / @server_pick members below) — confirm.
	 */
	unsigned int			dl_throttled      : 1;
	unsigned int			dl_yielded        : 1;
	unsigned int			dl_non_contending : 1;
	unsigned int			dl_overrun	  : 1;
	unsigned int			dl_server         : 1;

	/*
	 * Bandwidth enforcement timer. Each -deadline task has its
	 * own bandwidth to be enforced, thus we need one timer per task.
	 */
	struct hrtimer			dl_timer;

	/*
	 * Inactive timer, responsible for decreasing the active utilization
	 * at the "0-lag time". When a -deadline task blocks, it contributes
	 * to GRUB's active utilization until the "0-lag time", hence a
	 * timer is needed to decrease the active utilization at the correct
	 * time.
	 */
	struct hrtimer			inactive_timer;

	/*
	 * Bits for DL-server functionality. Also see the comment near
	 * dl_server_update().
	 *
	 * @rq the runqueue this server is for
	 *
	 * @server_has_tasks() returns true if @server_pick returns a
	 * runnable task.
	 */
	struct rq			*rq;
	dl_server_has_tasks_f		server_has_tasks;
	dl_server_pick_f		server_pick;

#ifdef CONFIG_RT_MUTEXES
	/*
	 * Priority Inheritance. When a DEADLINE scheduling entity is boosted
	 * pi_se points to the donor, otherwise points to the dl_se it belongs
	 * to (the original one/itself).
	 */
	struct sched_dl_entity *pi_se;
#endif
};

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value:		clamp value "assigned" to a se
 * @bucket_id:		bucket index corresponding to the "assigned" value
 * @active:		the se is currently refcounted in a rq's bucket
 * @user_defined:	the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions from
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This allows to know a task is refcounted in the rq's bucket corresponding
 * to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This allows to relax default clamps when a less
 * restrictive task-specific value has been requested, thus allowing to
 * implement a "nice" semantic. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 *
 * The bitfield widths of @value and @bucket_id are derived with bits_per()
 * from SCHED_CAPACITY_SCALE and UCLAMP_BUCKETS.
 */
struct uclamp_se {
	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
	unsigned int active		: 1;
	unsigned int user_defined	: 1;
};
#endif /* CONFIG_UCLAMP_TASK */

/*
 * Four u8 flags (.b) overlaid with a single u32 (.s), so that all of them
 * can be read or written in one access.
 */
union rcu_special {
	struct {
		u8			blocked;
		u8			need_qs;
		u8			exp_hint; /* Hint for performance. */
		u8			need_mb; /* Readers need smp_mb(). */
	} b; /* Bits. */
	u32 s; /* Set of bits.
*/ 720 }; 721 722 enum perf_event_task_context { 723 perf_invalid_context = -1, 724 perf_hw_context = 0, 725 perf_sw_context, 726 perf_nr_task_contexts, 727 }; 728 729 struct wake_q_node { 730 struct wake_q_node *next; 731 }; 732 733 struct kmap_ctrl { 734 #ifdef CONFIG_KMAP_LOCAL 735 int idx; 736 pte_t pteval[KM_MAX_IDX]; 737 #endif 738 }; 739 740 struct task_struct { 741 #ifdef CONFIG_THREAD_INFO_IN_TASK 742 /* 743 * For reasons of header soup (see current_thread_info()), this 744 * must be the first element of task_struct. 745 */ 746 struct thread_info thread_info; 747 #endif 748 unsigned int __state; 749 750 /* saved state for "spinlock sleepers" */ 751 unsigned int saved_state; 752 753 /* 754 * This begins the randomizable portion of task_struct. Only 755 * scheduling-critical items should be added above here. 756 */ 757 randomized_struct_fields_start 758 759 void *stack; 760 refcount_t usage; 761 /* Per task flags (PF_*), defined further below: */ 762 unsigned int flags; 763 unsigned int ptrace; 764 765 #ifdef CONFIG_SMP 766 int on_cpu; 767 struct __call_single_node wake_entry; 768 unsigned int wakee_flips; 769 unsigned long wakee_flip_decay_ts; 770 struct task_struct *last_wakee; 771 772 /* 773 * recent_used_cpu is initially set as the last CPU used by a task 774 * that wakes affine another task. Waker/wakee relationships can 775 * push tasks around a CPU where each wakeup moves to the next one. 776 * Tracking a recently used CPU allows a quick search for a recently 777 * used CPU that may be idle. 
778 */ 779 int recent_used_cpu; 780 int wake_cpu; 781 #endif 782 int on_rq; 783 784 int prio; 785 int static_prio; 786 int normal_prio; 787 unsigned int rt_priority; 788 789 struct sched_entity se; 790 struct sched_rt_entity rt; 791 struct sched_dl_entity dl; 792 struct sched_dl_entity *dl_server; 793 const struct sched_class *sched_class; 794 795 #ifdef CONFIG_SCHED_CORE 796 struct rb_node core_node; 797 unsigned long core_cookie; 798 unsigned int core_occupation; 799 #endif 800 801 #ifdef CONFIG_CGROUP_SCHED 802 struct task_group *sched_task_group; 803 #endif 804 805 #ifdef CONFIG_UCLAMP_TASK 806 /* 807 * Clamp values requested for a scheduling entity. 808 * Must be updated with task_rq_lock() held. 809 */ 810 struct uclamp_se uclamp_req[UCLAMP_CNT]; 811 /* 812 * Effective clamp values used for a scheduling entity. 813 * Must be updated with task_rq_lock() held. 814 */ 815 struct uclamp_se uclamp[UCLAMP_CNT]; 816 #endif 817 818 struct sched_statistics stats; 819 820 #ifdef CONFIG_PREEMPT_NOTIFIERS 821 /* List of struct preempt_notifier: */ 822 struct hlist_head preempt_notifiers; 823 #endif 824 825 #ifdef CONFIG_BLK_DEV_IO_TRACE 826 unsigned int btrace_seq; 827 #endif 828 829 unsigned int policy; 830 int nr_cpus_allowed; 831 const cpumask_t *cpus_ptr; 832 cpumask_t *user_cpus_ptr; 833 cpumask_t cpus_mask; 834 void *migration_pending; 835 #ifdef CONFIG_SMP 836 unsigned short migration_disabled; 837 #endif 838 unsigned short migration_flags; 839 840 #ifdef CONFIG_PREEMPT_RCU 841 int rcu_read_lock_nesting; 842 union rcu_special rcu_read_unlock_special; 843 struct list_head rcu_node_entry; 844 struct rcu_node *rcu_blocked_node; 845 #endif /* #ifdef CONFIG_PREEMPT_RCU */ 846 847 #ifdef CONFIG_TASKS_RCU 848 unsigned long rcu_tasks_nvcsw; 849 u8 rcu_tasks_holdout; 850 u8 rcu_tasks_idx; 851 int rcu_tasks_idle_cpu; 852 struct list_head rcu_tasks_holdout_list; 853 #endif /* #ifdef CONFIG_TASKS_RCU */ 854 855 #ifdef CONFIG_TASKS_TRACE_RCU 856 int trc_reader_nesting; 857 int 
trc_ipi_to_cpu; 858 union rcu_special trc_reader_special; 859 struct list_head trc_holdout_list; 860 struct list_head trc_blkd_node; 861 int trc_blkd_cpu; 862 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ 863 864 struct sched_info sched_info; 865 866 struct list_head tasks; 867 #ifdef CONFIG_SMP 868 struct plist_node pushable_tasks; 869 struct rb_node pushable_dl_tasks; 870 #endif 871 872 struct mm_struct *mm; 873 struct mm_struct *active_mm; 874 struct address_space *faults_disabled_mapping; 875 876 int exit_state; 877 int exit_code; 878 int exit_signal; 879 /* The signal sent when the parent dies: */ 880 int pdeath_signal; 881 /* JOBCTL_*, siglock protected: */ 882 unsigned long jobctl; 883 884 /* Used for emulating ABI behavior of previous Linux versions: */ 885 unsigned int personality; 886 887 /* Scheduler bits, serialized by scheduler locks: */ 888 unsigned sched_reset_on_fork:1; 889 unsigned sched_contributes_to_load:1; 890 unsigned sched_migrated:1; 891 892 /* Force alignment to the next boundary: */ 893 unsigned :0; 894 895 /* Unserialized, strictly 'current' */ 896 897 /* 898 * This field must not be in the scheduler word above due to wakelist 899 * queueing no longer being serialized by p->on_cpu. However: 900 * 901 * p->XXX = X; ttwu() 902 * schedule() if (p->on_rq && ..) // false 903 * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true 904 * deactivate_task() ttwu_queue_wakelist()) 905 * p->on_rq = 0; p->sched_remote_wakeup = Y; 906 * 907 * guarantees all stores of 'current' are visible before 908 * ->sched_remote_wakeup gets used, so it can be in this word. 
909 */ 910 unsigned sched_remote_wakeup:1; 911 #ifdef CONFIG_RT_MUTEXES 912 unsigned sched_rt_mutex:1; 913 #endif 914 915 /* Bit to tell LSMs we're in execve(): */ 916 unsigned in_execve:1; 917 unsigned in_iowait:1; 918 #ifndef TIF_RESTORE_SIGMASK 919 unsigned restore_sigmask:1; 920 #endif 921 #ifdef CONFIG_MEMCG 922 unsigned in_user_fault:1; 923 #endif 924 #ifdef CONFIG_LRU_GEN 925 /* whether the LRU algorithm may apply to this access */ 926 unsigned in_lru_fault:1; 927 #endif 928 #ifdef CONFIG_COMPAT_BRK 929 unsigned brk_randomized:1; 930 #endif 931 #ifdef CONFIG_CGROUPS 932 /* disallow userland-initiated cgroup migration */ 933 unsigned no_cgroup_migration:1; 934 /* task is frozen/stopped (used by the cgroup freezer) */ 935 unsigned frozen:1; 936 #endif 937 #ifdef CONFIG_BLK_CGROUP 938 unsigned use_memdelay:1; 939 #endif 940 #ifdef CONFIG_PSI 941 /* Stalled due to lack of memory */ 942 unsigned in_memstall:1; 943 #endif 944 #ifdef CONFIG_PAGE_OWNER 945 /* Used by page_owner=on to detect recursion in page tracking. */ 946 unsigned in_page_owner:1; 947 #endif 948 #ifdef CONFIG_EVENTFD 949 /* Recursion prevention for eventfd_signal() */ 950 unsigned in_eventfd:1; 951 #endif 952 #ifdef CONFIG_IOMMU_SVA 953 unsigned pasid_activated:1; 954 #endif 955 #ifdef CONFIG_CPU_SUP_INTEL 956 unsigned reported_split_lock:1; 957 #endif 958 #ifdef CONFIG_TASK_DELAY_ACCT 959 /* delay due to memory thrashing */ 960 unsigned in_thrashing:1; 961 #endif 962 963 unsigned long atomic_flags; /* Flags requiring atomic access. */ 964 965 struct restart_block restart_block; 966 967 pid_t pid; 968 pid_t tgid; 969 970 #ifdef CONFIG_STACKPROTECTOR 971 /* Canary value for the -fstack-protector GCC feature: */ 972 unsigned long stack_canary; 973 #endif 974 /* 975 * Pointers to the (original) parent process, youngest child, younger sibling, 976 * older sibling, respectively. 
(p->father can be replaced with 977 * p->real_parent->pid) 978 */ 979 980 /* Real parent process: */ 981 struct task_struct __rcu *real_parent; 982 983 /* Recipient of SIGCHLD, wait4() reports: */ 984 struct task_struct __rcu *parent; 985 986 /* 987 * Children/sibling form the list of natural children: 988 */ 989 struct list_head children; 990 struct list_head sibling; 991 struct task_struct *group_leader; 992 993 /* 994 * 'ptraced' is the list of tasks this task is using ptrace() on. 995 * 996 * This includes both natural children and PTRACE_ATTACH targets. 997 * 'ptrace_entry' is this task's link on the p->parent->ptraced list. 998 */ 999 struct list_head ptraced; 1000 struct list_head ptrace_entry; 1001 1002 /* PID/PID hash table linkage. */ 1003 struct pid *thread_pid; 1004 struct hlist_node pid_links[PIDTYPE_MAX]; 1005 struct list_head thread_node; 1006 1007 struct completion *vfork_done; 1008 1009 /* CLONE_CHILD_SETTID: */ 1010 int __user *set_child_tid; 1011 1012 /* CLONE_CHILD_CLEARTID: */ 1013 int __user *clear_child_tid; 1014 1015 /* PF_KTHREAD | PF_IO_WORKER */ 1016 void *worker_private; 1017 1018 u64 utime; 1019 u64 stime; 1020 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME 1021 u64 utimescaled; 1022 u64 stimescaled; 1023 #endif 1024 u64 gtime; 1025 struct prev_cputime prev_cputime; 1026 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1027 struct vtime vtime; 1028 #endif 1029 1030 #ifdef CONFIG_NO_HZ_FULL 1031 atomic_t tick_dep_mask; 1032 #endif 1033 /* Context switch counts: */ 1034 unsigned long nvcsw; 1035 unsigned long nivcsw; 1036 1037 /* Monotonic time in nsecs: */ 1038 u64 start_time; 1039 1040 /* Boot based time in nsecs: */ 1041 u64 start_boottime; 1042 1043 /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ 1044 unsigned long min_flt; 1045 unsigned long maj_flt; 1046 1047 /* Empty if CONFIG_POSIX_CPUTIMERS=n */ 1048 struct posix_cputimers posix_cputimers; 1049 1050 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK 1051 
struct posix_cputimers_work posix_cputimers_work; 1052 #endif 1053 1054 /* Process credentials: */ 1055 1056 /* Tracer's credentials at attach: */ 1057 const struct cred __rcu *ptracer_cred; 1058 1059 /* Objective and real subjective task credentials (COW): */ 1060 const struct cred __rcu *real_cred; 1061 1062 /* Effective (overridable) subjective task credentials (COW): */ 1063 const struct cred __rcu *cred; 1064 1065 #ifdef CONFIG_KEYS 1066 /* Cached requested key. */ 1067 struct key *cached_requested_key; 1068 #endif 1069 1070 /* 1071 * executable name, excluding path. 1072 * 1073 * - normally initialized setup_new_exec() 1074 * - access it with [gs]et_task_comm() 1075 * - lock it with task_lock() 1076 */ 1077 char comm[TASK_COMM_LEN]; 1078 1079 struct nameidata *nameidata; 1080 1081 #ifdef CONFIG_SYSVIPC 1082 struct sysv_sem sysvsem; 1083 struct sysv_shm sysvshm; 1084 #endif 1085 #ifdef CONFIG_DETECT_HUNG_TASK 1086 unsigned long last_switch_count; 1087 unsigned long last_switch_time; 1088 #endif 1089 /* Filesystem information: */ 1090 struct fs_struct *fs; 1091 1092 /* Open file information: */ 1093 struct files_struct *files; 1094 1095 #ifdef CONFIG_IO_URING 1096 struct io_uring_task *io_uring; 1097 #endif 1098 1099 /* Namespaces: */ 1100 struct nsproxy *nsproxy; 1101 1102 /* Signal handlers: */ 1103 struct signal_struct *signal; 1104 struct sighand_struct __rcu *sighand; 1105 sigset_t blocked; 1106 sigset_t real_blocked; 1107 /* Restored if set_restore_sigmask() was used: */ 1108 sigset_t saved_sigmask; 1109 struct sigpending pending; 1110 unsigned long sas_ss_sp; 1111 size_t sas_ss_size; 1112 unsigned int sas_ss_flags; 1113 1114 struct callback_head *task_works; 1115 1116 #ifdef CONFIG_AUDIT 1117 #ifdef CONFIG_AUDITSYSCALL 1118 struct audit_context *audit_context; 1119 #endif 1120 kuid_t loginuid; 1121 unsigned int sessionid; 1122 #endif 1123 struct seccomp seccomp; 1124 struct syscall_user_dispatch syscall_dispatch; 1125 1126 /* Thread group tracking: */ 
1127 u64 parent_exec_id; 1128 u64 self_exec_id; 1129 1130 /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ 1131 spinlock_t alloc_lock; 1132 1133 /* Protection of the PI data structures: */ 1134 raw_spinlock_t pi_lock; 1135 1136 struct wake_q_node wake_q; 1137 1138 #ifdef CONFIG_RT_MUTEXES 1139 /* PI waiters blocked on a rt_mutex held by this task: */ 1140 struct rb_root_cached pi_waiters; 1141 /* Updated under owner's pi_lock and rq lock */ 1142 struct task_struct *pi_top_task; 1143 /* Deadlock detection and priority inheritance handling: */ 1144 struct rt_mutex_waiter *pi_blocked_on; 1145 #endif 1146 1147 #ifdef CONFIG_DEBUG_MUTEXES 1148 /* Mutex deadlock detection: */ 1149 struct mutex_waiter *blocked_on; 1150 #endif 1151 1152 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 1153 int non_block_count; 1154 #endif 1155 1156 #ifdef CONFIG_TRACE_IRQFLAGS 1157 struct irqtrace_events irqtrace; 1158 unsigned int hardirq_threaded; 1159 u64 hardirq_chain_key; 1160 int softirqs_enabled; 1161 int softirq_context; 1162 int irq_config; 1163 #endif 1164 #ifdef CONFIG_PREEMPT_RT 1165 int softirq_disable_cnt; 1166 #endif 1167 1168 #ifdef CONFIG_LOCKDEP 1169 # define MAX_LOCK_DEPTH 48UL 1170 u64 curr_chain_key; 1171 int lockdep_depth; 1172 unsigned int lockdep_recursion; 1173 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1174 #endif 1175 1176 #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) 1177 unsigned int in_ubsan; 1178 #endif 1179 1180 /* Journalling filesystem info: */ 1181 void *journal_info; 1182 1183 /* Stacked block device info: */ 1184 struct bio_list *bio_list; 1185 1186 /* Stack plugging: */ 1187 struct blk_plug *plug; 1188 1189 /* VM state: */ 1190 struct reclaim_state *reclaim_state; 1191 1192 struct io_context *io_context; 1193 1194 #ifdef CONFIG_COMPACTION 1195 struct capture_control *capture_control; 1196 #endif 1197 /* Ptrace state: */ 1198 unsigned long ptrace_message; 1199 kernel_siginfo_t *last_siginfo; 1200 1201 struct 
task_io_accounting ioac; 1202 #ifdef CONFIG_PSI 1203 /* Pressure stall state */ 1204 unsigned int psi_flags; 1205 #endif 1206 #ifdef CONFIG_TASK_XACCT 1207 /* Accumulated RSS usage: */ 1208 u64 acct_rss_mem1; 1209 /* Accumulated virtual memory usage: */ 1210 u64 acct_vm_mem1; 1211 /* stime + utime since last update: */ 1212 u64 acct_timexpd; 1213 #endif 1214 #ifdef CONFIG_CPUSETS 1215 /* Protected by ->alloc_lock: */ 1216 nodemask_t mems_allowed; 1217 /* Sequence number to catch updates: */ 1218 seqcount_spinlock_t mems_allowed_seq; 1219 int cpuset_mem_spread_rotor; 1220 int cpuset_slab_spread_rotor; 1221 #endif 1222 #ifdef CONFIG_CGROUPS 1223 /* Control Group info protected by css_set_lock: */ 1224 struct css_set __rcu *cgroups; 1225 /* cg_list protected by css_set_lock and tsk->alloc_lock: */ 1226 struct list_head cg_list; 1227 #endif 1228 #ifdef CONFIG_X86_CPU_RESCTRL 1229 u32 closid; 1230 u32 rmid; 1231 #endif 1232 #ifdef CONFIG_FUTEX 1233 struct robust_list_head __user *robust_list; 1234 #ifdef CONFIG_COMPAT 1235 struct compat_robust_list_head __user *compat_robust_list; 1236 #endif 1237 struct list_head pi_state_list; 1238 struct futex_pi_state *pi_state_cache; 1239 struct mutex futex_exit_mutex; 1240 unsigned int futex_state; 1241 #endif 1242 #ifdef CONFIG_PERF_EVENTS 1243 struct perf_event_context *perf_event_ctxp; 1244 struct mutex perf_event_mutex; 1245 struct list_head perf_event_list; 1246 #endif 1247 #ifdef CONFIG_DEBUG_PREEMPT 1248 unsigned long preempt_disable_ip; 1249 #endif 1250 #ifdef CONFIG_NUMA 1251 /* Protected by alloc_lock: */ 1252 struct mempolicy *mempolicy; 1253 short il_prev; 1254 short pref_node_fork; 1255 #endif 1256 #ifdef CONFIG_NUMA_BALANCING 1257 int numa_scan_seq; 1258 unsigned int numa_scan_period; 1259 unsigned int numa_scan_period_max; 1260 int numa_preferred_nid; 1261 unsigned long numa_migrate_retry; 1262 /* Migration stamp: */ 1263 u64 node_stamp; 1264 u64 last_task_numa_placement; 1265 u64 last_sum_exec_runtime; 1266 struct 
callback_head numa_work; 1267 1268 /* 1269 * This pointer is only modified for current in syscall and 1270 * pagefault context (and for tasks being destroyed), so it can be read 1271 * from any of the following contexts: 1272 * - RCU read-side critical section 1273 * - current->numa_group from everywhere 1274 * - task's runqueue locked, task not running 1275 */ 1276 struct numa_group __rcu *numa_group; 1277 1278 /* 1279 * numa_faults is an array split into four regions: 1280 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer 1281 * in this precise order. 1282 * 1283 * faults_memory: Exponential decaying average of faults on a per-node 1284 * basis. Scheduling placement decisions are made based on these 1285 * counts. The values remain static for the duration of a PTE scan. 1286 * faults_cpu: Track the nodes the process was running on when a NUMA 1287 * hinting fault was incurred. 1288 * faults_memory_buffer and faults_cpu_buffer: Record faults per node 1289 * during the current scan window. When the scan completes, the counts 1290 * in faults_memory and faults_cpu decay and these values are copied. 1291 */ 1292 unsigned long *numa_faults; 1293 unsigned long total_numa_faults; 1294 1295 /* 1296 * numa_faults_locality tracks if faults recorded during the last 1297 * scan window were remote/local or failed to migrate. The task scan 1298 * period is adapted based on the locality of the faults with different 1299 * weights depending on whether they were shared or private faults 1300 */ 1301 unsigned long numa_faults_locality[3]; 1302 1303 unsigned long numa_pages_migrated; 1304 #endif /* CONFIG_NUMA_BALANCING */ 1305 1306 #ifdef CONFIG_RSEQ 1307 struct rseq __user *rseq; 1308 u32 rseq_len; 1309 u32 rseq_sig; 1310 /* 1311 * RmW on rseq_event_mask must be performed atomically 1312 * with respect to preemption. 
1313 */ 1314 unsigned long rseq_event_mask; 1315 #endif 1316 1317 #ifdef CONFIG_SCHED_MM_CID 1318 int mm_cid; /* Current cid in mm */ 1319 int last_mm_cid; /* Most recent cid in mm */ 1320 int migrate_from_cpu; 1321 int mm_cid_active; /* Whether cid bitmap is active */ 1322 struct callback_head cid_work; 1323 #endif 1324 1325 struct tlbflush_unmap_batch tlb_ubc; 1326 1327 /* Cache last used pipe for splice(): */ 1328 struct pipe_inode_info *splice_pipe; 1329 1330 struct page_frag task_frag; 1331 1332 #ifdef CONFIG_TASK_DELAY_ACCT 1333 struct task_delay_info *delays; 1334 #endif 1335 1336 #ifdef CONFIG_FAULT_INJECTION 1337 int make_it_fail; 1338 unsigned int fail_nth; 1339 #endif 1340 /* 1341 * When (nr_dirtied >= nr_dirtied_pause), it's time to call 1342 * balance_dirty_pages() for a dirty throttling pause: 1343 */ 1344 int nr_dirtied; 1345 int nr_dirtied_pause; 1346 /* Start of a write-and-pause period: */ 1347 unsigned long dirty_paused_when; 1348 1349 #ifdef CONFIG_LATENCYTOP 1350 int latency_record_count; 1351 struct latency_record latency_record[LT_SAVECOUNT]; 1352 #endif 1353 /* 1354 * Time slack values; these are used to round up poll() and 1355 * select() etc timeout values. These are in nanoseconds. 
1356 */ 1357 u64 timer_slack_ns; 1358 u64 default_timer_slack_ns; 1359 1360 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) 1361 unsigned int kasan_depth; 1362 #endif 1363 1364 #ifdef CONFIG_KCSAN 1365 struct kcsan_ctx kcsan_ctx; 1366 #ifdef CONFIG_TRACE_IRQFLAGS 1367 struct irqtrace_events kcsan_save_irqtrace; 1368 #endif 1369 #ifdef CONFIG_KCSAN_WEAK_MEMORY 1370 int kcsan_stack_depth; 1371 #endif 1372 #endif 1373 1374 #ifdef CONFIG_KMSAN 1375 struct kmsan_ctx kmsan_ctx; 1376 #endif 1377 1378 #if IS_ENABLED(CONFIG_KUNIT) 1379 struct kunit *kunit_test; 1380 #endif 1381 1382 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1383 /* Index of current stored address in ret_stack: */ 1384 int curr_ret_stack; 1385 int curr_ret_depth; 1386 1387 /* Stack of return addresses for return function tracing: */ 1388 struct ftrace_ret_stack *ret_stack; 1389 1390 /* Timestamp for last schedule: */ 1391 unsigned long long ftrace_timestamp; 1392 1393 /* 1394 * Number of functions that haven't been traced 1395 * because of depth overrun: 1396 */ 1397 atomic_t trace_overrun; 1398 1399 /* Pause tracing: */ 1400 atomic_t tracing_graph_pause; 1401 #endif 1402 1403 #ifdef CONFIG_TRACING 1404 /* Bitmask and counter of trace recursion: */ 1405 unsigned long trace_recursion; 1406 #endif /* CONFIG_TRACING */ 1407 1408 #ifdef CONFIG_KCOV 1409 /* See kernel/kcov.c for more details. 
*/ 1410 1411 /* Coverage collection mode enabled for this task (0 if disabled): */ 1412 unsigned int kcov_mode; 1413 1414 /* Size of the kcov_area: */ 1415 unsigned int kcov_size; 1416 1417 /* Buffer for coverage collection: */ 1418 void *kcov_area; 1419 1420 /* KCOV descriptor wired with this task or NULL: */ 1421 struct kcov *kcov; 1422 1423 /* KCOV common handle for remote coverage collection: */ 1424 u64 kcov_handle; 1425 1426 /* KCOV sequence number: */ 1427 int kcov_sequence; 1428 1429 /* Collect coverage from softirq context: */ 1430 unsigned int kcov_softirq; 1431 #endif 1432 1433 #ifdef CONFIG_MEMCG 1434 struct mem_cgroup *memcg_in_oom; 1435 gfp_t memcg_oom_gfp_mask; 1436 int memcg_oom_order; 1437 1438 /* Number of pages to reclaim on returning to userland: */ 1439 unsigned int memcg_nr_pages_over_high; 1440 1441 /* Used by memcontrol for targeted memcg charge: */ 1442 struct mem_cgroup *active_memcg; 1443 #endif 1444 1445 #ifdef CONFIG_MEMCG_KMEM 1446 struct obj_cgroup *objcg; 1447 #endif 1448 1449 #ifdef CONFIG_BLK_CGROUP 1450 struct gendisk *throttle_disk; 1451 #endif 1452 1453 #ifdef CONFIG_UPROBES 1454 struct uprobe_task *utask; 1455 #endif 1456 #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) 1457 unsigned int sequential_io; 1458 unsigned int sequential_io_avg; 1459 #endif 1460 struct kmap_ctrl kmap_ctrl; 1461 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 1462 unsigned long task_state_change; 1463 # ifdef CONFIG_PREEMPT_RT 1464 unsigned long saved_state_change; 1465 # endif 1466 #endif 1467 struct rcu_head rcu; 1468 refcount_t rcu_users; 1469 int pagefault_disabled; 1470 #ifdef CONFIG_MMU 1471 struct task_struct *oom_reaper_list; 1472 struct timer_list oom_reaper_timer; 1473 #endif 1474 #ifdef CONFIG_VMAP_STACK 1475 struct vm_struct *stack_vm_area; 1476 #endif 1477 #ifdef CONFIG_THREAD_INFO_IN_TASK 1478 /* A live task holds one reference: */ 1479 refcount_t stack_refcount; 1480 #endif 1481 #ifdef CONFIG_LIVEPATCH 1482 int patch_state; 1483 #endif 1484 
#ifdef CONFIG_SECURITY 1485 /* Used by LSM modules for access restriction: */ 1486 void *security; 1487 #endif 1488 #ifdef CONFIG_BPF_SYSCALL 1489 /* Used by BPF task local storage */ 1490 struct bpf_local_storage __rcu *bpf_storage; 1491 /* Used for BPF run context */ 1492 struct bpf_run_ctx *bpf_ctx; 1493 #endif 1494 1495 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK 1496 unsigned long lowest_stack; 1497 unsigned long prev_lowest_stack; 1498 #endif 1499 1500 #ifdef CONFIG_X86_MCE 1501 void __user *mce_vaddr; 1502 __u64 mce_kflags; 1503 u64 mce_addr; 1504 __u64 mce_ripv : 1, 1505 mce_whole_page : 1, 1506 __mce_reserved : 62; 1507 struct callback_head mce_kill_me; 1508 int mce_count; 1509 #endif 1510 1511 #ifdef CONFIG_KRETPROBES 1512 struct llist_head kretprobe_instances; 1513 #endif 1514 #ifdef CONFIG_RETHOOK 1515 struct llist_head rethooks; 1516 #endif 1517 1518 #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH 1519 /* 1520 * If L1D flush is supported on mm context switch 1521 * then we use this callback head to queue kill work 1522 * to kill tasks that are not running on SMT disabled 1523 * cores 1524 */ 1525 struct callback_head l1d_flush_kill; 1526 #endif 1527 1528 #ifdef CONFIG_RV 1529 /* 1530 * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS. 1531 * If we find justification for more monitors, we can think 1532 * about adding more or developing a dynamic method. So far, 1533 * none of these are justified. 1534 */ 1535 union rv_task_monitor rv[RV_PER_TASK_MONITORS]; 1536 #endif 1537 1538 #ifdef CONFIG_USER_EVENTS 1539 struct user_event_mm *user_event_mm; 1540 #endif 1541 1542 /* 1543 * New fields for task_struct should be added above here, so that 1544 * they are included in the randomized portion of task_struct. 1545 */ 1546 randomized_struct_fields_end 1547 1548 /* CPU-specific state of this task: */ 1549 struct thread_struct thread; 1550 1551 /* 1552 * WARNING: on x86, 'thread_struct' contains a variable-sized 1553 * structure. 
It *MUST* be at the end of 'task_struct'. 1554 * 1555 * Do not put anything below here! 1556 */ 1557 }; 1558 1559 static inline struct pid *task_pid(struct task_struct *task) 1560 { 1561 return task->thread_pid; 1562 } 1563 1564 /* 1565 * the helpers to get the task's different pids as they are seen 1566 * from various namespaces 1567 * 1568 * task_xid_nr() : global id, i.e. the id seen from the init namespace; 1569 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of 1570 * current. 1571 * task_xid_nr_ns() : id seen from the ns specified; 1572 * 1573 * see also pid_nr() etc in include/linux/pid.h 1574 */ 1575 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); 1576 1577 static inline pid_t task_pid_nr(struct task_struct *tsk) 1578 { 1579 return tsk->pid; 1580 } 1581 1582 static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 1583 { 1584 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); 1585 } 1586 1587 static inline pid_t task_pid_vnr(struct task_struct *tsk) 1588 { 1589 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); 1590 } 1591 1592 1593 static inline pid_t task_tgid_nr(struct task_struct *tsk) 1594 { 1595 return tsk->tgid; 1596 } 1597 1598 /** 1599 * pid_alive - check that a task structure is not stale 1600 * @p: Task structure to be checked. 1601 * 1602 * Test if a process is not yet dead (at most zombie state) 1603 * If pid_alive fails, then pointers within the task structure 1604 * can be stale and must not be dereferenced. 1605 * 1606 * Return: 1 if the process is alive. 0 otherwise. 
1607 */ 1608 static inline int pid_alive(const struct task_struct *p) 1609 { 1610 return p->thread_pid != NULL; 1611 } 1612 1613 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 1614 { 1615 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); 1616 } 1617 1618 static inline pid_t task_pgrp_vnr(struct task_struct *tsk) 1619 { 1620 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); 1621 } 1622 1623 1624 static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 1625 { 1626 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); 1627 } 1628 1629 static inline pid_t task_session_vnr(struct task_struct *tsk) 1630 { 1631 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); 1632 } 1633 1634 static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 1635 { 1636 return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); 1637 } 1638 1639 static inline pid_t task_tgid_vnr(struct task_struct *tsk) 1640 { 1641 return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); 1642 } 1643 1644 static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) 1645 { 1646 pid_t pid = 0; 1647 1648 rcu_read_lock(); 1649 if (pid_alive(tsk)) 1650 pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); 1651 rcu_read_unlock(); 1652 1653 return pid; 1654 } 1655 1656 static inline pid_t task_ppid_nr(const struct task_struct *tsk) 1657 { 1658 return task_ppid_nr_ns(tsk, &init_pid_ns); 1659 } 1660 1661 /* Obsolete, do not use: */ 1662 static inline pid_t task_pgrp_nr(struct task_struct *tsk) 1663 { 1664 return task_pgrp_nr_ns(tsk, &init_pid_ns); 1665 } 1666 1667 #define TASK_REPORT_IDLE (TASK_REPORT + 1) 1668 #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) 1669 1670 static inline unsigned int __task_state_index(unsigned int tsk_state, 1671 unsigned int tsk_exit_state) 1672 { 1673 unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; 1674 1675 BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); 1676 1677 
if ((tsk_state & TASK_IDLE) == TASK_IDLE) 1678 state = TASK_REPORT_IDLE; 1679 1680 /* 1681 * We're lying here, but rather than expose a completely new task state 1682 * to userspace, we can make this appear as if the task has gone through 1683 * a regular rt_mutex_lock() call. 1684 */ 1685 if (tsk_state & TASK_RTLOCK_WAIT) 1686 state = TASK_UNINTERRUPTIBLE; 1687 1688 return fls(state); 1689 } 1690 1691 static inline unsigned int task_state_index(struct task_struct *tsk) 1692 { 1693 return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); 1694 } 1695 1696 static inline char task_index_to_char(unsigned int state) 1697 { 1698 static const char state_char[] = "RSDTtXZPI"; 1699 1700 BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); 1701 1702 return state_char[state]; 1703 } 1704 1705 static inline char task_state_to_char(struct task_struct *tsk) 1706 { 1707 return task_index_to_char(task_state_index(tsk)); 1708 } 1709 1710 /** 1711 * is_global_init - check if a task structure is init. Since init 1712 * is free to have sub-threads we need to check tgid. 1713 * @tsk: Task structure to be checked. 1714 * 1715 * Check if a task structure is the first user space task the kernel created. 1716 * 1717 * Return: 1 if the task structure is init. 0 otherwise. 
1718 */ 1719 static inline int is_global_init(struct task_struct *tsk) 1720 { 1721 return task_tgid_nr(tsk) == 1; 1722 } 1723 1724 extern struct pid *cad_pid; 1725 1726 /* 1727 * Per process flags 1728 */ 1729 #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ 1730 #define PF_IDLE 0x00000002 /* I am an IDLE thread */ 1731 #define PF_EXITING 0x00000004 /* Getting shut down */ 1732 #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ 1733 #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ 1734 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 1735 #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ 1736 #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ 1737 #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ 1738 #define PF_DUMPCORE 0x00000200 /* Dumped core */ 1739 #define PF_SIGNALED 0x00000400 /* Killed by a signal */ 1740 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1741 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ 1742 #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ 1743 #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ 1744 #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ 1745 #define PF__HOLE__00010000 0x00010000 1746 #define PF_KSWAPD 0x00020000 /* I am kswapd */ 1747 #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ 1748 #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ 1749 #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, 1750 * I am cleaning dirty pages from some other bdi. 
*/ 1751 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1752 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ 1753 #define PF__HOLE__00800000 0x00800000 1754 #define PF__HOLE__01000000 0x01000000 1755 #define PF__HOLE__02000000 0x02000000 1756 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ 1757 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1758 #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ 1759 #define PF__HOLE__20000000 0x20000000 1760 #define PF__HOLE__40000000 0x40000000 1761 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ 1762 1763 /* 1764 * Only the _current_ task can read/write to tsk->flags, but other 1765 * tasks can access tsk->flags in readonly mode for example 1766 * with tsk_used_math (like during threaded core dumping). 1767 * There is however an exception to this rule during ptrace 1768 * or during fork: the ptracer task is allowed to write to the 1769 * child->flags of its traced child (same goes for fork, the parent 1770 * can write to the child->flags), because we're guaranteed the 1771 * child is not running and in turn not changing child->flags 1772 * at the same time the parent does it. 1773 */ 1774 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) 1775 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) 1776 #define clear_used_math() clear_stopped_child_used_math(current) 1777 #define set_used_math() set_stopped_child_used_math(current) 1778 1779 #define conditional_stopped_child_used_math(condition, child) \ 1780 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? 
PF_USED_MATH : 0; } while (0) 1781 1782 #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) 1783 1784 #define copy_to_stopped_child_used_math(child) \ 1785 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) 1786 1787 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ 1788 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1789 #define used_math() tsk_used_math(current) 1790 1791 static __always_inline bool is_percpu_thread(void) 1792 { 1793 #ifdef CONFIG_SMP 1794 return (current->flags & PF_NO_SETAFFINITY) && 1795 (current->nr_cpus_allowed == 1); 1796 #else 1797 return true; 1798 #endif 1799 } 1800 1801 /* Per-process atomic flags. */ 1802 #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ 1803 #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ 1804 #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ 1805 #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ 1806 #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ 1807 #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ 1808 #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ 1809 #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ 1810 1811 #define TASK_PFA_TEST(name, func) \ 1812 static inline bool task_##func(struct task_struct *p) \ 1813 { return test_bit(PFA_##name, &p->atomic_flags); } 1814 1815 #define TASK_PFA_SET(name, func) \ 1816 static inline void task_set_##func(struct task_struct *p) \ 1817 { set_bit(PFA_##name, &p->atomic_flags); } 1818 1819 #define TASK_PFA_CLEAR(name, func) \ 1820 static inline void task_clear_##func(struct task_struct *p) \ 1821 { clear_bit(PFA_##name, &p->atomic_flags); } 1822 1823 TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) 1824 TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) 1825 1826 
TASK_PFA_TEST(SPREAD_PAGE, spread_page) 1827 TASK_PFA_SET(SPREAD_PAGE, spread_page) 1828 TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) 1829 1830 TASK_PFA_TEST(SPREAD_SLAB, spread_slab) 1831 TASK_PFA_SET(SPREAD_SLAB, spread_slab) 1832 TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) 1833 1834 TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) 1835 TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) 1836 TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) 1837 1838 TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) 1839 TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) 1840 TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) 1841 1842 TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) 1843 TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) 1844 1845 TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) 1846 TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) 1847 TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) 1848 1849 TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) 1850 TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) 1851 1852 static inline void 1853 current_restore_flags(unsigned long orig_flags, unsigned long flags) 1854 { 1855 current->flags &= ~flags; 1856 current->flags |= orig_flags & flags; 1857 } 1858 1859 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); 1860 extern int task_can_attach(struct task_struct *p); 1861 extern int dl_bw_alloc(int cpu, u64 dl_bw); 1862 extern void dl_bw_free(int cpu, u64 dl_bw); 1863 #ifdef CONFIG_SMP 1864 1865 /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ 1866 extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); 1867 1868 /** 1869 * set_cpus_allowed_ptr - set CPU affinity mask of a task 1870 * @p: the task 1871 * @new_mask: CPU affinity mask 1872 * 1873 * Return: zero if successful, or a negative error code 1874 */ 1875 extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); 1876 
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); 1877 extern void release_user_cpus_ptr(struct task_struct *p); 1878 extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); 1879 extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); 1880 extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); 1881 #else 1882 static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1883 { 1884 } 1885 static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 1886 { 1887 if (!cpumask_test_cpu(0, new_mask)) 1888 return -EINVAL; 1889 return 0; 1890 } 1891 static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) 1892 { 1893 if (src->user_cpus_ptr) 1894 return -EINVAL; 1895 return 0; 1896 } 1897 static inline void release_user_cpus_ptr(struct task_struct *p) 1898 { 1899 WARN_ON(p->user_cpus_ptr); 1900 } 1901 1902 static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) 1903 { 1904 return 0; 1905 } 1906 #endif 1907 1908 extern int yield_to(struct task_struct *p, bool preempt); 1909 extern void set_user_nice(struct task_struct *p, long nice); 1910 extern int task_prio(const struct task_struct *p); 1911 1912 /** 1913 * task_nice - return the nice value of a given task. 1914 * @p: the task in question. 1915 * 1916 * Return: The nice value [ -20 ... 0 ... 19 ]. 
1917 */ 1918 static inline int task_nice(const struct task_struct *p) 1919 { 1920 return PRIO_TO_NICE((p)->static_prio); 1921 } 1922 1923 extern int can_nice(const struct task_struct *p, const int nice); 1924 extern int task_curr(const struct task_struct *p); 1925 extern int idle_cpu(int cpu); 1926 extern int available_idle_cpu(int cpu); 1927 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); 1928 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); 1929 extern void sched_set_fifo(struct task_struct *p); 1930 extern void sched_set_fifo_low(struct task_struct *p); 1931 extern void sched_set_normal(struct task_struct *p, int nice); 1932 extern int sched_setattr(struct task_struct *, const struct sched_attr *); 1933 extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); 1934 extern struct task_struct *idle_task(int cpu); 1935 1936 /** 1937 * is_idle_task - is the specified task an idle task? 1938 * @p: the task in question. 1939 * 1940 * Return: 1 if @p is an idle task. 0 otherwise. 
1941 */ 1942 static __always_inline bool is_idle_task(const struct task_struct *p) 1943 { 1944 return !!(p->flags & PF_IDLE); 1945 } 1946 1947 extern struct task_struct *curr_task(int cpu); 1948 extern void ia64_set_curr_task(int cpu, struct task_struct *p); 1949 1950 void yield(void); 1951 1952 union thread_union { 1953 struct task_struct task; 1954 #ifndef CONFIG_THREAD_INFO_IN_TASK 1955 struct thread_info thread_info; 1956 #endif 1957 unsigned long stack[THREAD_SIZE/sizeof(long)]; 1958 }; 1959 1960 #ifndef CONFIG_THREAD_INFO_IN_TASK 1961 extern struct thread_info init_thread_info; 1962 #endif 1963 1964 extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; 1965 1966 #ifdef CONFIG_THREAD_INFO_IN_TASK 1967 # define task_thread_info(task) (&(task)->thread_info) 1968 #elif !defined(__HAVE_THREAD_FUNCTIONS) 1969 # define task_thread_info(task) ((struct thread_info *)(task)->stack) 1970 #endif 1971 1972 /* 1973 * find a task by one of its numerical ids 1974 * 1975 * find_task_by_pid_ns(): 1976 * finds a task by its pid in the specified namespace 1977 * find_task_by_vpid(): 1978 * finds a task by its virtual pid 1979 * 1980 * see also find_vpid() etc in include/linux/pid.h 1981 */ 1982 1983 extern struct task_struct *find_task_by_vpid(pid_t nr); 1984 extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); 1985 1986 /* 1987 * find a task by its virtual pid and get the task struct 1988 */ 1989 extern struct task_struct *find_get_task_by_vpid(pid_t nr); 1990 1991 extern int wake_up_state(struct task_struct *tsk, unsigned int state); 1992 extern int wake_up_process(struct task_struct *tsk); 1993 extern void wake_up_new_task(struct task_struct *tsk); 1994 1995 #ifdef CONFIG_SMP 1996 extern void kick_process(struct task_struct *tsk); 1997 #else 1998 static inline void kick_process(struct task_struct *tsk) { } 1999 #endif 2000 2001 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); 2002 2003 static 
inline void set_task_comm(struct task_struct *tsk, const char *from) 2004 { 2005 __set_task_comm(tsk, from, false); 2006 } 2007 2008 extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); 2009 #define get_task_comm(buf, tsk) ({ \ 2010 BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ 2011 __get_task_comm(buf, sizeof(buf), tsk); \ 2012 }) 2013 2014 #ifdef CONFIG_SMP 2015 static __always_inline void scheduler_ipi(void) 2016 { 2017 /* 2018 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 2019 * TIF_NEED_RESCHED remotely (for the first time) will also send 2020 * this IPI. 2021 */ 2022 preempt_fold_need_resched(); 2023 } 2024 #else 2025 static inline void scheduler_ipi(void) { } 2026 #endif 2027 2028 extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); 2029 2030 /* 2031 * Set thread flags in other task's structures. 2032 * See asm/thread_info.h for TIF_xxxx flags available: 2033 */ 2034 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) 2035 { 2036 set_ti_thread_flag(task_thread_info(tsk), flag); 2037 } 2038 2039 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2040 { 2041 clear_ti_thread_flag(task_thread_info(tsk), flag); 2042 } 2043 2044 static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, 2045 bool value) 2046 { 2047 update_ti_thread_flag(task_thread_info(tsk), flag, value); 2048 } 2049 2050 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) 2051 { 2052 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); 2053 } 2054 2055 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2056 { 2057 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); 2058 } 2059 2060 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) 2061 { 2062 return test_ti_thread_flag(task_thread_info(tsk), flag); 2063 } 2064 2065 static inline void 
set_tsk_need_resched(struct task_struct *tsk) 2066 { 2067 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2068 } 2069 2070 static inline void clear_tsk_need_resched(struct task_struct *tsk) 2071 { 2072 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2073 } 2074 2075 static inline int test_tsk_need_resched(struct task_struct *tsk) 2076 { 2077 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); 2078 } 2079 2080 /* 2081 * cond_resched() and cond_resched_lock(): latency reduction via 2082 * explicit rescheduling in places that are safe. The return 2083 * value indicates whether a reschedule was done in fact. 2084 * cond_resched_lock() will drop the spinlock before scheduling, 2085 */ 2086 #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) 2087 extern int __cond_resched(void); 2088 2089 #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 2090 2091 void sched_dynamic_klp_enable(void); 2092 void sched_dynamic_klp_disable(void); 2093 2094 DECLARE_STATIC_CALL(cond_resched, __cond_resched); 2095 2096 static __always_inline int _cond_resched(void) 2097 { 2098 return static_call_mod(cond_resched)(); 2099 } 2100 2101 #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 2102 2103 extern int dynamic_cond_resched(void); 2104 2105 static __always_inline int _cond_resched(void) 2106 { 2107 return dynamic_cond_resched(); 2108 } 2109 2110 #else /* !CONFIG_PREEMPTION */ 2111 2112 static inline int _cond_resched(void) 2113 { 2114 klp_sched_try_switch(); 2115 return __cond_resched(); 2116 } 2117 2118 #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 2119 2120 #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ 2121 2122 static inline int _cond_resched(void) 2123 { 2124 klp_sched_try_switch(); 2125 return 0; 2126 } 2127 2128 #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ 2129 2130 #define cond_resched() ({ \ 2131 __might_resched(__FILE__, __LINE__, 0); \ 2132 _cond_resched(); \ 2133 
}) 2134 2135 extern int __cond_resched_lock(spinlock_t *lock); 2136 extern int __cond_resched_rwlock_read(rwlock_t *lock); 2137 extern int __cond_resched_rwlock_write(rwlock_t *lock); 2138 2139 #define MIGHT_RESCHED_RCU_SHIFT 8 2140 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) 2141 2142 #ifndef CONFIG_PREEMPT_RT 2143 /* 2144 * Non RT kernels have an elevated preempt count due to the held lock, 2145 * but are not allowed to be inside a RCU read side critical section 2146 */ 2147 # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET 2148 #else 2149 /* 2150 * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in 2151 * cond_resched*lock() has to take that into account because it checks for 2152 * preempt_count() and rcu_preempt_depth(). 2153 */ 2154 # define PREEMPT_LOCK_RESCHED_OFFSETS \ 2155 (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) 2156 #endif 2157 2158 #define cond_resched_lock(lock) ({ \ 2159 __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ 2160 __cond_resched_lock(lock); \ 2161 }) 2162 2163 #define cond_resched_rwlock_read(lock) ({ \ 2164 __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ 2165 __cond_resched_rwlock_read(lock); \ 2166 }) 2167 2168 #define cond_resched_rwlock_write(lock) ({ \ 2169 __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ 2170 __cond_resched_rwlock_write(lock); \ 2171 }) 2172 2173 static inline void cond_resched_rcu(void) 2174 { 2175 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) 2176 rcu_read_unlock(); 2177 cond_resched(); 2178 rcu_read_lock(); 2179 #endif 2180 } 2181 2182 #ifdef CONFIG_PREEMPT_DYNAMIC 2183 2184 extern bool preempt_model_none(void); 2185 extern bool preempt_model_voluntary(void); 2186 extern bool preempt_model_full(void); 2187 2188 #else 2189 2190 static inline bool preempt_model_none(void) 2191 { 2192 return IS_ENABLED(CONFIG_PREEMPT_NONE); 2193 } 2194 static inline bool 
preempt_model_voluntary(void) 2195 { 2196 return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); 2197 } 2198 static inline bool preempt_model_full(void) 2199 { 2200 return IS_ENABLED(CONFIG_PREEMPT); 2201 } 2202 2203 #endif 2204 2205 static inline bool preempt_model_rt(void) 2206 { 2207 return IS_ENABLED(CONFIG_PREEMPT_RT); 2208 } 2209 2210 /* 2211 * Does the preemption model allow non-cooperative preemption? 2212 * 2213 * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with 2214 * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the 2215 * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the 2216 * PREEMPT_NONE model. 2217 */ 2218 static inline bool preempt_model_preemptible(void) 2219 { 2220 return preempt_model_full() || preempt_model_rt(); 2221 } 2222 2223 /* 2224 * Does a critical section need to be broken due to another 2225 * task waiting?: (technically does not depend on CONFIG_PREEMPTION, 2226 * but a general need for low latency) 2227 */ 2228 static inline int spin_needbreak(spinlock_t *lock) 2229 { 2230 #ifdef CONFIG_PREEMPTION 2231 return spin_is_contended(lock); 2232 #else 2233 return 0; 2234 #endif 2235 } 2236 2237 /* 2238 * Check if a rwlock is contended. 2239 * Returns non-zero if there is another task waiting on the rwlock. 2240 * Returns zero if the lock is not contended or the system / underlying 2241 * rwlock implementation does not support contention detection. 2242 * Technically does not depend on CONFIG_PREEMPTION, but a general need 2243 * for low latency. 2244 */ 2245 static inline int rwlock_needbreak(rwlock_t *lock) 2246 { 2247 #ifdef CONFIG_PREEMPTION 2248 return rwlock_is_contended(lock); 2249 #else 2250 return 0; 2251 #endif 2252 } 2253 2254 static __always_inline bool need_resched(void) 2255 { 2256 return unlikely(tif_need_resched()); 2257 } 2258 2259 /* 2260 * Wrappers for p->thread_info->cpu access. No-op on UP. 
2261 */ 2262 #ifdef CONFIG_SMP 2263 2264 static inline unsigned int task_cpu(const struct task_struct *p) 2265 { 2266 return READ_ONCE(task_thread_info(p)->cpu); 2267 } 2268 2269 extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 2270 2271 #else 2272 2273 static inline unsigned int task_cpu(const struct task_struct *p) 2274 { 2275 return 0; 2276 } 2277 2278 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 2279 { 2280 } 2281 2282 #endif /* CONFIG_SMP */ 2283 2284 extern bool sched_task_on_rq(struct task_struct *p); 2285 extern unsigned long get_wchan(struct task_struct *p); 2286 extern struct task_struct *cpu_curr_snapshot(int cpu); 2287 2288 /* 2289 * In order to reduce various lock holder preemption latencies provide an 2290 * interface to see if a vCPU is currently running or not. 2291 * 2292 * This allows us to terminate optimistic spin loops and block, analogous to 2293 * the native optimistic spin heuristic of testing if the lock owner task is 2294 * running or not. 
2295 */ 2296 #ifndef vcpu_is_preempted 2297 static inline bool vcpu_is_preempted(int cpu) 2298 { 2299 return false; 2300 } 2301 #endif 2302 2303 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); 2304 extern long sched_getaffinity(pid_t pid, struct cpumask *mask); 2305 2306 #ifndef TASK_SIZE_OF 2307 #define TASK_SIZE_OF(tsk) TASK_SIZE 2308 #endif 2309 2310 #ifdef CONFIG_SMP 2311 static inline bool owner_on_cpu(struct task_struct *owner) 2312 { 2313 /* 2314 * As lock holder preemption issue, we both skip spinning if 2315 * task is not on cpu or its cpu is preempted 2316 */ 2317 return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); 2318 } 2319 2320 /* Returns effective CPU energy utilization, as seen by the scheduler */ 2321 unsigned long sched_cpu_util(int cpu); 2322 #endif /* CONFIG_SMP */ 2323 2324 #ifdef CONFIG_RSEQ 2325 2326 /* 2327 * Map the event mask on the user-space ABI enum rseq_cs_flags 2328 * for direct mask checks. 2329 */ 2330 enum rseq_event_mask_bits { 2331 RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, 2332 RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, 2333 RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, 2334 }; 2335 2336 enum rseq_event_mask { 2337 RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), 2338 RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), 2339 RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), 2340 }; 2341 2342 static inline void rseq_set_notify_resume(struct task_struct *t) 2343 { 2344 if (t->rseq) 2345 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 2346 } 2347 2348 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); 2349 2350 static inline void rseq_handle_notify_resume(struct ksignal *ksig, 2351 struct pt_regs *regs) 2352 { 2353 if (current->rseq) 2354 __rseq_handle_notify_resume(ksig, regs); 2355 } 2356 2357 static inline void rseq_signal_deliver(struct ksignal *ksig, 2358 struct pt_regs *regs) 2359 { 2360 
preempt_disable(); 2361 __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); 2362 preempt_enable(); 2363 rseq_handle_notify_resume(ksig, regs); 2364 } 2365 2366 /* rseq_preempt() requires preemption to be disabled. */ 2367 static inline void rseq_preempt(struct task_struct *t) 2368 { 2369 __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); 2370 rseq_set_notify_resume(t); 2371 } 2372 2373 /* rseq_migrate() requires preemption to be disabled. */ 2374 static inline void rseq_migrate(struct task_struct *t) 2375 { 2376 __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); 2377 rseq_set_notify_resume(t); 2378 } 2379 2380 /* 2381 * If parent process has a registered restartable sequences area, the 2382 * child inherits. Unregister rseq for a clone with CLONE_VM set. 2383 */ 2384 static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) 2385 { 2386 if (clone_flags & CLONE_VM) { 2387 t->rseq = NULL; 2388 t->rseq_len = 0; 2389 t->rseq_sig = 0; 2390 t->rseq_event_mask = 0; 2391 } else { 2392 t->rseq = current->rseq; 2393 t->rseq_len = current->rseq_len; 2394 t->rseq_sig = current->rseq_sig; 2395 t->rseq_event_mask = current->rseq_event_mask; 2396 } 2397 } 2398 2399 static inline void rseq_execve(struct task_struct *t) 2400 { 2401 t->rseq = NULL; 2402 t->rseq_len = 0; 2403 t->rseq_sig = 0; 2404 t->rseq_event_mask = 0; 2405 } 2406 2407 #else 2408 2409 static inline void rseq_set_notify_resume(struct task_struct *t) 2410 { 2411 } 2412 static inline void rseq_handle_notify_resume(struct ksignal *ksig, 2413 struct pt_regs *regs) 2414 { 2415 } 2416 static inline void rseq_signal_deliver(struct ksignal *ksig, 2417 struct pt_regs *regs) 2418 { 2419 } 2420 static inline void rseq_preempt(struct task_struct *t) 2421 { 2422 } 2423 static inline void rseq_migrate(struct task_struct *t) 2424 { 2425 } 2426 static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) 2427 { 2428 } 2429 static inline void rseq_execve(struct task_struct *t) 2430 
{ 2431 } 2432 2433 #endif 2434 2435 #ifdef CONFIG_DEBUG_RSEQ 2436 2437 void rseq_syscall(struct pt_regs *regs); 2438 2439 #else 2440 2441 static inline void rseq_syscall(struct pt_regs *regs) 2442 { 2443 } 2444 2445 #endif 2446 2447 #ifdef CONFIG_SCHED_CORE 2448 extern void sched_core_free(struct task_struct *tsk); 2449 extern void sched_core_fork(struct task_struct *p); 2450 extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, 2451 unsigned long uaddr); 2452 extern int sched_core_idle_cpu(int cpu); 2453 #else 2454 static inline void sched_core_free(struct task_struct *tsk) { } 2455 static inline void sched_core_fork(struct task_struct *p) { } 2456 static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } 2457 #endif 2458 2459 extern void sched_set_stop_task(int cpu, struct task_struct *stop); 2460 2461 #endif 2462