// SPDX-License-Identifier: GPL-2.0
/*
 * Kernel internal timers
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
 *
 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
 *            "A Kernel Model for Precision Timekeeping" by Dave Mills
 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *            serialize accesses to xtime/lost_ticks).
 *            Copyright (C) 1998 Andrea Arcangeli
 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
 *            Copyright (C) 2000, 2001, 2002 Ingo Molnar
 *            Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"
#include "timer_migration.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:		LVL_CLK_DIV ^ level
 * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther away the expiry time is, the higher the array level and
 * therefore the coarser the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity.
 * Those fit into the first wheel level, which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0      0         3 ms                0 ms -        210 ms
 *  1     64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2    128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3    192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4    256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5    320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6    384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7    448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0      0         4 ms                0 ms -        255 ms
 *  1     64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2    128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3    192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4    256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5    320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6    384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7    448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0      0        10 ms                0 ms -        630 ms
 *  1     64        80 ms              640 ms -       5110 ms (640ms - ~5s)
 *  2    128       640 ms             5120 ms -      40950 ms (~5s - ~40s)
 *  3    192      5120 ms (~5s)      40960 ms -     327670 ms (~40s - ~5m)
 *  4    256     40960 ms (~40s)    327680 ms -    2621430 ms (~5m - ~43m)
 *  5    320    327680 ms (~5m)    2621440 ms -   20971510 ms (~43m - ~5h)
 *  6    384   2621440 ms (~43m)  20971520 ms -  167772150 ms (~5h - ~1d)
 *  7    448  20971520 ms (~5h)  167772160 ms - 1342177270 ms (~1d - ~15d)
 */

/* Clock divisor for the next level */
#define LVL_CLK_SHIFT	3
#define LVL_CLK_DIV	(1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK	(LVL_CLK_DIV - 1)
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
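 *
 * For example, with LVL_CLK_SHIFT = 3 and LVL_BITS = 6 (LVL_SIZE = 64):
 *
 *	LVL_GRAN(2)  = 1UL << (2 * 3)      = 64 ticks
 *	LVL_START(2) = (64 - 1) << (1 * 3) = 504 ticks
 *
 * so a timer whose relative expiry is at least 504 ticks is enqueued into
 * level 2 (or above) and expires with a granularity of 64 ticks.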
 */
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/* Size of each clock level */
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_OFFS(n)	((n) * LVL_SIZE)

/* Level depth */
#if HZ > 100
# define LVL_DEPTH	9
# else
# define LVL_DEPTH	8
#endif

/* The cutoff (max. capacity of the wheel) */
#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting wheel size. If NOHZ is configured we allocate two
 * wheels so we have a separate storage for the deferrable timers.
 */
#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)

#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES	3
# define BASE_LOCAL	0
# define BASE_GLOBAL	1
# define BASE_DEF	2
#else
# define NR_BASES	1
# define BASE_LOCAL	0
# define BASE_GLOBAL	0
# define BASE_DEF	0
#endif

/**
 * struct timer_base - Per CPU timer base (number of base depends on config)
 * @lock:		Lock protecting the timer_base
 * @running_timer:	When expiring timers, the lock is dropped. To make
 *			sure not to race against deleting/modifying a
 *			currently running timer, the pointer is set to the
 *			timer, which expires at the moment. If no timer is
 *			running, the pointer is NULL.
 * @expiry_lock:	PREEMPT_RT only: Lock is taken in softirq around
 *			timer expiry callback execution and when trying to
 *			delete a running timer and it wasn't successful at
 *			the first attempt. It prevents priority inversion
 *			when callback was preempted on a remote CPU and a
 *			caller tries to delete the running timer. It also
 *			prevents a livelock, when the task which tries to
 *			delete a timer preempted the softirq thread which
 *			is running the timer callback function.
 * @timer_waiters:	PREEMPT_RT only: Tells, if there is a waiter
 *			waiting for the end of the timer callback function
 *			execution.
 * @clk:		clock of the timer base; is updated before enqueue
 *			of a timer; during expiry, it is 1 offset ahead of
 *			jiffies to avoid endless requeuing to current
 *			jiffies
 * @next_expiry:	expiry value of the first timer; it is updated when
 *			finding the next timer and during enqueue; the
 *			value is not valid, when next_expiry_recalc is set
 * @cpu:		Number of CPU the timer base belongs to
 * @next_expiry_recalc: States, whether a recalculation of next_expiry is
 *			required. Value is set true, when a timer was
 *			deleted.
 * @is_idle:		Is set, when timer_base is idle. It is triggered by
 *			NOHZ code. This state is only used in the standard
 *			base. Deferrable timers, which are enqueued remotely,
 *			never wake up an idle CPU, so there is no need to
 *			support it for this base.
 * @timers_pending:	Is set, when a timer is pending in the base. It is only
 *			reliable when next_expiry_recalc is not set.
 * @pending_map:	bitmap of the timer wheel; each bit reflects a
 *			bucket of the wheel. When a bit is set, at least a
 *			single timer is enqueued in the related bucket.
 * @vectors:		Array of lists; Each array member reflects a bucket
 *			of the timer wheel. The list contains all timers
 *			which are enqueued into a specific bucket.
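 *
 * With the default LVL_BITS = 6 and LVL_DEPTH = 9 (HZ > 100) this means
 * WHEEL_SIZE = 64 * 9 = 576 buckets, i.e. one pending_map bit and one
 * hlist_head per bucket in each per-CPU base.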
 */
struct timer_base {
	raw_spinlock_t		lock;
	struct timer_list	*running_timer;
#ifdef CONFIG_PREEMPT_RT
	spinlock_t		expiry_lock;
	atomic_t		timer_waiters;
#endif
	unsigned long		clk;
	unsigned long		next_expiry;
	unsigned int		cpu;
	bool			next_expiry_recalc;
	bool			is_idle;
	bool			timers_pending;
	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
	struct hlist_head	vectors[WHEEL_SIZE];
} ____cacheline_aligned;

static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
static DEFINE_MUTEX(timer_keys_mutex);

static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
static unsigned int sysctl_timer_migration = 1;

DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

static void timers_update_migration(void)
{
	if (sysctl_timer_migration && tick_nohz_active)
		static_branch_enable(&timers_migration_enabled);
	else
		static_branch_disable(&timers_migration_enabled);
}

#ifdef CONFIG_SYSCTL
static int timer_migration_handler(struct ctl_table *table, int write,
				   void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	mutex_lock(&timer_keys_mutex);
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!ret && write)
		timers_update_migration();
	mutex_unlock(&timer_keys_mutex);
	return ret;
}

static struct ctl_table timer_sysctl[] = {
	{
		.procname	= "timer_migration",
		.data		= &sysctl_timer_migration,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= timer_migration_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{}
};

static int __init timer_sysctl_init(void)
{
	register_sysctl("kernel", timer_sysctl);
	return 0;
}
device_initcall(timer_sysctl_init);
#endif /* CONFIG_SYSCTL */
#else /* CONFIG_SMP */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

static void timer_update_keys(struct work_struct *work)
{
	mutex_lock(&timer_keys_mutex);
	timers_update_migration();
	static_branch_enable(&timers_nohz_active);
	mutex_unlock(&timer_keys_mutex);
}

void timers_update_nohz(void)
{
	schedule_work(&timer_update_work);
}

static inline bool is_timers_nohz_active(void)
{
	return static_branch_unlikely(&timers_nohz_active);
}
#else
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

static unsigned long round_jiffies_common(unsigned long j, int cpu,
					  bool force_up)
{
	int rem;
	unsigned long original = j;

	/*
	 * We don't want all cpus firing their timers at once hitting the
	 * same lock or cachelines, so we skew each extra cpu with an extra
	 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
	 * already did this.
	 * The skew is done by adding 3*cpunr, then round, then subtract this
	 * extra offset again.
	 */
	j += cpu * 3;

	rem = j % HZ;

	/*
	 * If the target jiffy is just after a whole second (which can happen
	 * due to delays of the timer irq, long irq off times etc etc) then
	 * we should round down to the whole second, not up. Use 1/4th second
	 * as cutoff for this rounding as an extreme upper bound for this.
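	 * For example, with HZ = 1000 (and ignoring the per-CPU skew) a
	 * target of 5.180 s (rem = 180 < 250) is rounded down to 5 s,
	 * while a target of 5.6 s is rounded up to 6 s.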
	 * But never round down if @force_up is set.
	 */
	if (rem < HZ/4 && !force_up) /* round down */
		j = j - rem;
	else /* round up */
		j = j - rem + HZ;

	/* now that we have rounded, subtract the extra skew again */
	j -= cpu * 3;

	/*
	 * Make sure j is still in the future. Otherwise return the
	 * unmodified value.
	 */
	return time_is_after_jiffies(j) ? j : original;
}

/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
	return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);

/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
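 *
 * A minimal usage sketch (illustrative only; 'mydev' and its polling timer
 * are hypothetical and not part of this file):
 *
 *	mod_timer(&mydev->poll_timer, round_jiffies(jiffies + 5 * HZ));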
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
	return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
	return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);

/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, true) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
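 *
 * A typical (illustrative) use is re-arming a hypothetical housekeeping
 * work item whose exact schedule does not matter:
 *
 *	schedule_delayed_work(&mydev->stats_work,
 *			      round_jiffies_up_relative(10 * HZ));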
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
	return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);


static inline unsigned int timer_get_idx(struct timer_list *timer)
{
	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
}

static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
			idx << TIMER_ARRAYSHIFT;
}

/*
 * Helper function to calculate the array index for a given expiry
 * time.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
				  unsigned long *bucket_expiry)
{

	/*
	 * The timer wheel has to guarantee that a timer does not fire
	 * early. Early expiry can happen due to:
	 * - Timer is armed at the edge of a tick
	 * - Truncation of the expiry time in the outer wheel levels
	 *
	 * Round up with level granularity to prevent this.
	 */
	expires = (expires >> LVL_SHIFT(lvl)) + 1;
	*bucket_expiry = expires << LVL_SHIFT(lvl);
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

static int calc_wheel_index(unsigned long expires, unsigned long clk,
			    unsigned long *bucket_expiry)
{
	unsigned long delta = expires - clk;
	unsigned int idx;

	if (delta < LVL_START(1)) {
		idx = calc_index(expires, 0, bucket_expiry);
	} else if (delta < LVL_START(2)) {
		idx = calc_index(expires, 1, bucket_expiry);
	} else if (delta < LVL_START(3)) {
		idx = calc_index(expires, 2, bucket_expiry);
	} else if (delta < LVL_START(4)) {
		idx = calc_index(expires, 3, bucket_expiry);
	} else if (delta < LVL_START(5)) {
		idx = calc_index(expires, 4, bucket_expiry);
	} else if (delta < LVL_START(6)) {
		idx = calc_index(expires, 5, bucket_expiry);
	} else if (delta < LVL_START(7)) {
		idx = calc_index(expires, 6, bucket_expiry);
	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
		idx = calc_index(expires, 7, bucket_expiry);
	} else if ((long) delta < 0) {
		idx = clk & LVL_MASK;
		*bucket_expiry = clk;
	} else {
		/*
		 * Force expire obscenely large timeouts to expire at the
		 * capacity limit of the wheel.
		 */
		if (delta >= WHEEL_TIMEOUT_CUTOFF)
			expires = clk + WHEEL_TIMEOUT_MAX;

		idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
	}
	return idx;
}

static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
	/*
	 * Deferrable timers do not prevent the CPU from entering dynticks and
	 * are not taken into account on the idle/nohz_full path. An IPI when a
	 * new deferrable timer is enqueued will wake up the remote CPU but
	 * nothing will be done with the deferrable timer base. Therefore skip
	 * the remote IPI for deferrable timers completely.
	 */
	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
		return;

	/*
	 * We might have to IPI the remote CPU if the base is idle and the
	 * timer is not deferrable. If the other CPU is on the way to idle
	 * then it can't set base->is_idle as we hold the base lock:
	 */
	if (base->is_idle)
		wake_up_nohz_cpu(base->cpu);
}

/*
 * Enqueue the timer into the wheel bucket, mark it pending in
 * the bitmap, store the index in the timer flags then wake up
 * the target CPU if needed.
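 *
 * As a worked example of the index calculation feeding this path
 * (HZ = 1000): with base->clk = 1000 and timer->expires = 1300 the delta
 * is 300 ticks, which is below LVL_START(2) = 504, so the timer goes into
 * level 1. calc_index() rounds up with the 8 tick level granularity: the
 * bucket expiry becomes ((1300 >> 3) + 1) << 3 = 1304 and the returned
 * index is LVL_OFFS(1) + (163 & LVL_MASK) = 64 + 35 = 99.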
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
			  unsigned int idx, unsigned long bucket_expiry)
{

	hlist_add_head(&timer->entry, base->vectors + idx);
	__set_bit(idx, base->pending_map);
	timer_set_idx(timer, idx);

	trace_timer_start(timer, bucket_expiry);

	/*
	 * Check whether this is the new first expiring timer. The
	 * effective expiry time of the timer is required here
	 * (bucket_expiry) instead of timer->expires.
	 */
	if (time_before(bucket_expiry, base->next_expiry)) {
		/*
		 * Set the next expiry time and kick the CPU so it
		 * can reevaluate the wheel:
		 */
		base->next_expiry = bucket_expiry;
		base->timers_pending = true;
		base->next_expiry_recalc = false;
		trigger_dyntick_cpu(base, timer);
	}
}

static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
	unsigned long bucket_expiry;
	unsigned int idx;

	idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
	enqueue_timer(base, timer, idx, bucket_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static const struct debug_obj_descr timer_debug_descr;

struct timer_hint {
	void	(*function)(struct timer_list *t);
	long	offset;
};

#define TIMER_HINT(fn, container, timr, hintfn)			\
	{							\
		.function = fn,					\
		.offset	  = offsetof(container, hintfn) -	\
			    offsetof(container, timr)		\
	}

static const struct timer_hint timer_hints[] = {
	TIMER_HINT(delayed_work_timer_fn,
		   struct delayed_work, timer, work.func),
	TIMER_HINT(kthread_delayed_work_timer_fn,
		   struct kthread_delayed_work, timer, work.func),
};

static void *timer_debug_hint(void *addr)
{
	struct timer_list *timer = addr;
	int i;

	for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
		if (timer_hints[i].function == timer->function) {
			void (**fn)(void) = addr + timer_hints[i].offset;

			return *fn;
		}
	}

	return timer->function;
}

static bool timer_is_static_object(void *addr)
{
	struct timer_list *timer = addr;

	return (timer->entry.pprev == NULL &&
		timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		del_timer_sync(timer);
		debug_object_init(timer, &timer_debug_descr);
		return true;
	default:
		return false;
	}
}

/* Stub timer callback for improperly used timers. */
static void stub_timer(struct timer_list *unused)
{
	WARN_ON(1);
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_NOTAVAILABLE:
		timer_setup(timer, stub_timer, 0);
		return true;

	case ODEBUG_STATE_ACTIVE:
		WARN_ON(1);
		fallthrough;
	default:
		return false;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		del_timer_sync(timer);
		debug_object_free(timer, &timer_debug_descr);
		return true;
	default:
		return false;
	}
}

/*
 * fixup_assert_init is called when:
 * - an untracked/uninit-ed object is found
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_NOTAVAILABLE:
		timer_setup(timer, stub_timer, 0);
		return true;
	default:
		return false;
	}
}

static const struct debug_obj_descr timer_debug_descr = {
	.name			= "timer_list",
	.debug_hint		= timer_debug_hint,
	.is_static_object	= timer_is_static_object,
	.fixup_init		= timer_fixup_init,
	.fixup_activate		= timer_fixup_activate,
	.fixup_free		= timer_fixup_free,
	.fixup_assert_init	= timer_fixup_assert_init,
};

static inline void debug_timer_init(struct timer_list *timer)
{
	debug_object_init(timer, &timer_debug_descr);
}

static inline void debug_timer_activate(struct timer_list *timer)
{
	debug_object_activate(timer, &timer_debug_descr);
}

static inline void debug_timer_deactivate(struct timer_list *timer)
{
	debug_object_deactivate(timer, &timer_debug_descr);
}

static inline void debug_timer_assert_init(struct timer_list *timer)
{
	debug_object_assert_init(timer, &timer_debug_descr);
}

static void do_init_timer(struct timer_list *timer,
			  void (*func)(struct timer_list *),
			  unsigned int flags,
			  const char *name, struct lock_class_key *key);

void init_timer_on_stack_key(struct timer_list *timer,
			     void (*func)(struct timer_list *),
			     unsigned int flags,
			     const char *name, struct lock_class_key *key)
{
	debug_object_init_on_stack(timer, &timer_debug_descr);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);

void destroy_timer_on_stack(struct timer_list *timer)
{
	debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);

#else
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

static inline void debug_init(struct timer_list *timer)
{
	debug_timer_init(timer);
	trace_timer_init(timer);
}

static inline void debug_deactivate(struct timer_list *timer)
{
	debug_timer_deactivate(timer);
	trace_timer_cancel(timer);
}

static inline void debug_assert_init(struct timer_list *timer)
{
	debug_timer_assert_init(timer);
}

static void do_init_timer(struct timer_list *timer,
			  void (*func)(struct timer_list *),
			  unsigned int flags,
			  const char *name, struct lock_class_key *key)
{
	timer->entry.pprev = NULL;
	timer->function = func;
	if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
		flags &= TIMER_INIT_FLAGS;
	timer->flags = flags | raw_smp_processor_id();
	lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void init_timer_key(struct timer_list *timer,
		    void (*func)(struct timer_list *), unsigned int flags,
		    const char *name, struct lock_class_key *key)
{
	debug_init(timer);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);

static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
	struct hlist_node *entry = &timer->entry;

	debug_deactivate(timer);

	__hlist_del(entry);
	if (clear_pending)
		entry->pprev = NULL;
	entry->next = LIST_POISON2;
}

static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
			     bool clear_pending)
{
	unsigned idx = timer_get_idx(timer);

	if (!timer_pending(timer))
		return 0;

	if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
		__clear_bit(idx, base->pending_map);
		base->next_expiry_recalc = true;
	}

	detach_timer(timer, clear_pending);
	return 1;
}

static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
	struct timer_base *base;

	base = per_cpu_ptr(&timer_bases[index], cpu);

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
	return base;
}

static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
	struct timer_base *base;

	base = this_cpu_ptr(&timer_bases[index]);

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
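	 * The resulting mapping is: TIMER_PINNED selects BASE_LOCAL,
	 * otherwise BASE_GLOBAL is used, and TIMER_DEFERRABLE overrides
	 * both and selects BASE_DEF when NO_HZ_COMMON is enabled.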
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		base = this_cpu_ptr(&timer_bases[BASE_DEF]);
	return base;
}

static inline struct timer_base *get_timer_base(u32 tflags)
{
	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}

static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
	if (static_branch_likely(&timers_migration_enabled) &&
	    !(tflags & TIMER_PINNED))
		return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
	return get_timer_this_cpu_base(tflags);
}

static inline void __forward_timer_base(struct timer_base *base,
					unsigned long basej)
{
	/*
	 * Check whether we can forward the base. We can only do that when
	 * @basej is past base->clk otherwise we might rewind base->clk.
	 */
	if (time_before_eq(basej, base->clk))
		return;

	/*
	 * If the next expiry value is > jiffies, then we fast forward to
	 * jiffies otherwise we forward to the next expiry value.
	 */
	if (time_after(base->next_expiry, basej)) {
		base->clk = basej;
	} else {
		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
			return;
		base->clk = base->next_expiry;
	}

}

static inline void forward_timer_base(struct timer_base *base)
{
	__forward_timer_base(base, READ_ONCE(jiffies));
}

/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
					  unsigned long *flags)
	__acquires(timer->base->lock)
{
	for (;;) {
		struct timer_base *base;
		u32 tf;

		/*
		 * We need to use READ_ONCE() here, otherwise the compiler
		 * might re-read @tf between the check for TIMER_MIGRATING
		 * and spin_lock().
		 */
		tf = READ_ONCE(timer->flags);

		if (!(tf & TIMER_MIGRATING)) {
			base = get_timer_base(tf);
			raw_spin_lock_irqsave(&base->lock, *flags);
			if (timer->flags == tf)
				return base;
			raw_spin_unlock_irqrestore(&base->lock, *flags);
		}
		cpu_relax();
	}
}

#define MOD_TIMER_PENDING_ONLY		0x01
#define MOD_TIMER_REDUCE		0x02
#define MOD_TIMER_NOTPENDING		0x04

static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
	unsigned long clk = 0, flags, bucket_expiry;
	struct timer_base *base, *new_base;
	unsigned int idx = UINT_MAX;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * This is a common optimization triggered by the networking code - if
	 * the timer is re-modified to have the same timeout or ends up in the
	 * same array bucket then just return:
	 */
	if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
		/*
		 * The downside of this optimization is that it can result in
		 * larger granularity than you would get from adding a new
		 * timer with this expiry.
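		 * For example, a timer that was first armed far in the
		 * future sits in a coarse level; if it is later re-armed
		 * to the same expiry when that expiry is only a few ticks
		 * away, the early return below keeps the coarse bucket
		 * instead of moving it to a finer level.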
		 */
		long diff = timer->expires - expires;

		if (!diff)
			return 1;
		if (options & MOD_TIMER_REDUCE && diff <= 0)
			return 1;

		/*
		 * We lock timer base and calculate the bucket index right
		 * here. If the timer ends up in the same bucket, then we
		 * just update the expiry time and avoid the whole
		 * dequeue/enqueue dance.
		 */
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);

		if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
		    time_before_eq(timer->expires, expires)) {
			ret = 1;
			goto out_unlock;
		}

		clk = base->clk;
		idx = calc_wheel_index(expires, clk, &bucket_expiry);

		/*
		 * Retrieve and compare the array index of the pending
		 * timer. If it matches set the expiry to the new value so a
		 * subsequent call will exit in the expires check above.
		 */
		if (idx == timer_get_idx(timer)) {
			if (!(options & MOD_TIMER_REDUCE))
				timer->expires = expires;
			else if (time_after(timer->expires, expires))
				timer->expires = expires;
			ret = 1;
			goto out_unlock;
		}
	} else {
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);
	}

	ret = detach_if_pending(timer, base, false);
	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
		goto out_unlock;

	new_base = get_target_base(base, timer->flags);

	if (base != new_base) {
		/*
		 * We are trying to schedule the timer on the new base.
		 * However we can't change timer's base while it is running,
		 * otherwise timer_delete_sync() can't detect that the timer's
		 * handler has not yet finished. This also guarantees that the
		 * timer is serialized wrt itself.
		 */
		if (likely(base->running_timer != timer)) {
			/* See the comment in lock_timer_base() */
			timer->flags |= TIMER_MIGRATING;

			raw_spin_unlock(&base->lock);
			base = new_base;
			raw_spin_lock(&base->lock);
			WRITE_ONCE(timer->flags,
				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
			forward_timer_base(base);
		}
	}

	debug_timer_activate(timer);

	timer->expires = expires;
	/*
	 * If 'idx' was calculated above and the base time did not advance
	 * between calculating 'idx' and possibly switching the base, only
	 * enqueue_timer() is required. Otherwise we need to (re)calculate
	 * the wheel index via internal_add_timer().
	 */
	if (idx != UINT_MAX && clk == base->clk)
		enqueue_timer(base, timer, idx, bucket_expiry);
	else
		internal_add_timer(base, timer);

out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:	The pending timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers.
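 *
 * An illustrative (hypothetical) use is a watchdog that only pushes out an
 * already armed timeout and must not re-arm it after cancellation:
 *
 *	mod_timer_pending(&wd->timer, jiffies + wd->interval);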
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *	  shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Modify a timer's timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. If the
 * timer is inactive, the del_timer() part is a NOP. The timer is in any
 * case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires did
 *	  not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);

/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires
 *	  did not change the effective expiry time such that the
 *	  timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer:	The timer to be started
 *
 * Start @timer to expire at @timer->expires in the future. @timer->expires
 * is the absolute expiry time measured in 'jiffies'. When the timer expires
 * timer->function(timer) will be invoked from soft interrupt context.
 *
 * The @timer->expires and @timer->function fields must be set prior
 * to calling this function.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
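 *
 * A minimal usage sketch (illustrative; 'mydev' and my_timeout_fn() are
 * hypothetical and not part of this file):
 *
 *	timer_setup(&mydev->timer, my_timeout_fn, 0);
 *	mydev->timer.expires = jiffies + 2 * HZ;
 *	add_timer(&mydev->timer);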
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer:	The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is set.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	timer->flags |= TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_local);

/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer:	The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	timer->flags &= ~TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_global);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:	The timer to be started
 * @cpu:	The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. If the timer should not remain pinned for
 * the next round, use add_timer_global() instead, as it clears the
 * TIMER_PINNED flag.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
	struct timer_base *new_base, *base;
	unsigned long flags;

	debug_assert_init(timer);

	if (WARN_ON_ONCE(timer_pending(timer)))
		return;

	/* Make sure timer flags have TIMER_PINNED flag set */
	timer->flags |= TIMER_PINNED;

	new_base = get_timer_cpu_base(timer->flags, cpu);

	/*
	 * If @timer was on a different CPU, it should be migrated with the
	 * old base locked to prevent other operations proceeding with the
	 * wrong base locked. See lock_timer_base().
	 */
	base = lock_timer_base(timer, &flags);
	/*
	 * Has @timer been shutdown? This needs to be evaluated while
	 * holding base lock to prevent a race against the shutdown code.
	 */
	if (!timer->function)
		goto out_unlock;

	if (base != new_base) {
		timer->flags |= TIMER_MIGRATING;

		raw_spin_unlock(&base->lock);
		base = new_base;
		raw_spin_lock(&base->lock);
		WRITE_ONCE(timer->flags,
			   (timer->flags & ~TIMER_BASEMASK) | cpu);
	}
	forward_timer_base(base);

	debug_timer_activate(timer);
	internal_add_timer(base, timer);
out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer:	The timer to be deactivated
 * @shutdown:	If true, this indicates that the timer is about to be
 *		shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. In that
 * case any attempt to rearm @timer after this function returns will be
 * silently ignored.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
	struct timer_base *base;
	unsigned long flags;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * If @shutdown is set then the lock has to be taken whether the
	 * timer is pending or not to protect against a concurrent rearm
	 * which might hit between the lockless pending check and the lock
	 * acquisition. By taking the lock it is ensured that such a newly
	 * enqueued timer is dequeued and cannot end up with
	 * timer->function == NULL in the expiry code.
	 *
	 * If timer->function is currently executed, then this makes sure
	 * that the callback cannot requeue the timer.
	 */
	if (timer_pending(timer) || shutdown) {
		base = lock_timer_base(timer, &flags);
		ret = detach_if_pending(timer, base, true);
		if (shutdown)
			timer->function = NULL;
		raw_spin_unlock_irqrestore(&base->lock, flags);
	}

	return ret;
}

/**
 * timer_delete - Deactivate a timer
 * @timer:	The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * Neither does it prevent rearming of the timer. If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
	return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);

/**
 * timer_shutdown - Deactivate a timer and prevent rearming
 * @timer:	The timer to be deactivated
 *
 * The function does not wait for a possibly running timer callback on a
 * different CPU but it prevents rearming of the timer. Any attempt to arm
 * @timer after this function returns will be silently ignored.
 *
 * This function is useful for teardown code and should only be used when
 * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown(struct timer_list *timer)
{
	return __timer_delete(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown);

/**
 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
 * @timer:	Timer to deactivate
 * @shutdown:	If true, this indicates that the timer is about to be
 *		shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. Any
 * attempt to rearm @timer after this function returns will be silently
 * ignored.
 *
 * This function cannot guarantee that the timer cannot be rearmed
 * right after dropping the base lock if @shutdown is false.
 * That needs to be prevented by the calling code if necessary.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
	struct timer_base *base;
	unsigned long flags;
	int ret = -1;

	debug_assert_init(timer);

	base = lock_timer_base(timer, &flags);

	if (base->running_timer != timer)
		ret = detach_if_pending(timer, base, true);
	if (shutdown)
		timer->function = NULL;

	raw_spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}

/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer:	Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
	return __try_to_del_timer_sync(timer, false);
}
EXPORT_SYMBOL(try_to_del_timer_sync);

#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
	spin_lock_init(&base->expiry_lock);
}

static inline void timer_base_lock_expiry(struct timer_base *base)
{
	spin_lock(&base->expiry_lock);
}

static inline void timer_base_unlock_expiry(struct timer_base *base)
{
	spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 */
static void timer_sync_wait_running(struct timer_base *base)
{
	if (atomic_read(&base->timer_waiters)) {
		raw_spin_unlock_irq(&base->lock);
		spin_unlock(&base->expiry_lock);
		spin_lock(&base->expiry_lock);
		raw_spin_lock_irq(&base->lock);
	}
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a livelock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
	u32 tf;

	tf = READ_ONCE(timer->flags);
	if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
		struct timer_base *base = get_timer_base(tf);

		/*
		 * Mark the base as contended and grab the expiry lock,
		 * which is held by the softirq across the timer
		 * callback. Drop the lock immediately so the softirq can
		 * expire the next timer.
		 * In theory the timer could already be running again, but
		 * that's more than unlikely and just causes another wait
		 * loop.
		 */
		atomic_inc(&base->timer_waiters);
		spin_lock_bh(&base->expiry_lock);
		atomic_dec(&base->timer_waiters);
		spin_unlock_bh(&base->expiry_lock);
	}
}
#else
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif

/**
 * __timer_delete_sync - Internal function: Deactivate a timer and wait
 *			 for the handler to finish.
 * @timer:	The timer to be deactivated
 * @shutdown:	If true, @timer->function will be set to NULL under the
 *		timer base lock which prevents rearming of @timer
 *
 * If @shutdown is not set the timer can be rearmed later. If the timer can
 * be rearmed concurrently, i.e. after dropping the base lock, then the
 * return value is meaningless.
 *
 * If @shutdown is set then @timer->function is set to NULL under timer
 * base lock which prevents rearming of the timer. Any attempt to rearm
 * a shutdown timer is silently ignored.
 *
 * If the timer should be reused after shutdown it has to be initialized
 * again.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
	int ret;

#ifdef CONFIG_LOCKDEP
	unsigned long flags;

	/*
	 * If lockdep gives a backtrace here, please reference
	 * the synchronization rules above.
	 */
	local_irq_save(flags);
	lock_map_acquire(&timer->lockdep_map);
	lock_map_release(&timer->lockdep_map);
	local_irq_restore(flags);
#endif
	/*
	 * Don't use it in hardirq context, because it
	 * could lead to deadlock.
	 */
	WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));

	/*
	 * Must be able to sleep on PREEMPT_RT because of the slowpath in
	 * del_timer_wait_running().
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
		lockdep_assert_preemption_enabled();

	do {
		ret = __try_to_del_timer_sync(timer, shutdown);

		if (unlikely(ret < 0)) {
			del_timer_wait_running(timer);
			cpu_relax();
		}
	} while (ret < 0);

	return ret;
}

/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:	The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question.
 * Here's why:
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * If such a guarantee is needed, e.g. for teardown situations, then use
 * timer_shutdown_sync() instead.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
	return __timer_delete_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync);

/**
 * timer_shutdown_sync - Shutdown a timer and prevent rearming
 * @timer: The timer to be shutdown
 *
 * When the function returns it is guaranteed that:
 * - @timer is not queued
 * - The callback function of @timer is not running
 * - @timer cannot be enqueued again. Any attempt to rearm
 *   @timer is silently ignored.
 *
 * See timer_delete_sync() for synchronization rules.
 *
 * This function is useful for final teardown of an infrastructure where
 * the timer is subject to a circular dependency problem.
 *
 * A common pattern for this is a timer and a workqueue where the timer can
 * schedule work and work can arm the timer. On shutdown the workqueue must
 * be destroyed and the timer must be prevented from rearming. Unless the
 * code has conditionals like 'if (mything->in_shutdown)' to prevent that,
 * there is no way to get this correct with timer_delete_sync().
 *
 * timer_shutdown_sync() solves this problem. The correct ordering of
 * calls in this case is:
 *
 *	timer_shutdown_sync(&mything->timer);
 *	destroy_workqueue(mything->workqueue);
 *
 * After this 'mything' can be safely freed.
 *
 * This obviously implies that the timer is not required to be functional
 * for the rest of the shutdown operation.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown_sync(struct timer_list *timer)
{
	return __timer_delete_sync(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown_sync);

static void call_timer_fn(struct timer_list *timer,
			  void (*fn)(struct timer_list *),
			  unsigned long baseclk)
{
	int count = preempt_count();

#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the timer from inside the
	 * function that is called from it, so we need to take this
	 * into account for lockdep too. To avoid bogus "held lock
	 * freed" warnings as well as problems when looking into
	 * timer->lockdep_map, make a copy and use that here.
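	 *
	 * A callback using that (perfectly legal) pattern, i.e. tearing
	 * down and freeing the object that embeds its own timer_list,
	 * could look like this hypothetical sketch (not code from this
	 * file; 'struct mydrv' and mydrv_finish() are made up):
	 *
	 *	static void mydrv_timer_fn(struct timer_list *t)
	 *	{
	 *		struct mydrv *drv = from_timer(drv, t, timer);
	 *
	 *		mydrv_finish(drv);
	 *		kfree(drv);
	 *	}
	 *
	 * Working on a stack copy of the lockdep map keeps lockdep from
	 * poking into freed memory once fn() has returned in that case.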
1785 */ 1786 struct lockdep_map lockdep_map; 1787 1788 lockdep_copy_map(&lockdep_map, &timer->lockdep_map); 1789 #endif 1790 /* 1791 * Couple the lock chain with the lock chain at 1792 * timer_delete_sync() by acquiring the lock_map around the fn() 1793 * call here and in timer_delete_sync(). 1794 */ 1795 lock_map_acquire(&lockdep_map); 1796 1797 trace_timer_expire_entry(timer, baseclk); 1798 fn(timer); 1799 trace_timer_expire_exit(timer); 1800 1801 lock_map_release(&lockdep_map); 1802 1803 if (count != preempt_count()) { 1804 WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", 1805 fn, count, preempt_count()); 1806 /* 1807 * Restore the preempt count. That gives us a decent 1808 * chance to survive and extract information. If the 1809 * callback kept a lock held, bad luck, but not worse 1810 * than the BUG() we had. 1811 */ 1812 preempt_count_set(count); 1813 } 1814 } 1815 1816 static void expire_timers(struct timer_base *base, struct hlist_head *head) 1817 { 1818 /* 1819 * This value is required only for tracing. base->clk was 1820 * incremented directly before expire_timers was called. But expiry 1821 * is related to the old base->clk value. 1822 */ 1823 unsigned long baseclk = base->clk - 1; 1824 1825 while (!hlist_empty(head)) { 1826 struct timer_list *timer; 1827 void (*fn)(struct timer_list *); 1828 1829 timer = hlist_entry(head->first, struct timer_list, entry); 1830 1831 base->running_timer = timer; 1832 detach_timer(timer, true); 1833 1834 fn = timer->function; 1835 1836 if (WARN_ON_ONCE(!fn)) { 1837 /* Should never happen. Emphasis on should! */ 1838 base->running_timer = NULL; 1839 continue; 1840 } 1841 1842 if (timer->flags & TIMER_IRQSAFE) { 1843 raw_spin_unlock(&base->lock); 1844 call_timer_fn(timer, fn, baseclk); 1845 raw_spin_lock(&base->lock); 1846 base->running_timer = NULL; 1847 } else { 1848 raw_spin_unlock_irq(&base->lock); 1849 call_timer_fn(timer, fn, baseclk); 1850 raw_spin_lock_irq(&base->lock); 1851 base->running_timer = NULL; 1852 timer_sync_wait_running(base); 1853 } 1854 } 1855 } 1856 1857 static int collect_expired_timers(struct timer_base *base, 1858 struct hlist_head *heads) 1859 { 1860 unsigned long clk = base->clk = base->next_expiry; 1861 struct hlist_head *vec; 1862 int i, levels = 0; 1863 unsigned int idx; 1864 1865 for (i = 0; i < LVL_DEPTH; i++) { 1866 idx = (clk & LVL_MASK) + i * LVL_SIZE; 1867 1868 if (__test_and_clear_bit(idx, base->pending_map)) { 1869 vec = base->vectors + idx; 1870 hlist_move_list(vec, heads++); 1871 levels++; 1872 } 1873 /* Is it time to look at the next level? */ 1874 if (clk & LVL_CLK_MASK) 1875 break; 1876 /* Shift clock for the next level granularity */ 1877 clk >>= LVL_CLK_SHIFT; 1878 } 1879 return levels; 1880 } 1881 1882 /* 1883 * Find the next pending bucket of a level. Search from level start (@offset) 1884 * + @clk upwards and if nothing there, search from start of the level 1885 * (@offset) up to @offset + clk. 1886 */ 1887 static int next_pending_bucket(struct timer_base *base, unsigned offset, 1888 unsigned clk) 1889 { 1890 unsigned pos, start = offset + clk; 1891 unsigned end = offset + LVL_SIZE; 1892 1893 pos = find_next_bit(base->pending_map, end, start); 1894 if (pos < end) 1895 return pos - start; 1896 1897 pos = find_next_bit(base->pending_map, start, offset); 1898 return pos < start ? pos + LVL_SIZE - start : -1; 1899 } 1900 1901 /* 1902 * Search the first expiring timer in the various clock levels. Caller must 1903 * hold base->lock. 1904 * 1905 * Store next expiry time in base->next_expiry. 
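 *
 * A worked example with illustrative numbers: assume base->clk == 72 and
 * the only pending timer sits in level 1, bucket 11 (pending_map bit
 * 64 + 11). Level 0 has no bit set, so clk is shifted down to 9 and the
 * level 1 scan returns pos == 2. That gives tmp = (9 + 2) << LVL_SHIFT(1)
 * == 88, so base->next_expiry becomes 88: all timers batched into that
 * bucket fire once the wheel clock reaches jiffy 88.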
1906 */ 1907 static void next_expiry_recalc(struct timer_base *base) 1908 { 1909 unsigned long clk, next, adj; 1910 unsigned lvl, offset = 0; 1911 1912 next = base->clk + NEXT_TIMER_MAX_DELTA; 1913 clk = base->clk; 1914 for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { 1915 int pos = next_pending_bucket(base, offset, clk & LVL_MASK); 1916 unsigned long lvl_clk = clk & LVL_CLK_MASK; 1917 1918 if (pos >= 0) { 1919 unsigned long tmp = clk + (unsigned long) pos; 1920 1921 tmp <<= LVL_SHIFT(lvl); 1922 if (time_before(tmp, next)) 1923 next = tmp; 1924 1925 /* 1926 * If the next expiration happens before we reach 1927 * the next level, no need to check further. 1928 */ 1929 if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) 1930 break; 1931 } 1932 /* 1933 * Clock for the next level. If the current level clock lower 1934 * bits are zero, we look at the next level as is. If not we 1935 * need to advance it by one because that's going to be the 1936 * next expiring bucket in that level. base->clk is the next 1937 * expiring jiffie. So in case of: 1938 * 1939 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 1940 * 0 0 0 0 0 0 1941 * 1942 * we have to look at all levels @index 0. With 1943 * 1944 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 1945 * 0 0 0 0 0 2 1946 * 1947 * LVL0 has the next expiring bucket @index 2. The upper 1948 * levels have the next expiring bucket @index 1. 1949 * 1950 * In case that the propagation wraps the next level the same 1951 * rules apply: 1952 * 1953 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 1954 * 0 0 0 0 F 2 1955 * 1956 * So after looking at LVL0 we get: 1957 * 1958 * LVL5 LVL4 LVL3 LVL2 LVL1 1959 * 0 0 0 1 0 1960 * 1961 * So no propagation from LVL1 to LVL2 because that happened 1962 * with the add already, but then we need to propagate further 1963 * from LVL2 to LVL3. 1964 * 1965 * So the simple check whether the lower bits of the current 1966 * level are 0 or not is sufficient for all cases. 1967 */ 1968 adj = lvl_clk ? 1 : 0; 1969 clk >>= LVL_CLK_SHIFT; 1970 clk += adj; 1971 } 1972 1973 base->next_expiry = next; 1974 base->next_expiry_recalc = false; 1975 base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); 1976 } 1977 1978 #ifdef CONFIG_NO_HZ_COMMON 1979 /* 1980 * Check, if the next hrtimer event is before the next timer wheel 1981 * event: 1982 */ 1983 static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) 1984 { 1985 u64 nextevt = hrtimer_get_next_event(); 1986 1987 /* 1988 * If high resolution timers are enabled 1989 * hrtimer_get_next_event() returns KTIME_MAX. 1990 */ 1991 if (expires <= nextevt) 1992 return expires; 1993 1994 /* 1995 * If the next timer is already expired, return the tick base 1996 * time so the tick is fired immediately. 1997 */ 1998 if (nextevt <= basem) 1999 return basem; 2000 2001 /* 2002 * Round up to the next jiffie. High resolution timers are 2003 * off, so the hrtimers are expired in the tick and we need to 2004 * make sure that this tick really expires the timer to avoid 2005 * a ping pong of the nohz stop code. 2006 * 2007 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 2008 */ 2009 return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; 2010 } 2011 2012 static unsigned long next_timer_interrupt(struct timer_base *base, 2013 unsigned long basej) 2014 { 2015 if (base->next_expiry_recalc) 2016 next_expiry_recalc(base); 2017 2018 /* 2019 * Move next_expiry for the empty base into the future to prevent an 2020 * unnecessary raise of the timer softirq when the next_expiry value 2021 * will be reached even if there is no timer pending. 
2022 * 2023 * This update is also required to make timer_base::next_expiry values 2024 * easy comparable to find out which base holds the first pending timer. 2025 */ 2026 if (!base->timers_pending) 2027 base->next_expiry = basej + NEXT_TIMER_MAX_DELTA; 2028 2029 return base->next_expiry; 2030 } 2031 2032 static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, 2033 struct timer_base *base_local, 2034 struct timer_base *base_global, 2035 struct timer_events *tevt) 2036 { 2037 unsigned long nextevt, nextevt_local, nextevt_global; 2038 bool local_first; 2039 2040 nextevt_local = next_timer_interrupt(base_local, basej); 2041 nextevt_global = next_timer_interrupt(base_global, basej); 2042 2043 local_first = time_before_eq(nextevt_local, nextevt_global); 2044 2045 nextevt = local_first ? nextevt_local : nextevt_global; 2046 2047 /* 2048 * If the @nextevt is at max. one tick away, use @nextevt and store 2049 * it in the local expiry value. The next global event is irrelevant in 2050 * this case and can be left as KTIME_MAX. 2051 */ 2052 if (time_before_eq(nextevt, basej + 1)) { 2053 /* If we missed a tick already, force 0 delta */ 2054 if (time_before(nextevt, basej)) 2055 nextevt = basej; 2056 tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; 2057 2058 /* 2059 * This is required for the remote check only but it doesn't 2060 * hurt, when it is done for both call sites: 2061 * 2062 * * The remote callers will only take care of the global timers 2063 * as local timers will be handled by CPU itself. When not 2064 * updating tevt->global with the already missed first global 2065 * timer, it is possible that it will be missed completely. 2066 * 2067 * * The local callers will ignore the tevt->global anyway, when 2068 * nextevt is max. one tick away. 2069 */ 2070 if (!local_first) 2071 tevt->global = tevt->local; 2072 return nextevt; 2073 } 2074 2075 /* 2076 * Update tevt.* values: 2077 * 2078 * If the local queue expires first, then the global event can be 2079 * ignored. If the global queue is empty, nothing to do either. 2080 */ 2081 if (!local_first && base_global->timers_pending) 2082 tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; 2083 2084 if (base_local->timers_pending) 2085 tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; 2086 2087 return nextevt; 2088 } 2089 2090 # ifdef CONFIG_SMP 2091 /** 2092 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt 2093 * @basej: base time jiffies 2094 * @basem: base time clock monotonic 2095 * @tevt: Pointer to the storage for the expiry values 2096 * @cpu: Remote CPU 2097 * 2098 * Stores the next pending local and global timer expiry values in the 2099 * struct pointed to by @tevt. If a queue is empty the corresponding 2100 * field is set to KTIME_MAX. If local event expires before global 2101 * event, global event is set to KTIME_MAX as well. 2102 * 2103 * Caller needs to make sure timer base locks are held (use 2104 * timer_lock_remote_bases() for this purpose). 
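 *
 * A remote caller (e.g. the timer migration code) is expected to follow
 * roughly this sequence, with interrupts disabled (sketch only; the real
 * callers do additional checks in between):
 *
 *	timer_lock_remote_bases(cpu);
 *	fetch_next_timer_interrupt_remote(basej, basem, &tevt, cpu);
 *	timer_unlock_remote_bases(cpu);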
2105 */ 2106 void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, 2107 struct timer_events *tevt, 2108 unsigned int cpu) 2109 { 2110 struct timer_base *base_local, *base_global; 2111 2112 /* Preset local / global events */ 2113 tevt->local = tevt->global = KTIME_MAX; 2114 2115 base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); 2116 base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2117 2118 lockdep_assert_held(&base_local->lock); 2119 lockdep_assert_held(&base_global->lock); 2120 2121 fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); 2122 } 2123 2124 /** 2125 * timer_unlock_remote_bases - unlock timer bases of cpu 2126 * @cpu: Remote CPU 2127 * 2128 * Unlocks the remote timer bases. 2129 */ 2130 void timer_unlock_remote_bases(unsigned int cpu) 2131 __releases(timer_bases[BASE_LOCAL]->lock) 2132 __releases(timer_bases[BASE_GLOBAL]->lock) 2133 { 2134 struct timer_base *base_local, *base_global; 2135 2136 base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); 2137 base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2138 2139 raw_spin_unlock(&base_global->lock); 2140 raw_spin_unlock(&base_local->lock); 2141 } 2142 2143 /** 2144 * timer_lock_remote_bases - lock timer bases of cpu 2145 * @cpu: Remote CPU 2146 * 2147 * Locks the remote timer bases. 2148 */ 2149 void timer_lock_remote_bases(unsigned int cpu) 2150 __acquires(timer_bases[BASE_LOCAL]->lock) 2151 __acquires(timer_bases[BASE_GLOBAL]->lock) 2152 { 2153 struct timer_base *base_local, *base_global; 2154 2155 base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); 2156 base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2157 2158 lockdep_assert_irqs_disabled(); 2159 2160 raw_spin_lock(&base_local->lock); 2161 raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); 2162 } 2163 2164 /** 2165 * timer_base_is_idle() - Return whether timer base is set idle 2166 * 2167 * Returns value of local timer base is_idle value. 2168 */ 2169 bool timer_base_is_idle(void) 2170 { 2171 return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); 2172 } 2173 2174 static void __run_timer_base(struct timer_base *base); 2175 2176 /** 2177 * timer_expire_remote() - expire global timers of cpu 2178 * @cpu: Remote CPU 2179 * 2180 * Expire timers of global base of remote CPU. 2181 */ 2182 void timer_expire_remote(unsigned int cpu) 2183 { 2184 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); 2185 2186 __run_timer_base(base); 2187 } 2188 2189 static void timer_use_tmigr(unsigned long basej, u64 basem, 2190 unsigned long *nextevt, bool *tick_stop_path, 2191 bool timer_base_idle, struct timer_events *tevt) 2192 { 2193 u64 next_tmigr; 2194 2195 if (timer_base_idle) 2196 next_tmigr = tmigr_cpu_new_timer(tevt->global); 2197 else if (tick_stop_path) 2198 next_tmigr = tmigr_cpu_deactivate(tevt->global); 2199 else 2200 next_tmigr = tmigr_quick_check(tevt->global); 2201 2202 /* 2203 * If the CPU is the last going idle in timer migration hierarchy, make 2204 * sure the CPU will wake up in time to handle remote timers. 2205 * next_tmigr == KTIME_MAX if other CPUs are still active. 
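	 *
	 * For example (made up numbers): if next_tmigr lies three ticks
	 * after basem, i.e. next_tmigr == basem + 3 * TICK_NSEC, and that
	 * is earlier than tevt->local, then *nextevt becomes basej + 3 and
	 * tevt->local is pulled forward to next_tmigr, so this CPU programs
	 * its wakeup early enough to handle the remote timers.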
2206 */ 2207 if (next_tmigr < tevt->local) { 2208 u64 tmp; 2209 2210 /* If we missed a tick already, force 0 delta */ 2211 if (next_tmigr < basem) 2212 next_tmigr = basem; 2213 2214 tmp = div_u64(next_tmigr - basem, TICK_NSEC); 2215 2216 *nextevt = basej + (unsigned long)tmp; 2217 tevt->local = next_tmigr; 2218 } 2219 } 2220 # else 2221 static void timer_use_tmigr(unsigned long basej, u64 basem, 2222 unsigned long *nextevt, bool *tick_stop_path, 2223 bool timer_base_idle, struct timer_events *tevt) 2224 { 2225 /* 2226 * Make sure first event is written into tevt->local to not miss a 2227 * timer on !SMP systems. 2228 */ 2229 tevt->local = min_t(u64, tevt->local, tevt->global); 2230 } 2231 # endif /* CONFIG_SMP */ 2232 2233 static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, 2234 bool *idle) 2235 { 2236 struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; 2237 struct timer_base *base_local, *base_global; 2238 unsigned long nextevt; 2239 bool idle_is_possible; 2240 2241 /* 2242 * Pretend that there is no timer pending if the cpu is offline. 2243 * Possible pending timers will be migrated later to an active cpu. 2244 */ 2245 if (cpu_is_offline(smp_processor_id())) { 2246 if (idle) 2247 *idle = true; 2248 return tevt.local; 2249 } 2250 2251 base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); 2252 base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); 2253 2254 raw_spin_lock(&base_local->lock); 2255 raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); 2256 2257 nextevt = fetch_next_timer_interrupt(basej, basem, base_local, 2258 base_global, &tevt); 2259 2260 /* 2261 * If the next event is only one jiffie ahead there is no need to call 2262 * timer migration hierarchy related functions. The value for the next 2263 * global timer in @tevt struct equals then KTIME_MAX. This is also 2264 * true, when the timer base is idle. 2265 * 2266 * The proper timer migration hierarchy function depends on the callsite 2267 * and whether timer base is idle or not. @nextevt will be updated when 2268 * this CPU needs to handle the first timer migration hierarchy 2269 * event. See timer_use_tmigr() for detailed information. 2270 */ 2271 idle_is_possible = time_after(nextevt, basej + 1); 2272 if (idle_is_possible) 2273 timer_use_tmigr(basej, basem, &nextevt, idle, 2274 base_local->is_idle, &tevt); 2275 2276 /* 2277 * We have a fresh next event. Check whether we can forward the 2278 * base. 2279 */ 2280 __forward_timer_base(base_local, basej); 2281 __forward_timer_base(base_global, basej); 2282 2283 /* 2284 * Set base->is_idle only when caller is timer_base_try_to_set_idle() 2285 */ 2286 if (idle) { 2287 /* 2288 * Bases are idle if the next event is more than a tick 2289 * away. Caution: @nextevt could have changed by enqueueing a 2290 * global timer into timer migration hierarchy. Therefore a new 2291 * check is required here. 2292 * 2293 * If the base is marked idle then any timer add operation must 2294 * forward the base clk itself to keep granularity small. This 2295 * idle logic is only maintained for the BASE_LOCAL and 2296 * BASE_GLOBAL base, deferrable timers may still see large 2297 * granularity skew (by design). 
		 */
		if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
			base_local->is_idle = base_global->is_idle = true;
			trace_timer_base_idle(true, base_local->cpu);
		}
		*idle = base_local->is_idle;

		/*
		 * When the timer base is not set idle, undo the effect of
		 * tmigr_cpu_deactivate() to prevent inconsistent states -
		 * active timer base but inactive timer migration hierarchy.
		 *
		 * When the timer base was already marked idle, nothing will
		 * be changed here.
		 */
		if (!base_local->is_idle && idle_is_possible)
			tmigr_cpu_activate();
	}

	raw_spin_unlock(&base_global->lock);
	raw_spin_unlock(&base_local->lock);

	return cmp_next_hrtimer_event(basem, tevt.local);
}

/**
 * get_next_timer_interrupt() - return the time (clock mono) of the next timer
 * @basej: base time jiffies
 * @basem: base time clock monotonic
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. If a timer of the global base was queued
 * into the timer migration hierarchy, the first global timer is not taken
 * into account. If this was the last CPU of the timer migration hierarchy
 * going idle, the first global event is taken into account.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
	return __get_next_timer_interrupt(basej, basem, NULL);
}

/**
 * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
 * @basej: base time jiffies
 * @basem: base time clock monotonic
 * @idle: pointer to store the value of timer_base->is_idle on return;
 *	  *idle contains the information whether the tick was already stopped
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. When the tick was already stopped,
 * KTIME_MAX is returned as well.
 */
u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
{
	if (*idle)
		return KTIME_MAX;

	return __get_next_timer_interrupt(basej, basem, idle);
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
	/*
	 * We do this unlocked. The worst outcome is a remote enqueue sending
	 * a pointless IPI, but taking the lock would just make the window for
	 * sending the IPI a few instructions smaller for the cost of taking
	 * the lock in the exit from idle path.
	 */
	__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
	__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
	trace_timer_base_idle(false, smp_processor_id());

	/* Activate without holding the timer_base->lock */
	tmigr_cpu_activate();
}
#endif

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
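 *
 * Must be called with @base->lock held and interrupts disabled. Expired
 * timers are collected level by level into a list array; their callbacks
 * are then invoked with the lock dropped (and, for non-irqsafe timers,
 * interrupts enabled) around each call.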
2383 */ 2384 static inline void __run_timers(struct timer_base *base) 2385 { 2386 struct hlist_head heads[LVL_DEPTH]; 2387 int levels; 2388 2389 lockdep_assert_held(&base->lock); 2390 2391 if (base->running_timer) 2392 return; 2393 2394 while (time_after_eq(jiffies, base->clk) && 2395 time_after_eq(jiffies, base->next_expiry)) { 2396 levels = collect_expired_timers(base, heads); 2397 /* 2398 * The two possible reasons for not finding any expired 2399 * timer at this clk are that all matching timers have been 2400 * dequeued or no timer has been queued since 2401 * base::next_expiry was set to base::clk + 2402 * NEXT_TIMER_MAX_DELTA. 2403 */ 2404 WARN_ON_ONCE(!levels && !base->next_expiry_recalc 2405 && base->timers_pending); 2406 /* 2407 * While executing timers, base->clk is set 1 offset ahead of 2408 * jiffies to avoid endless requeuing to current jiffies. 2409 */ 2410 base->clk++; 2411 next_expiry_recalc(base); 2412 2413 while (levels--) 2414 expire_timers(base, heads + levels); 2415 } 2416 } 2417 2418 static void __run_timer_base(struct timer_base *base) 2419 { 2420 if (time_before(jiffies, base->next_expiry)) 2421 return; 2422 2423 timer_base_lock_expiry(base); 2424 raw_spin_lock_irq(&base->lock); 2425 __run_timers(base); 2426 raw_spin_unlock_irq(&base->lock); 2427 timer_base_unlock_expiry(base); 2428 } 2429 2430 static void run_timer_base(int index) 2431 { 2432 struct timer_base *base = this_cpu_ptr(&timer_bases[index]); 2433 2434 __run_timer_base(base); 2435 } 2436 2437 /* 2438 * This function runs timers and the timer-tq in bottom half context. 2439 */ 2440 static __latent_entropy void run_timer_softirq(struct softirq_action *h) 2441 { 2442 run_timer_base(BASE_LOCAL); 2443 if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { 2444 run_timer_base(BASE_GLOBAL); 2445 run_timer_base(BASE_DEF); 2446 2447 if (is_timers_nohz_active()) 2448 tmigr_handle_remote(); 2449 } 2450 } 2451 2452 /* 2453 * Called by the local, per-CPU timer interrupt on SMP. 2454 */ 2455 static void run_local_timers(void) 2456 { 2457 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); 2458 2459 hrtimer_run_queues(); 2460 2461 for (int i = 0; i < NR_BASES; i++, base++) { 2462 /* Raise the softirq only if required. */ 2463 if (time_after_eq(jiffies, base->next_expiry) || 2464 (i == BASE_DEF && tmigr_requires_handle_remote())) { 2465 raise_softirq(TIMER_SOFTIRQ); 2466 return; 2467 } 2468 } 2469 } 2470 2471 /* 2472 * Called from the timer interrupt handler to charge one tick to the current 2473 * process. user_tick is 1 if the tick is user time, 0 for system. 2474 */ 2475 void update_process_times(int user_tick) 2476 { 2477 struct task_struct *p = current; 2478 2479 /* Note: this timer irq context must be accounted for as well. */ 2480 account_process_tick(p, user_tick); 2481 run_local_timers(); 2482 rcu_sched_clock_irq(user_tick); 2483 #ifdef CONFIG_IRQ_WORK 2484 if (in_irq()) 2485 irq_work_tick(); 2486 #endif 2487 scheduler_tick(); 2488 if (IS_ENABLED(CONFIG_POSIX_TIMERS)) 2489 run_posix_cpu_timers(); 2490 } 2491 2492 /* 2493 * Since schedule_timeout()'s timer is defined on the stack, it must store 2494 * the target task on the stack as well. 
 */
struct process_timer {
	struct timer_list timer;
	struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
	struct process_timer *timeout = from_timer(timeout, t, timer);

	wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns 0 when the timer has expired, otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
	struct process_timer timer;
	unsigned long expire;

	switch (timeout)
	{
	case MAX_SCHEDULE_TIMEOUT:
		/*
		 * These two special cases are here for the caller's
		 * convenience. Nothing more. We could take
		 * MAX_SCHEDULE_TIMEOUT from one of the negative values,
		 * but I'd like to return a valid offset (>=0) to allow
		 * the caller to do everything it wants with the retval.
		 */
		schedule();
		goto out;
	default:
		/*
		 * Another bit of paranoia. Note that the retval will be
		 * 0 since no piece of the kernel is supposed to check
		 * for a negative retval of schedule_timeout() (since it
		 * should never happen anyway). You just have the printk()
		 * that will tell you if something has gone wrong and where.
		 */
		if (timeout < 0) {
			printk(KERN_ERR "schedule_timeout: wrong timeout "
				"value %lx\n", timeout);
			dump_stack();
			__set_current_state(TASK_RUNNING);
			goto out;
		}
	}

	expire = timeout + jiffies;

	timer.task = current;
	timer_setup_on_stack(&timer.timer, process_timeout, 0);
	__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
	schedule();
	del_timer_sync(&timer.timer);

	/* Remove the timer from the object tracker */
	destroy_timer_on_stack(&timer.timer);

	timeout = expire - jiffies;

 out:
	return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * We can use __set_current_state() here because schedule_timeout() calls
 * schedule() unconditionally.
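 *
 * Each wrapper below simply replaces the open-coded pattern a caller
 * would otherwise write, e.g. (hypothetical caller sketch):
 *
 *	__set_current_state(TASK_INTERRUPTIBLE);
 *	remaining = schedule_timeout(timeout);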
2594 */ 2595 signed long __sched schedule_timeout_interruptible(signed long timeout) 2596 { 2597 __set_current_state(TASK_INTERRUPTIBLE); 2598 return schedule_timeout(timeout); 2599 } 2600 EXPORT_SYMBOL(schedule_timeout_interruptible); 2601 2602 signed long __sched schedule_timeout_killable(signed long timeout) 2603 { 2604 __set_current_state(TASK_KILLABLE); 2605 return schedule_timeout(timeout); 2606 } 2607 EXPORT_SYMBOL(schedule_timeout_killable); 2608 2609 signed long __sched schedule_timeout_uninterruptible(signed long timeout) 2610 { 2611 __set_current_state(TASK_UNINTERRUPTIBLE); 2612 return schedule_timeout(timeout); 2613 } 2614 EXPORT_SYMBOL(schedule_timeout_uninterruptible); 2615 2616 /* 2617 * Like schedule_timeout_uninterruptible(), except this task will not contribute 2618 * to load average. 2619 */ 2620 signed long __sched schedule_timeout_idle(signed long timeout) 2621 { 2622 __set_current_state(TASK_IDLE); 2623 return schedule_timeout(timeout); 2624 } 2625 EXPORT_SYMBOL(schedule_timeout_idle); 2626 2627 #ifdef CONFIG_HOTPLUG_CPU 2628 static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) 2629 { 2630 struct timer_list *timer; 2631 int cpu = new_base->cpu; 2632 2633 while (!hlist_empty(head)) { 2634 timer = hlist_entry(head->first, struct timer_list, entry); 2635 detach_timer(timer, false); 2636 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; 2637 internal_add_timer(new_base, timer); 2638 } 2639 } 2640 2641 int timers_prepare_cpu(unsigned int cpu) 2642 { 2643 struct timer_base *base; 2644 int b; 2645 2646 for (b = 0; b < NR_BASES; b++) { 2647 base = per_cpu_ptr(&timer_bases[b], cpu); 2648 base->clk = jiffies; 2649 base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; 2650 base->next_expiry_recalc = false; 2651 base->timers_pending = false; 2652 base->is_idle = false; 2653 } 2654 return 0; 2655 } 2656 2657 int timers_dead_cpu(unsigned int cpu) 2658 { 2659 struct timer_base *old_base; 2660 struct timer_base *new_base; 2661 int b, i; 2662 2663 for (b = 0; b < NR_BASES; b++) { 2664 old_base = per_cpu_ptr(&timer_bases[b], cpu); 2665 new_base = get_cpu_ptr(&timer_bases[b]); 2666 /* 2667 * The caller is globally serialized and nobody else 2668 * takes two locks at once, deadlock is not possible. 2669 */ 2670 raw_spin_lock_irq(&new_base->lock); 2671 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 2672 2673 /* 2674 * The current CPUs base clock might be stale. Update it 2675 * before moving the timers over. 
2676 */ 2677 forward_timer_base(new_base); 2678 2679 WARN_ON_ONCE(old_base->running_timer); 2680 old_base->running_timer = NULL; 2681 2682 for (i = 0; i < WHEEL_SIZE; i++) 2683 migrate_timer_list(new_base, old_base->vectors + i); 2684 2685 raw_spin_unlock(&old_base->lock); 2686 raw_spin_unlock_irq(&new_base->lock); 2687 put_cpu_ptr(&timer_bases); 2688 } 2689 return 0; 2690 } 2691 2692 #endif /* CONFIG_HOTPLUG_CPU */ 2693 2694 static void __init init_timer_cpu(int cpu) 2695 { 2696 struct timer_base *base; 2697 int i; 2698 2699 for (i = 0; i < NR_BASES; i++) { 2700 base = per_cpu_ptr(&timer_bases[i], cpu); 2701 base->cpu = cpu; 2702 raw_spin_lock_init(&base->lock); 2703 base->clk = jiffies; 2704 base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; 2705 timer_base_init_expiry_lock(base); 2706 } 2707 } 2708 2709 static void __init init_timer_cpus(void) 2710 { 2711 int cpu; 2712 2713 for_each_possible_cpu(cpu) 2714 init_timer_cpu(cpu); 2715 } 2716 2717 void __init init_timers(void) 2718 { 2719 init_timer_cpus(); 2720 posix_cputimers_init_work(); 2721 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 2722 } 2723 2724 /** 2725 * msleep - sleep safely even with waitqueue interruptions 2726 * @msecs: Time in milliseconds to sleep for 2727 */ 2728 void msleep(unsigned int msecs) 2729 { 2730 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 2731 2732 while (timeout) 2733 timeout = schedule_timeout_uninterruptible(timeout); 2734 } 2735 2736 EXPORT_SYMBOL(msleep); 2737 2738 /** 2739 * msleep_interruptible - sleep waiting for signals 2740 * @msecs: Time in milliseconds to sleep for 2741 */ 2742 unsigned long msleep_interruptible(unsigned int msecs) 2743 { 2744 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 2745 2746 while (timeout && !signal_pending(current)) 2747 timeout = schedule_timeout_interruptible(timeout); 2748 return jiffies_to_msecs(timeout); 2749 } 2750 2751 EXPORT_SYMBOL(msleep_interruptible); 2752 2753 /** 2754 * usleep_range_state - Sleep for an approximate time in a given state 2755 * @min: Minimum time in usecs to sleep 2756 * @max: Maximum time in usecs to sleep 2757 * @state: State of the current task that will be while sleeping 2758 * 2759 * In non-atomic context where the exact wakeup time is flexible, use 2760 * usleep_range_state() instead of udelay(). The sleep improves responsiveness 2761 * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces 2762 * power usage by allowing hrtimers to take advantage of an already- 2763 * scheduled interrupt instead of scheduling a new one just for this sleep. 2764 */ 2765 void __sched usleep_range_state(unsigned long min, unsigned long max, 2766 unsigned int state) 2767 { 2768 ktime_t exp = ktime_add_us(ktime_get(), min); 2769 u64 delta = (u64)(max - min) * NSEC_PER_USEC; 2770 2771 for (;;) { 2772 __set_current_state(state); 2773 /* Do not return before the requested sleep time has elapsed */ 2774 if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) 2775 break; 2776 } 2777 } 2778 EXPORT_SYMBOL(usleep_range_state); 2779
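
/*
 * usleep_range() in <linux/delay.h> is a thin wrapper which passes
 * TASK_UNINTERRUPTIBLE to usleep_range_state(). A typical use is polling
 * a slow status register where some slack is acceptable; a hypothetical
 * driver sketch (mydrv_read_status() and MYDRV_READY are made up):
 *
 *	while (retries--) {
 *		if (mydrv_read_status(drv) & MYDRV_READY)
 *			break;
 *		usleep_range(200, 300);
 *	}
 */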