// SPDX-License-Identifier: GPL-2.0-only
/*
 * Infrastructure for migratable timers
 *
 * Copyright(C) 2022 linutronix GmbH
 */
#include <linux/cpuhotplug.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/timerqueue.h>
#include <trace/events/ipi.h>

#include "timer_migration.h"
#include "tick-internal.h"

/*
 * The timer migration mechanism is built on a hierarchy of groups. The
 * lowest level group contains CPUs, the next level groups of CPU groups
 * and so forth. The CPU groups are kept per node so for the normal case
 * lock contention won't happen across nodes. Depending on the number of
 * CPUs per node even the next level might be kept as groups of CPU groups
 * per node and only the levels above cross the node topology.
 *
 * Example topology for a two node system with 24 CPUs each:
 *
 * LVL 2                              [GRP2:0]
 *                                GRP1:0 = GRP1:M
 *
 * LVL 1             [GRP1:0]                       [GRP1:1]
 *                GRP0:0 - GRP0:2                GRP0:3 - GRP0:5
 *
 * LVL 0  [GRP0:0] [GRP0:1] [GRP0:2]    [GRP0:3] [GRP0:4] [GRP0:5]
 * CPUS     0-7      8-15    16-23        24-31    32-39    40-47
 *
 * The groups hold a timer queue of events sorted by expiry time. These
 * queues are updated when CPUs go idle. When they come out of idle, the
 * ignore flag of their queued events is set.
 *
 * Each group has a designated migrator CPU/group as long as a CPU/group is
 * active in the group. This designated role is necessary to prevent all
 * active CPUs in a group from trying to migrate expired timers from other
 * CPUs, which would result in massive lock bouncing.
 *
 * When a CPU is awake, it checks in its own timer tick the group hierarchy
 * up to the point where it is assigned the migrator role; if no CPU is
 * active, it also checks the groups where no migrator is set (TMIGR_NONE).
 *
 * If it finds expired timers in one of the group queues it pulls them over
 * from the idle CPU and runs the timer function. After that it updates the
 * group and the parent groups if required.
 *
 * CPUs which go idle arm their CPU local timer hardware for the next local
 * (pinned) timer event. If the next migratable timer expires after the
 * next local timer or the CPU has no migratable timer pending then the
 * CPU does not queue an event in the LVL0 group. If the next migratable
 * timer expires before the next local timer then the CPU queues that timer
 * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0
 * group.
 *
 * When a CPU comes out of idle and when a group has at least a single
 * active child, the ignore flag of the tmigr_event is set. This indicates
 * that the event is ignored even if it is still enqueued in the parent
 * group's timer queue. It will be removed when touching the timer queue
 * the next time. This spares locking in the active path as the lock
 * protects (after setup) only event information. For more information
 * about locking, please read the section "Locking rules".
 *
 * If the CPU is the migrator of the group then it delegates that role to
 * the next active CPU in the group or sets migrator to TMIGR_NONE when
 * there is no active CPU in the group. This delegation needs to be
 * propagated up the hierarchy so that handover from other leaves can
 * happen at all hierarchy levels without doing a search.
 *
 * When the last CPU in the system goes idle, it drops all migrator duties
 * up to the top level of the hierarchy (LVL2 in the example). It then has
 * to make sure that it arms its own local hardware timer for the earliest
 * event in the system.
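 *
 * Condensed to a sketch, the tick check of an active CPU (implemented by
 * tmigr_requires_handle_remote() and tmigr_handle_remote() below) is a
 * bottom up walk:
 *
 *        group = tmc->tmgroup;
 *        while (group) {
 *                if (!tmigr_check_migrator(group, childmask))
 *                        break;  // an active sibling is the migrator here
 *                // This CPU acts as migrator: expire due events queued by
 *                // idle children of this group, then go one level up.
 *                childmask = group->childmask;
 *                group = group->parent;
 *        }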
 *
 *
 * Lifetime rules:
 * ---------------
 *
 * The groups are built up at init time or when CPUs come online. They are
 * not destroyed when a group becomes empty due to offlining. The group
 * just won't participate in the hierarchy management anymore. Destroying
 * groups would result in interesting race conditions which would just make
 * the whole mechanism slow and complex.
 *
 *
 * Locking rules:
 * --------------
 *
 * For setting up new groups and handling events it's required to lock both
 * child and parent group. The lock ordering is always bottom up. This also
 * includes the per CPU locks in struct tmigr_cpu. For updating the migrator and
 * active CPU/group information atomic_try_cmpxchg() is used instead and only
 * the per CPU tmigr_cpu->lock is held.
 *
 * During the setup of groups tmigr_level_list is required. It is protected by
 * @tmigr_mutex.
 *
 * When @timer_base->lock as well as tmigr related locks are required, the lock
 * ordering is: first @timer_base->lock, afterwards tmigr related locks.
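 *
 * Condensed from tmigr_handle_remote_cpu() below, this ordering looks like:
 *
 *        local_irq_disable();
 *        timer_lock_remote_bases(cpu);    // timer_base locks first
 *        raw_spin_lock(&tmc->lock);       // tmigr related lock afterwards
 *
 *        // ... fetch the first expiring global timer of the remote CPU ...
 *
 *        timer_unlock_remote_bases(cpu);  // drop timer_base locks early;
 *                                         // tmc->lock stays held for the
 *                                         // following hierarchy walk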
 *
 *
 * Protection of the tmigr group state information:
 * ------------------------------------------------
 *
 * The state information with the list of active children and migrator needs to
 * be protected by a sequence counter. It prevents a race when updates in child
 * groups are propagated in changed order. The state update is performed
 * lockless and group wise. The following scenario describes what happens
 * without updating the sequence counter:
 *
 * Let's take three groups and four CPUs (CPU2 and CPU3 as well as GRP0:1
 * will not change during the scenario):
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:0, GRP0:1
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = CPU0           migrator = CPU2
 *           active   = CPU0           active   = CPU2
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          active      idle           active      idle
 *
 *
 * 1. CPU0 goes idle. As the update is performed group wise, in the first step
 *    only GRP0:0 is updated. The update of GRP1:0 is pending as CPU0 has to
 *    walk the hierarchy.
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:0, GRP0:1
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *       --> migrator = TMIGR_NONE     migrator = CPU2
 *       --> active   =                active   = CPU2
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *      --> idle        idle           active      idle
 *
 * 2. While CPU0 goes idle and continues to update the state, CPU1 comes out of
 *    idle. CPU1 updates GRP0:0. The update for GRP1:0 is pending as CPU1 also
 *    has to walk the hierarchy. Both CPUs (CPU0 and CPU1) now walk the
 *    hierarchy to perform the needed update from their point of view. The
 *    currently visible state looks like this:
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:0, GRP0:1
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *       --> migrator = CPU1           migrator = CPU2
 *       --> active   = CPU1           active   = CPU2
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle    --> active         active      idle
 *
 * 3. Here is the race condition: CPU1 managed to propagate its changes (from
 *    step 2) through the hierarchy to GRP1:0 before CPU0 (step 1) did. The
 *    active members of GRP1:0 remain unchanged after the update since it is
 *    still valid from CPU1's current point of view:
 *
 *    LVL 1            [GRP1:0]
 *                 --> migrator = GRP0:1
 *                 --> active   = GRP0:0, GRP0:1
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = CPU1           migrator = CPU2
 *           active   = CPU1           active   = CPU2
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        active         active      idle
 *
 * 4. Now CPU0 finally propagates its changes (from step 1) to GRP1:0.
 *
 *    LVL 1            [GRP1:0]
 *                 --> migrator = GRP0:1
 *                 --> active   = GRP0:1
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = CPU1           migrator = CPU2
 *           active   = CPU1           active   = CPU2
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        active         active      idle
 *
 *
 * The race of CPU0 vs. CPU1 led to an inconsistent state in GRP1:0. CPU1 is
 * active and is correctly listed as active in GRP0:0. However GRP1:0 does not
 * have GRP0:0 listed as active, which is wrong. The sequence counter has been
 * added to avoid inconsistent states during updates. The state is updated
 * atomically only if all members, including the sequence counter, match the
 * expected value (compare-and-exchange).
 *
 * Looking back at the previous example with the addition of the sequence
 * counter: The update as performed by CPU0 in step 4 will fail. CPU1 changed
 * the sequence number during the update in step 3 so the expected old value (as
 * seen by CPU0 before starting the walk) does not match.
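 *
 * The lockless update which relies on this sequence counter is implemented
 * by e.g. tmigr_active_up() and tmigr_inactive_up() below; condensed:
 *
 *        curstate.state = atomic_read(&group->migr_state);
 *        do {
 *                newstate = curstate;
 *                // ... adjust newstate.migrator and newstate.active ...
 *                newstate.seq++;
 *        } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state,
 *                                     newstate.state));
 *
 * A walk which operates on a meanwhile outdated snapshot fails the
 * compare-and-exchange because of the changed sequence number and has to
 * retry with the fresh state.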
 *
 * Prevent race between new event and last CPU going inactive
 * -----------------------------------------------------------
 *
 * When the last CPU is going idle and there is a concurrent update of a new
 * first global timer of an idle CPU, the group and child states have to be read
 * while holding the lock in tmigr_update_events(). The following scenario shows
 * what happens when this is not done.
 *
 * 1. Only CPU2 is active:
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:1
 *                     next_expiry = KTIME_MAX
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE     migrator = CPU2
 *           active   =                active   = CPU2
 *           next_expiry = KTIME_MAX   next_expiry = KTIME_MAX
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle           active      idle
 *
 * 2. Now CPU2 goes idle (and has no global timer that has to be handled) and
 *    propagates that to GRP0:1:
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:1
 *                     next_expiry = KTIME_MAX
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE --> migrator = TMIGR_NONE
 *           active   =            --> active   =
 *           next_expiry = KTIME_MAX   next_expiry = KTIME_MAX
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle       --> idle        idle
 *
 * 3. Now the idle state is propagated up to GRP1:0. As this is now the last
 *    child going idle in the top level group, the expiry of the next group
 *    event has to be handed back to make sure no event is lost. As there is
 *    no event enqueued, KTIME_MAX is handed back to CPU2.
 *
 *    LVL 1            [GRP1:0]
 *                 --> migrator = TMIGR_NONE
 *                 --> active   =
 *                     next_expiry = KTIME_MAX
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE     migrator = TMIGR_NONE
 *           active   =                active   =
 *           next_expiry = KTIME_MAX   next_expiry = KTIME_MAX
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle       --> idle        idle
 *
 * 4. CPU0 has a new timer queued from idle and it expires at TIMER0. CPU0
 *    propagates that to GRP0:0:
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = TMIGR_NONE
 *                     active   =
 *                     next_expiry = KTIME_MAX
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE     migrator = TMIGR_NONE
 *           active   =                active   =
 *       --> next_expiry = TIMER0      next_expiry = KTIME_MAX
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle           idle        idle
 *
 * 5. GRP0:0 is not active, so the new timer has to be propagated to
 *    GRP1:0. Therefore the GRP1:0 state has to be read. When the stale value
 *    (from step 2) is read, the timer is enqueued into GRP1:0, but nothing is
 *    handed back to CPU0, as it seems that there is still an active child in
 *    the top level group.
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = TMIGR_NONE
 *                     active   =
 *                 --> next_expiry = TIMER0
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE     migrator = TMIGR_NONE
 *           active   =                active   =
 *           next_expiry = TIMER0      next_expiry = KTIME_MAX
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle           idle        idle
 *
 * This is prevented by reading the state when holding the lock (when a new
 * timer has to be propagated from the idle path)::
 *
 *    CPU2 (tmigr_inactive_up())           CPU0 (tmigr_new_timer_up())
 *    --------------------------           ---------------------------
 *    // step 3:
 *    cmpxchg(&GRP1:0->state);
 *    tmigr_update_events() {
 *        spin_lock(&GRP1:0->lock);
 *        // ... update events ...
 *        // hand back first expiry when GRP1:0 is idle
 *        spin_unlock(&GRP1:0->lock);
 *        // ^^^ release state modification
 *    }
 *                                         tmigr_update_events() {
 *                                             spin_lock(&GRP1:0->lock)
 *                                             // ^^^ acquire state modification
 *                                             group_state = atomic_read(&GRP1:0->state)
 *                                             // .... update events ...
 *                                             // hand back first expiry when GRP1:0 is idle
 *                                             spin_unlock(&GRP1:0->lock) <3>
 *                                             // ^^^ makes state visible for other
 *                                             // callers of tmigr_new_timer_up()
 *                                         }
 *
 * When CPU0 grabs the lock directly after cmpxchg, the first timer is reported
 * back to CPU0 and also later on to CPU2. So no timer is missed. A concurrent
 * update of the group state from the active path is no problem, as the upcoming
 * CPU will take care of the group events.
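 *
 * For reference, the relevant part of tmigr_update_events() below reads the
 * child and group states only after the locks have been acquired (condensed):
 *
 *        raw_spin_lock(&child->lock);
 *        raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING);
 *
 *        childstate.state = atomic_read(&child->migr_state);
 *        groupstate.state = atomic_read(&group->migr_state);
 *
 *        // ... update the group's event and timerqueue; hand the first
 *        // expiry back when the top level group is completely idle ...
 *
 *        raw_spin_unlock(&group->lock);
 *        raw_spin_unlock(&child->lock);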
 *
 * Required event and timerqueue update after a remote expiry:
 * -----------------------------------------------------------
 *
 * After expiring timers of a remote CPU, a walk through the hierarchy and
 * update of events and timerqueues is required. It is obviously needed if there
 * is a 'new' global timer but also if there is no new global timer but the
 * remote CPU is still idle.
 *
 * 1. CPU0 and CPU1 are idle and both have a global timer expiring at the same
 *    time. So both have an event enqueued in the timerqueue of GRP0:0. CPU3 is
 *    also idle and has no global timer pending. CPU2 is the only active CPU and
 *    thus also the migrator:
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:1
 *                 --> timerqueue = evt-GRP0:0
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE     migrator = CPU2
 *           active   =                active   = CPU2
 *           groupevt.ignore = false   groupevt.ignore = true
 *           groupevt.cpu = CPU0       groupevt.cpu =
 *           timerqueue = evt-CPU0,    timerqueue =
 *                        evt-CPU1
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle           active      idle
 *
 * 2. CPU2 starts to expire remote timers. It starts with LVL0 group
 *    GRP0:1. There is no event queued in the timerqueue, so CPU2 continues with
 *    the parent of GRP0:1: GRP1:0. In GRP1:0 it dequeues the first event. It
 *    looks at tmigr_event::cpu struct member and expires the pending timer(s)
 *    of CPU0.
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:1
 *                 --> timerqueue =
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE     migrator = CPU2
 *           active   =                active   = CPU2
 *           groupevt.ignore = false   groupevt.ignore = true
 *       --> groupevt.cpu = CPU0       groupevt.cpu =
 *           timerqueue = evt-CPU0,    timerqueue =
 *                        evt-CPU1
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle           active      idle
 *
 * 3. Some work has to be done after expiring the timers of CPU0. If we stop
 *    here, then CPU1's pending global timer(s) will not expire in time and the
 *    timerqueue of GRP0:0 still has an event for CPU0 enqueued which has just
 *    been processed. So it is required to walk the hierarchy from CPU0's point
 *    of view and update it accordingly. CPU0's event will be removed from the
 *    timerqueue because it has no pending timer. If CPU0 had a timer pending,
 *    it would have to expire after CPU1's first timer because all timers from
 *    this period were just expired. Either way, CPU1's event will be first in
 *    GRP0:0's timerqueue and is therefore set in the CPU field of the group
 *    event, which is then enqueued in GRP1:0's timerqueue as GRP0:0 is still
 *    not active:
 *
 *    LVL 1            [GRP1:0]
 *                     migrator = GRP0:1
 *                     active   = GRP0:1
 *                 --> timerqueue = evt-GRP0:0
 *                    /                \
 *    LVL 0  [GRP0:0]                  [GRP0:1]
 *           migrator = TMIGR_NONE     migrator = CPU2
 *           active   =                active   = CPU2
 *           groupevt.ignore = false   groupevt.ignore = true
 *       --> groupevt.cpu = CPU1       groupevt.cpu =
 *       --> timerqueue = evt-CPU1     timerqueue =
 *          /         \                /         \
 *    CPUs  0           1              2           3
 *          idle        idle           active      idle
 *
 * Now CPU2 (migrator) will continue step 2 at GRP1:0 and will expire the
 * timer(s) of CPU1.
 *
 * The hierarchy walk in step 3 can be skipped if the migrator notices that a
 * CPU of GRP0:0 is active again. The CPU will mark GRP0:0 active and take care
 * of the group as migrator and any needed updates within the hierarchy.
 */

static DEFINE_MUTEX(tmigr_mutex);
static struct list_head *tmigr_level_list __read_mostly;

static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;

static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);

#define TMIGR_NONE	0xFF
#define BIT_CNT		8

static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
{
	return !(tmc->tmgroup && tmc->online);
}

/*
 * Returns true when @childmask corresponds to the group migrator or when the
 * group is not active - so no migrator is set.
433 */ 434 static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask) 435 { 436 union tmigr_state s; 437 438 s.state = atomic_read(&group->migr_state); 439 440 if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) 441 return true; 442 443 return false; 444 } 445 446 static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask) 447 { 448 bool lonely, migrator = false; 449 unsigned long active; 450 union tmigr_state s; 451 452 s.state = atomic_read(&group->migr_state); 453 454 if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) 455 migrator = true; 456 457 active = s.active; 458 lonely = bitmap_weight(&active, BIT_CNT) <= 1; 459 460 return (migrator && lonely); 461 } 462 463 static bool tmigr_check_lonely(struct tmigr_group *group) 464 { 465 unsigned long active; 466 union tmigr_state s; 467 468 s.state = atomic_read(&group->migr_state); 469 470 active = s.active; 471 472 return bitmap_weight(&active, BIT_CNT) <= 1; 473 } 474 475 typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); 476 477 static void __walk_groups(up_f up, void *data, 478 struct tmigr_cpu *tmc) 479 { 480 struct tmigr_group *child = NULL, *group = tmc->tmgroup; 481 482 do { 483 WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); 484 485 if (up(group, child, data)) 486 break; 487 488 child = group; 489 group = group->parent; 490 } while (group); 491 } 492 493 static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) 494 { 495 lockdep_assert_held(&tmc->lock); 496 497 __walk_groups(up, data, tmc); 498 } 499 500 /** 501 * struct tmigr_walk - data required for walking the hierarchy 502 * @nextexp: Next CPU event expiry information which is handed into 503 * the timer migration code by the timer code 504 * (get_next_timer_interrupt()) 505 * @firstexp: Contains the first event expiry information when last 506 * active CPU of hierarchy is on the way to idle to make 507 * sure CPU will be back in time. 508 * @evt: Pointer to tmigr_event which needs to be queued (of idle 509 * child group) 510 * @childmask: childmask of child group 511 * @remote: Is set, when the new timer path is executed in 512 * tmigr_handle_remote_cpu() 513 */ 514 struct tmigr_walk { 515 u64 nextexp; 516 u64 firstexp; 517 struct tmigr_event *evt; 518 u8 childmask; 519 bool remote; 520 }; 521 522 /** 523 * struct tmigr_remote_data - data required for remote expiry hierarchy walk 524 * @basej: timer base in jiffies 525 * @now: timer base monotonic 526 * @firstexp: returns expiry of the first timer in the idle timer 527 * migration hierarchy to make sure the timer is handled in 528 * time; it is stored in the per CPU tmigr_cpu struct of 529 * CPU which expires remote timers 530 * @childmask: childmask of child group 531 * @check: is set if there is the need to handle remote timers; 532 * required in tmigr_requires_handle_remote() only 533 * @tmc_active: this flag indicates, whether the CPU which triggers 534 * the hierarchy walk is !idle in the timer migration 535 * hierarchy. When the CPU is idle and the whole hierarchy is 536 * idle, only the first event of the top level has to be 537 * considered. 538 */ 539 struct tmigr_remote_data { 540 unsigned long basej; 541 u64 now; 542 u64 firstexp; 543 u8 childmask; 544 bool check; 545 bool tmc_active; 546 }; 547 548 /* 549 * Returns the next event of the timerqueue @group->events 550 * 551 * Removes timers with ignore flag and update next_expiry of the group. Values 552 * of the group event are updated in tmigr_update_events() only. 
553 */ 554 static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) 555 { 556 struct timerqueue_node *node = NULL; 557 struct tmigr_event *evt = NULL; 558 559 lockdep_assert_held(&group->lock); 560 561 WRITE_ONCE(group->next_expiry, KTIME_MAX); 562 563 while ((node = timerqueue_getnext(&group->events))) { 564 evt = container_of(node, struct tmigr_event, nextevt); 565 566 if (!evt->ignore) { 567 WRITE_ONCE(group->next_expiry, evt->nextevt.expires); 568 return evt; 569 } 570 571 /* 572 * Remove next timers with ignore flag, because the group lock 573 * is held anyway 574 */ 575 if (!timerqueue_del(&group->events, node)) 576 break; 577 } 578 579 return NULL; 580 } 581 582 /* 583 * Return the next event (with the expiry equal or before @now) 584 * 585 * Event, which is returned, is also removed from the queue. 586 */ 587 static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group, 588 u64 now) 589 { 590 struct tmigr_event *evt = tmigr_next_groupevt(group); 591 592 if (!evt || now < evt->nextevt.expires) 593 return NULL; 594 595 /* 596 * The event is ready to expire. Remove it and update next group event. 597 */ 598 timerqueue_del(&group->events, &evt->nextevt); 599 tmigr_next_groupevt(group); 600 601 return evt; 602 } 603 604 static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) 605 { 606 struct tmigr_event *evt; 607 608 evt = tmigr_next_groupevt(group); 609 610 if (!evt) 611 return KTIME_MAX; 612 else 613 return evt->nextevt.expires; 614 } 615 616 static bool tmigr_active_up(struct tmigr_group *group, 617 struct tmigr_group *child, 618 void *ptr) 619 { 620 union tmigr_state curstate, newstate; 621 struct tmigr_walk *data = ptr; 622 bool walk_done; 623 u8 childmask; 624 625 childmask = data->childmask; 626 /* 627 * No memory barrier is required here in contrast to 628 * tmigr_inactive_up(), as the group state change does not depend on the 629 * child state. 630 */ 631 curstate.state = atomic_read(&group->migr_state); 632 633 do { 634 newstate = curstate; 635 walk_done = true; 636 637 if (newstate.migrator == TMIGR_NONE) { 638 newstate.migrator = childmask; 639 640 /* Changes need to be propagated */ 641 walk_done = false; 642 } 643 644 newstate.active |= childmask; 645 newstate.seq++; 646 647 } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); 648 649 if ((walk_done == false) && group->parent) 650 data->childmask = group->childmask; 651 652 /* 653 * The group is active (again). The group event might be still queued 654 * into the parent group's timerqueue but can now be handled by the 655 * migrator of this group. Therefore the ignore flag for the group event 656 * is updated to reflect this. 657 * 658 * The update of the ignore flag in the active path is done lockless. In 659 * worst case the migrator of the parent group observes the change too 660 * late and expires remotely all events belonging to this group. The 661 * lock is held while updating the ignore flag in idle path. So this 662 * state change will not be lost. 
663 */ 664 group->groupevt.ignore = true; 665 666 return walk_done; 667 } 668 669 static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) 670 { 671 struct tmigr_walk data; 672 673 data.childmask = tmc->childmask; 674 675 tmc->cpuevt.ignore = true; 676 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 677 678 walk_groups(&tmigr_active_up, &data, tmc); 679 } 680 681 /** 682 * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy 683 * 684 * Call site timer_clear_idle() is called with interrupts disabled. 685 */ 686 void tmigr_cpu_activate(void) 687 { 688 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 689 690 if (tmigr_is_not_available(tmc)) 691 return; 692 693 if (WARN_ON_ONCE(!tmc->idle)) 694 return; 695 696 raw_spin_lock(&tmc->lock); 697 tmc->idle = false; 698 __tmigr_cpu_activate(tmc); 699 raw_spin_unlock(&tmc->lock); 700 } 701 702 /* 703 * Returns true, if there is nothing to be propagated to the next level 704 * 705 * @data->firstexp is set to expiry of first gobal event of the (top level of 706 * the) hierarchy, but only when hierarchy is completely idle. 707 * 708 * The child and group states need to be read under the lock, to prevent a race 709 * against a concurrent tmigr_inactive_up() run when the last CPU goes idle. See 710 * also section "Prevent race between new event and last CPU going inactive" in 711 * the documentation at the top. 712 * 713 * This is the only place where the group event expiry value is set. 714 */ 715 static 716 bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, 717 struct tmigr_walk *data) 718 { 719 struct tmigr_event *evt, *first_childevt; 720 union tmigr_state childstate, groupstate; 721 bool remote = data->remote; 722 bool walk_done = false; 723 u64 nextexp; 724 725 if (child) { 726 raw_spin_lock(&child->lock); 727 raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); 728 729 childstate.state = atomic_read(&child->migr_state); 730 groupstate.state = atomic_read(&group->migr_state); 731 732 if (childstate.active) { 733 walk_done = true; 734 goto unlock; 735 } 736 737 first_childevt = tmigr_next_groupevt(child); 738 nextexp = child->next_expiry; 739 evt = &child->groupevt; 740 741 evt->ignore = (nextexp == KTIME_MAX) ? true : false; 742 } else { 743 nextexp = data->nextexp; 744 745 first_childevt = evt = data->evt; 746 747 /* 748 * Walking the hierarchy is required in any case when a 749 * remote expiry was done before. This ensures to not lose 750 * already queued events in non active groups (see section 751 * "Required event and timerqueue update after a remote 752 * expiry" in the documentation at the top). 753 * 754 * The two call sites which are executed without a remote expiry 755 * before, are not prevented from propagating changes through 756 * the hierarchy by the return: 757 * - When entering this path by tmigr_new_timer(), @evt->ignore 758 * is never set. 759 * - tmigr_inactive_up() takes care of the propagation by 760 * itself and ignores the return value. But an immediate 761 * return is required because nothing has to be done in this 762 * level as the event could be ignored. 763 */ 764 if (evt->ignore && !remote) 765 return true; 766 767 raw_spin_lock(&group->lock); 768 769 childstate.state = 0; 770 groupstate.state = atomic_read(&group->migr_state); 771 } 772 773 /* 774 * If the child event is already queued in the group, remove it from the 775 * queue when the expiry time changed only or when it could be ignored. 
776 */ 777 if (timerqueue_node_queued(&evt->nextevt)) { 778 if ((evt->nextevt.expires == nextexp) && !evt->ignore) 779 goto check_toplvl; 780 781 if (!timerqueue_del(&group->events, &evt->nextevt)) 782 WRITE_ONCE(group->next_expiry, KTIME_MAX); 783 } 784 785 if (evt->ignore) { 786 /* 787 * When the next child event could be ignored (nextexp is 788 * KTIME_MAX) and there was no remote timer handling before or 789 * the group is already active, there is no need to walk the 790 * hierarchy even if there is a parent group. 791 * 792 * The other way round: even if the event could be ignored, but 793 * if a remote timer handling was executed before and the group 794 * is not active, walking the hierarchy is required to not miss 795 * an enqueued timer in the non active group. The enqueued timer 796 * of the group needs to be propagated to a higher level to 797 * ensure it is handled. 798 */ 799 if (!remote || groupstate.active) 800 walk_done = true; 801 } else { 802 evt->nextevt.expires = nextexp; 803 evt->cpu = first_childevt->cpu; 804 805 if (timerqueue_add(&group->events, &evt->nextevt)) 806 WRITE_ONCE(group->next_expiry, nextexp); 807 } 808 809 check_toplvl: 810 if (!group->parent && (groupstate.migrator == TMIGR_NONE)) { 811 walk_done = true; 812 813 /* 814 * Nothing to do when update was done during remote timer 815 * handling. First timer in top level group which needs to be 816 * handled when top level group is not active, is calculated 817 * directly in tmigr_handle_remote_up(). 818 */ 819 if (remote) 820 goto unlock; 821 822 /* 823 * The top level group is idle and it has to be ensured the 824 * global timers are handled in time. (This could be optimized 825 * by keeping track of the last global scheduled event and only 826 * arming it on the CPU if the new event is earlier. Not sure if 827 * its worth the complexity.) 828 */ 829 data->firstexp = tmigr_next_groupevt_expires(group); 830 } 831 832 unlock: 833 raw_spin_unlock(&group->lock); 834 835 if (child) 836 raw_spin_unlock(&child->lock); 837 838 return walk_done; 839 } 840 841 static bool tmigr_new_timer_up(struct tmigr_group *group, 842 struct tmigr_group *child, 843 void *ptr) 844 { 845 struct tmigr_walk *data = ptr; 846 847 return tmigr_update_events(group, child, data); 848 } 849 850 /* 851 * Returns the expiry of the next timer that needs to be handled. KTIME_MAX is 852 * returned, if an active CPU will handle all the timer migration hierarchy 853 * timers. 854 */ 855 static u64 tmigr_new_timer(struct tmigr_cpu *tmc, u64 nextexp) 856 { 857 struct tmigr_walk data = { .nextexp = nextexp, 858 .firstexp = KTIME_MAX, 859 .evt = &tmc->cpuevt }; 860 861 lockdep_assert_held(&tmc->lock); 862 863 if (tmc->remote) 864 return KTIME_MAX; 865 866 tmc->cpuevt.ignore = false; 867 data.remote = false; 868 869 walk_groups(&tmigr_new_timer_up, &data, tmc); 870 871 /* If there is a new first global event, make sure it is handled */ 872 return data.firstexp; 873 } 874 875 static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, 876 unsigned long jif) 877 { 878 struct timer_events tevt; 879 struct tmigr_walk data; 880 struct tmigr_cpu *tmc; 881 882 tmc = per_cpu_ptr(&tmigr_cpu, cpu); 883 884 raw_spin_lock_irq(&tmc->lock); 885 886 /* 887 * If the remote CPU is offline then the timers have been migrated to 888 * another CPU. 889 * 890 * If tmigr_cpu::remote is set, at the moment another CPU already 891 * expires the timers of the remote CPU. 
892 * 893 * If tmigr_event::ignore is set, then the CPU returns from idle and 894 * takes care of its timers. 895 * 896 * If the next event expires in the future, then the event has been 897 * updated and there are no timers to expire right now. The CPU which 898 * updated the event takes care when hierarchy is completely 899 * idle. Otherwise the migrator does it as the event is enqueued. 900 */ 901 if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || 902 now < tmc->cpuevt.nextevt.expires) { 903 raw_spin_unlock_irq(&tmc->lock); 904 return; 905 } 906 907 tmc->remote = true; 908 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 909 910 /* Drop the lock to allow the remote CPU to exit idle */ 911 raw_spin_unlock_irq(&tmc->lock); 912 913 if (cpu != smp_processor_id()) 914 timer_expire_remote(cpu); 915 916 /* 917 * Lock ordering needs to be preserved - timer_base locks before tmigr 918 * related locks (see section "Locking rules" in the documentation at 919 * the top). During fetching the next timer interrupt, also tmc->lock 920 * needs to be held. Otherwise there is a possible race window against 921 * the CPU itself when it comes out of idle, updates the first timer in 922 * the hierarchy and goes back to idle. 923 * 924 * timer base locks are dropped as fast as possible: After checking 925 * whether the remote CPU went offline in the meantime and after 926 * fetching the next remote timer interrupt. Dropping the locks as fast 927 * as possible keeps the locking region small and prevents holding 928 * several (unnecessary) locks during walking the hierarchy for updating 929 * the timerqueue and group events. 930 */ 931 local_irq_disable(); 932 timer_lock_remote_bases(cpu); 933 raw_spin_lock(&tmc->lock); 934 935 /* 936 * When the CPU went offline in the meantime, no hierarchy walk has to 937 * be done for updating the queued events, because the walk was 938 * already done during marking the CPU offline in the hierarchy. 939 * 940 * When the CPU is no longer idle, the CPU takes care of the timers and 941 * also of the timers in the hierarchy. 942 * 943 * (See also section "Required event and timerqueue update after a 944 * remote expiry" in the documentation at the top) 945 */ 946 if (!tmc->online || !tmc->idle) { 947 timer_unlock_remote_bases(cpu); 948 goto unlock; 949 } 950 951 /* next event of CPU */ 952 fetch_next_timer_interrupt_remote(jif, now, &tevt, cpu); 953 timer_unlock_remote_bases(cpu); 954 955 data.nextexp = tevt.global; 956 data.firstexp = KTIME_MAX; 957 data.evt = &tmc->cpuevt; 958 data.remote = true; 959 960 /* 961 * The update is done even when there is no 'new' global timer pending 962 * on the remote CPU (see section "Required event and timerqueue update 963 * after a remote expiry" in the documentation at the top) 964 */ 965 walk_groups(&tmigr_new_timer_up, &data, tmc); 966 967 unlock: 968 tmc->remote = false; 969 raw_spin_unlock_irq(&tmc->lock); 970 } 971 972 static bool tmigr_handle_remote_up(struct tmigr_group *group, 973 struct tmigr_group *child, 974 void *ptr) 975 { 976 struct tmigr_remote_data *data = ptr; 977 struct tmigr_event *evt; 978 unsigned long jif; 979 u8 childmask; 980 u64 now; 981 982 jif = data->basej; 983 now = data->now; 984 985 childmask = data->childmask; 986 987 again: 988 /* 989 * Handle the group only if @childmask is the migrator or if the 990 * group has no migrator. Otherwise the group is active and is 991 * handled by its own migrator. 
992 */ 993 if (!tmigr_check_migrator(group, childmask)) 994 return true; 995 996 raw_spin_lock_irq(&group->lock); 997 998 evt = tmigr_next_expired_groupevt(group, now); 999 1000 if (evt) { 1001 unsigned int remote_cpu = evt->cpu; 1002 1003 raw_spin_unlock_irq(&group->lock); 1004 1005 tmigr_handle_remote_cpu(remote_cpu, now, jif); 1006 1007 /* check if there is another event, that needs to be handled */ 1008 goto again; 1009 } 1010 1011 /* 1012 * Update of childmask for the next level and keep track of the expiry 1013 * of the first event that needs to be handled (group->next_expiry was 1014 * updated by tmigr_next_expired_groupevt(), next was set by 1015 * tmigr_handle_remote_cpu()). 1016 */ 1017 data->childmask = group->childmask; 1018 data->firstexp = group->next_expiry; 1019 1020 raw_spin_unlock_irq(&group->lock); 1021 1022 return false; 1023 } 1024 1025 /** 1026 * tmigr_handle_remote() - Handle global timers of remote idle CPUs 1027 * 1028 * Called from the timer soft interrupt with interrupts enabled. 1029 */ 1030 void tmigr_handle_remote(void) 1031 { 1032 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1033 struct tmigr_remote_data data; 1034 1035 if (tmigr_is_not_available(tmc)) 1036 return; 1037 1038 data.childmask = tmc->childmask; 1039 data.firstexp = KTIME_MAX; 1040 1041 /* 1042 * NOTE: This is a doubled check because the migrator test will be done 1043 * in tmigr_handle_remote_up() anyway. Keep this check to speed up the 1044 * return when nothing has to be done. 1045 */ 1046 if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) 1047 return; 1048 1049 data.now = get_jiffies_update(&data.basej); 1050 1051 /* 1052 * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to 1053 * KTIME_MAX. Even if tmc->lock is not held during the whole remote 1054 * handling, tmc->wakeup is fine to be stale as it is called in 1055 * interrupt context and tick_nohz_next_event() is executed in interrupt 1056 * exit path only after processing the last pending interrupt. 1057 */ 1058 1059 __walk_groups(&tmigr_handle_remote_up, &data, tmc); 1060 1061 raw_spin_lock_irq(&tmc->lock); 1062 WRITE_ONCE(tmc->wakeup, data.firstexp); 1063 raw_spin_unlock_irq(&tmc->lock); 1064 } 1065 1066 static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, 1067 struct tmigr_group *child, 1068 void *ptr) 1069 { 1070 struct tmigr_remote_data *data = ptr; 1071 u8 childmask; 1072 1073 childmask = data->childmask; 1074 1075 /* 1076 * Handle the group only if the child is the migrator or if the group 1077 * has no migrator. Otherwise the group is active and is handled by its 1078 * own migrator. 1079 */ 1080 if (!tmigr_check_migrator(group, childmask)) 1081 return true; 1082 1083 /* 1084 * When there is a parent group and the CPU which triggered the 1085 * hierarchy walk is not active, proceed the walk to reach the top level 1086 * group before reading the next_expiry value. 1087 */ 1088 if (group->parent && !data->tmc_active) 1089 goto out; 1090 1091 /* 1092 * The lock is required on 32bit architectures to read the variable 1093 * consistently with a concurrent writer. On 64bit the lock is not 1094 * required because the read operation is not split and so it is always 1095 * consistent. 
1096 */ 1097 if (IS_ENABLED(CONFIG_64BIT)) { 1098 data->firstexp = READ_ONCE(group->next_expiry); 1099 if (data->now >= data->firstexp) { 1100 data->check = true; 1101 return true; 1102 } 1103 } else { 1104 raw_spin_lock(&group->lock); 1105 data->firstexp = group->next_expiry; 1106 if (data->now >= group->next_expiry) { 1107 data->check = true; 1108 raw_spin_unlock(&group->lock); 1109 return true; 1110 } 1111 raw_spin_unlock(&group->lock); 1112 } 1113 1114 out: 1115 /* Update of childmask for the next level */ 1116 data->childmask = group->childmask; 1117 return false; 1118 } 1119 1120 /** 1121 * tmigr_requires_handle_remote() - Check the need of remote timer handling 1122 * 1123 * Must be called with interrupts disabled. 1124 */ 1125 bool tmigr_requires_handle_remote(void) 1126 { 1127 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1128 struct tmigr_remote_data data; 1129 unsigned long jif; 1130 bool ret = false; 1131 1132 if (tmigr_is_not_available(tmc)) 1133 return ret; 1134 1135 data.now = get_jiffies_update(&jif); 1136 data.childmask = tmc->childmask; 1137 data.firstexp = KTIME_MAX; 1138 data.tmc_active = !tmc->idle; 1139 data.check = false; 1140 1141 /* 1142 * If the CPU is active, walk the hierarchy to check whether a remote 1143 * expiry is required. 1144 * 1145 * Check is done lockless as interrupts are disabled and @tmc->idle is 1146 * set only by the local CPU. 1147 */ 1148 if (!tmc->idle) { 1149 __walk_groups(&tmigr_requires_handle_remote_up, &data, tmc); 1150 1151 return data.check; 1152 } 1153 1154 /* 1155 * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock 1156 * is required on 32bit architectures to read the variable consistently 1157 * with a concurrent writer. On 64bit the lock is not required because 1158 * the read operation is not split and so it is always consistent. 1159 */ 1160 if (IS_ENABLED(CONFIG_64BIT)) { 1161 if (data.now >= READ_ONCE(tmc->wakeup)) 1162 return true; 1163 } else { 1164 raw_spin_lock(&tmc->lock); 1165 if (data.now >= tmc->wakeup) 1166 ret = true; 1167 raw_spin_unlock(&tmc->lock); 1168 } 1169 1170 return ret; 1171 } 1172 1173 /** 1174 * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc) 1175 * @nextexp: Next expiry of global timer (or KTIME_MAX if not) 1176 * 1177 * The CPU is already deactivated in the timer migration 1178 * hierarchy. tick_nohz_get_sleep_length() calls tick_nohz_next_event() 1179 * and thereby the timer idle path is executed once more. @tmc->wakeup 1180 * holds the first timer, when the timer migration hierarchy is 1181 * completely idle. 1182 * 1183 * Returns the first timer that needs to be handled by this CPU or KTIME_MAX if 1184 * nothing needs to be done. 1185 */ 1186 u64 tmigr_cpu_new_timer(u64 nextexp) 1187 { 1188 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1189 u64 ret; 1190 1191 if (tmigr_is_not_available(tmc)) 1192 return nextexp; 1193 1194 raw_spin_lock(&tmc->lock); 1195 1196 ret = READ_ONCE(tmc->wakeup); 1197 if (nextexp != KTIME_MAX) { 1198 if (nextexp != tmc->cpuevt.nextevt.expires || 1199 tmc->cpuevt.ignore) { 1200 ret = tmigr_new_timer(tmc, nextexp); 1201 } 1202 } 1203 /* 1204 * Make sure the reevaluation of timers in idle path will not miss an 1205 * event. 
1206 */ 1207 WRITE_ONCE(tmc->wakeup, ret); 1208 1209 raw_spin_unlock(&tmc->lock); 1210 return ret; 1211 } 1212 1213 static bool tmigr_inactive_up(struct tmigr_group *group, 1214 struct tmigr_group *child, 1215 void *ptr) 1216 { 1217 union tmigr_state curstate, newstate, childstate; 1218 struct tmigr_walk *data = ptr; 1219 bool walk_done; 1220 u8 childmask; 1221 1222 childmask = data->childmask; 1223 childstate.state = 0; 1224 1225 /* 1226 * The memory barrier is paired with the cmpxchg() in tmigr_active_up() 1227 * to make sure the updates of child and group states are ordered. The 1228 * ordering is mandatory, as the group state change depends on the child 1229 * state. 1230 */ 1231 curstate.state = atomic_read_acquire(&group->migr_state); 1232 1233 for (;;) { 1234 if (child) 1235 childstate.state = atomic_read(&child->migr_state); 1236 1237 newstate = curstate; 1238 walk_done = true; 1239 1240 /* Reset active bit when the child is no longer active */ 1241 if (!childstate.active) 1242 newstate.active &= ~childmask; 1243 1244 if (newstate.migrator == childmask) { 1245 /* 1246 * Find a new migrator for the group, because the child 1247 * group is idle! 1248 */ 1249 if (!childstate.active) { 1250 unsigned long new_migr_bit, active = newstate.active; 1251 1252 new_migr_bit = find_first_bit(&active, BIT_CNT); 1253 1254 if (new_migr_bit != BIT_CNT) { 1255 newstate.migrator = BIT(new_migr_bit); 1256 } else { 1257 newstate.migrator = TMIGR_NONE; 1258 1259 /* Changes need to be propagated */ 1260 walk_done = false; 1261 } 1262 } 1263 } 1264 1265 newstate.seq++; 1266 1267 WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); 1268 1269 if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, 1270 newstate.state)) 1271 break; 1272 1273 /* 1274 * The memory barrier is paired with the cmpxchg() in 1275 * tmigr_active_up() to make sure the updates of child and group 1276 * states are ordered. It is required only when the above 1277 * try_cmpxchg() fails. 1278 */ 1279 smp_mb__after_atomic(); 1280 } 1281 1282 data->remote = false; 1283 1284 /* Event Handling */ 1285 tmigr_update_events(group, child, data); 1286 1287 if (group->parent && (walk_done == false)) 1288 data->childmask = group->childmask; 1289 1290 /* 1291 * data->firstexp was set by tmigr_update_events() and contains the 1292 * expiry of the first global event which needs to be handled. It 1293 * differs from KTIME_MAX if: 1294 * - group is the top level group and 1295 * - group is idle (which means CPU was the last active CPU in the 1296 * hierarchy) and 1297 * - there is a pending event in the hierarchy 1298 */ 1299 WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent); 1300 1301 return walk_done; 1302 } 1303 1304 static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) 1305 { 1306 struct tmigr_walk data = { .nextexp = nextexp, 1307 .firstexp = KTIME_MAX, 1308 .evt = &tmc->cpuevt, 1309 .childmask = tmc->childmask }; 1310 1311 /* 1312 * If nextexp is KTIME_MAX, the CPU event will be ignored because the 1313 * local timer expires before the global timer, no global timer is set 1314 * or CPU goes offline. 1315 */ 1316 if (nextexp != KTIME_MAX) 1317 tmc->cpuevt.ignore = false; 1318 1319 walk_groups(&tmigr_inactive_up, &data, tmc); 1320 return data.firstexp; 1321 } 1322 1323 /** 1324 * tmigr_cpu_deactivate() - Put current CPU into inactive state 1325 * @nextexp: The next global timer expiry of the current CPU 1326 * 1327 * Must be called with interrupts disabled. 
1328 * 1329 * Return: the next event expiry of the current CPU or the next event expiry 1330 * from the hierarchy if this CPU is the top level migrator or the hierarchy is 1331 * completely idle. 1332 */ 1333 u64 tmigr_cpu_deactivate(u64 nextexp) 1334 { 1335 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1336 u64 ret; 1337 1338 if (tmigr_is_not_available(tmc)) 1339 return nextexp; 1340 1341 raw_spin_lock(&tmc->lock); 1342 1343 ret = __tmigr_cpu_deactivate(tmc, nextexp); 1344 1345 tmc->idle = true; 1346 1347 /* 1348 * Make sure the reevaluation of timers in idle path will not miss an 1349 * event. 1350 */ 1351 WRITE_ONCE(tmc->wakeup, ret); 1352 1353 raw_spin_unlock(&tmc->lock); 1354 return ret; 1355 } 1356 1357 /** 1358 * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to 1359 * go idle 1360 * @nextevt: The next global timer expiry of the current CPU 1361 * 1362 * Return: 1363 * * KTIME_MAX - when it is probable that nothing has to be done (not 1364 * the only one in the level 0 group; and if it is the 1365 * only one in level 0 group, but there are more than a 1366 * single group active on the way to top level) 1367 * * nextevt - when CPU is offline and has to handle timer on his own 1368 * or when on the way to top in every group only a single 1369 * child is active and but @nextevt is before next_expiry 1370 * of top level group 1371 * * next_expiry (top) - value of top level group, when on the way to top in 1372 * every group only a single child is active and @nextevt 1373 * is after this value active child. 1374 */ 1375 u64 tmigr_quick_check(u64 nextevt) 1376 { 1377 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1378 struct tmigr_group *group = tmc->tmgroup; 1379 1380 if (tmigr_is_not_available(tmc)) 1381 return nextevt; 1382 1383 if (WARN_ON_ONCE(tmc->idle)) 1384 return nextevt; 1385 1386 if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) 1387 return KTIME_MAX; 1388 1389 do { 1390 if (!tmigr_check_lonely(group)) { 1391 return KTIME_MAX; 1392 } else if (!group->parent) { 1393 u64 first_global = READ_ONCE(group->next_expiry); 1394 1395 return min_t(u64, nextevt, first_global); 1396 } 1397 group = group->parent; 1398 } while (group); 1399 1400 return KTIME_MAX; 1401 } 1402 1403 static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, 1404 int node) 1405 { 1406 union tmigr_state s; 1407 1408 raw_spin_lock_init(&group->lock); 1409 1410 group->level = lvl; 1411 group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE; 1412 1413 group->num_children = 0; 1414 1415 s.migrator = TMIGR_NONE; 1416 s.active = 0; 1417 s.seq = 0; 1418 atomic_set(&group->migr_state, s.state); 1419 1420 timerqueue_init_head(&group->events); 1421 timerqueue_init(&group->groupevt.nextevt); 1422 group->groupevt.nextevt.expires = KTIME_MAX; 1423 WRITE_ONCE(group->next_expiry, KTIME_MAX); 1424 group->groupevt.ignore = true; 1425 } 1426 1427 static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, 1428 unsigned int lvl) 1429 { 1430 struct tmigr_group *tmp, *group = NULL; 1431 1432 lockdep_assert_held(&tmigr_mutex); 1433 1434 /* Try to attach to an existing group first */ 1435 list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { 1436 /* 1437 * If @lvl is below the cross NUMA node level, check whether 1438 * this group belongs to the same NUMA node. 1439 */ 1440 if (lvl < tmigr_crossnode_level && tmp->numa_node != node) 1441 continue; 1442 1443 /* Capacity left? 
*/ 1444 if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP) 1445 continue; 1446 1447 /* 1448 * TODO: A possible further improvement: Make sure that all CPU 1449 * siblings end up in the same group of the lowest level of the 1450 * hierarchy. Rely on the topology sibling mask would be a 1451 * reasonable solution. 1452 */ 1453 1454 group = tmp; 1455 break; 1456 } 1457 1458 if (group) 1459 return group; 1460 1461 /* Allocate and set up a new group */ 1462 group = kzalloc_node(sizeof(*group), GFP_KERNEL, node); 1463 if (!group) 1464 return ERR_PTR(-ENOMEM); 1465 1466 tmigr_init_group(group, lvl, node); 1467 1468 /* Setup successful. Add it to the hierarchy */ 1469 list_add(&group->list, &tmigr_level_list[lvl]); 1470 return group; 1471 } 1472 1473 static void tmigr_connect_child_parent(struct tmigr_group *child, 1474 struct tmigr_group *parent) 1475 { 1476 union tmigr_state childstate; 1477 1478 raw_spin_lock_irq(&child->lock); 1479 raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); 1480 1481 child->parent = parent; 1482 child->childmask = BIT(parent->num_children++); 1483 1484 raw_spin_unlock(&parent->lock); 1485 raw_spin_unlock_irq(&child->lock); 1486 1487 /* 1488 * To prevent inconsistent states, active children need to be active in 1489 * the new parent as well. Inactive children are already marked inactive 1490 * in the parent group: 1491 * 1492 * * When new groups were created by tmigr_setup_groups() starting from 1493 * the lowest level (and not higher then one level below the current 1494 * top level), then they are not active. They will be set active when 1495 * the new online CPU comes active. 1496 * 1497 * * But if a new group above the current top level is required, it is 1498 * mandatory to propagate the active state of the already existing 1499 * child to the new parent. So tmigr_connect_child_parent() is 1500 * executed with the formerly top level group (child) and the newly 1501 * created group (parent). 1502 */ 1503 childstate.state = atomic_read(&child->migr_state); 1504 if (childstate.migrator != TMIGR_NONE) { 1505 struct tmigr_walk data; 1506 1507 data.childmask = child->childmask; 1508 1509 /* 1510 * There is only one new level per time. When connecting the 1511 * child and the parent and set the child active when the parent 1512 * is inactive, the parent needs to be the uppermost 1513 * level. Otherwise there went something wrong! 1514 */ 1515 WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); 1516 } 1517 } 1518 1519 static int tmigr_setup_groups(unsigned int cpu, unsigned int node) 1520 { 1521 struct tmigr_group *group, *child, **stack; 1522 int top = 0, err = 0, i = 0; 1523 struct list_head *lvllist; 1524 1525 stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); 1526 if (!stack) 1527 return -ENOMEM; 1528 1529 do { 1530 group = tmigr_get_group(cpu, node, i); 1531 if (IS_ERR(group)) { 1532 err = PTR_ERR(group); 1533 break; 1534 } 1535 1536 top = i; 1537 stack[i++] = group; 1538 1539 /* 1540 * When booting only less CPUs of a system than CPUs are 1541 * available, not all calculated hierarchy levels are required. 1542 * 1543 * The loop is aborted as soon as the highest level, which might 1544 * be different from tmigr_hierarchy_levels, contains only a 1545 * single group. 
1546 */ 1547 if (group->parent || i == tmigr_hierarchy_levels || 1548 (list_empty(&tmigr_level_list[i]) && 1549 list_is_singular(&tmigr_level_list[i - 1]))) 1550 break; 1551 1552 } while (i < tmigr_hierarchy_levels); 1553 1554 do { 1555 group = stack[--i]; 1556 1557 if (err < 0) { 1558 list_del(&group->list); 1559 kfree(group); 1560 continue; 1561 } 1562 1563 WARN_ON_ONCE(i != group->level); 1564 1565 /* 1566 * Update tmc -> group / child -> group connection 1567 */ 1568 if (i == 0) { 1569 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1570 1571 raw_spin_lock_irq(&group->lock); 1572 1573 tmc->tmgroup = group; 1574 tmc->childmask = BIT(group->num_children++); 1575 1576 raw_spin_unlock_irq(&group->lock); 1577 1578 /* There are no children that need to be connected */ 1579 continue; 1580 } else { 1581 child = stack[i - 1]; 1582 tmigr_connect_child_parent(child, group); 1583 } 1584 1585 /* check if uppermost level was newly created */ 1586 if (top != i) 1587 continue; 1588 1589 WARN_ON_ONCE(top == 0); 1590 1591 lvllist = &tmigr_level_list[top]; 1592 if (group->num_children == 1 && list_is_singular(lvllist)) { 1593 lvllist = &tmigr_level_list[top - 1]; 1594 list_for_each_entry(child, lvllist, list) { 1595 if (child->parent) 1596 continue; 1597 1598 tmigr_connect_child_parent(child, group); 1599 } 1600 } 1601 } while (i > 0); 1602 1603 kfree(stack); 1604 1605 return err; 1606 } 1607 1608 static int tmigr_add_cpu(unsigned int cpu) 1609 { 1610 int node = cpu_to_node(cpu); 1611 int ret; 1612 1613 mutex_lock(&tmigr_mutex); 1614 ret = tmigr_setup_groups(cpu, node); 1615 mutex_unlock(&tmigr_mutex); 1616 1617 return ret; 1618 } 1619 1620 static int tmigr_cpu_online(unsigned int cpu) 1621 { 1622 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1623 int ret; 1624 1625 /* First online attempt? Initialize CPU data */ 1626 if (!tmc->tmgroup) { 1627 raw_spin_lock_init(&tmc->lock); 1628 1629 ret = tmigr_add_cpu(cpu); 1630 if (ret < 0) 1631 return ret; 1632 1633 if (tmc->childmask == 0) 1634 return -EINVAL; 1635 1636 timerqueue_init(&tmc->cpuevt.nextevt); 1637 tmc->cpuevt.nextevt.expires = KTIME_MAX; 1638 tmc->cpuevt.ignore = true; 1639 tmc->cpuevt.cpu = cpu; 1640 1641 tmc->remote = false; 1642 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 1643 } 1644 raw_spin_lock_irq(&tmc->lock); 1645 tmc->idle = timer_base_is_idle(); 1646 if (!tmc->idle) 1647 __tmigr_cpu_activate(tmc); 1648 tmc->online = true; 1649 raw_spin_unlock_irq(&tmc->lock); 1650 return 0; 1651 } 1652 1653 /* 1654 * tmigr_trigger_active() - trigger a CPU to become active again 1655 * 1656 * This function is executed on a CPU which is part of cpu_online_mask, when the 1657 * last active CPU in the hierarchy is offlining. With this, it is ensured that 1658 * the other CPU is active and takes over the migrator duty. 
1659 */ 1660 static long tmigr_trigger_active(void *unused) 1661 { 1662 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1663 1664 WARN_ON_ONCE(!tmc->online || tmc->idle); 1665 1666 return 0; 1667 } 1668 1669 static int tmigr_cpu_offline(unsigned int cpu) 1670 { 1671 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1672 int migrator; 1673 u64 firstexp; 1674 1675 raw_spin_lock_irq(&tmc->lock); 1676 tmc->online = false; 1677 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 1678 1679 /* 1680 * CPU has to handle the local events on his own, when on the way to 1681 * offline; Therefore nextevt value is set to KTIME_MAX 1682 */ 1683 firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); 1684 raw_spin_unlock_irq(&tmc->lock); 1685 1686 if (firstexp != KTIME_MAX) { 1687 migrator = cpumask_any_but(cpu_online_mask, cpu); 1688 work_on_cpu(migrator, tmigr_trigger_active, NULL); 1689 } 1690 1691 return 0; 1692 } 1693 1694 static int __init tmigr_init(void) 1695 { 1696 unsigned int cpulvl, nodelvl, cpus_per_node, i; 1697 unsigned int nnodes = num_possible_nodes(); 1698 unsigned int ncpus = num_possible_cpus(); 1699 int ret = -ENOMEM; 1700 1701 BUILD_BUG_ON_NOT_POWER_OF_2(TMIGR_CHILDREN_PER_GROUP); 1702 1703 /* Nothing to do if running on UP */ 1704 if (ncpus == 1) 1705 return 0; 1706 1707 /* 1708 * Calculate the required hierarchy levels. Unfortunately there is no 1709 * reliable information available, unless all possible CPUs have been 1710 * brought up and all NUMA nodes are populated. 1711 * 1712 * Estimate the number of levels with the number of possible nodes and 1713 * the number of possible CPUs. Assume CPUs are spread evenly across 1714 * nodes. We cannot rely on cpumask_of_node() because it only works for 1715 * online CPUs. 1716 */ 1717 cpus_per_node = DIV_ROUND_UP(ncpus, nnodes); 1718 1719 /* Calc the hierarchy levels required to hold the CPUs of a node */ 1720 cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node), 1721 ilog2(TMIGR_CHILDREN_PER_GROUP)); 1722 1723 /* Calculate the extra levels to connect all nodes */ 1724 nodelvl = DIV_ROUND_UP(order_base_2(nnodes), 1725 ilog2(TMIGR_CHILDREN_PER_GROUP)); 1726 1727 tmigr_hierarchy_levels = cpulvl + nodelvl; 1728 1729 /* 1730 * If a NUMA node spawns more than one CPU level group then the next 1731 * level(s) of the hierarchy contains groups which handle all CPU groups 1732 * of the same NUMA node. The level above goes across NUMA nodes. Store 1733 * this information for the setup code to decide in which level node 1734 * matching is no longer required. 1735 */ 1736 tmigr_crossnode_level = cpulvl; 1737 1738 tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL); 1739 if (!tmigr_level_list) 1740 goto err; 1741 1742 for (i = 0; i < tmigr_hierarchy_levels; i++) 1743 INIT_LIST_HEAD(&tmigr_level_list[i]); 1744 1745 pr_info("Timer migration: %d hierarchy levels; %d children per group;" 1746 " %d crossnode level\n", 1747 tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, 1748 tmigr_crossnode_level); 1749 1750 ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", 1751 tmigr_cpu_online, tmigr_cpu_offline); 1752 if (ret) 1753 goto err; 1754 1755 return 0; 1756 1757 err: 1758 pr_err("Timer migration setup failed\n"); 1759 return ret; 1760 } 1761 late_initcall(tmigr_init); 1762