// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 * Derived from the taskqueue/keventd code by:
 *   David Woodhouse <[email protected]>
 *   Andrew Morton
 *   Kai Petzke <[email protected]>
 *   Theodore Ts'o <[email protected]>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <[email protected]>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>

#include "workqueue_internal.h"

enum {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * wq_pool_attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 */
	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */

	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,

	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT	= HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after a failure */

	/*
	 * Rescue workers are used only in emergencies and are shared by all
	 * CPUs.  Give them MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 24,
};

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * K: Only modified by worker while holding pool->lock.  Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
 */

/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
	raw_spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* L: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
	bool			cpu_stall;	/* WD: stalled cpu bound pool */

	/*
	 * The counter is incremented in a process context on the associated CPU
	 * w/ preemption disabled, and decremented or reset in the same context
	 * but w/ pool->lock held. The readers grab pool->lock and are
	 * guaranteed to see if the counter reached zero.
	 */
	int			nr_running;

	struct list_head	worklist;	/* L: list of pending works */

	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle workers */

	struct list_head	idle_list;	/* L: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct work_struct	idle_cull_work;	/* L: worker idle cleanup */

	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or is the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	struct worker		*manager;	/* L: purely informational */
	struct list_head	workers;	/* A: attached workers */
	struct list_head	dying_workers;	/* A: workers about to die */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * Destruction of pool is RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */

	PWQ_NR_STATS,
};

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */

	/*
	 * nr_active management and WORK_STRUCT_INACTIVE:
	 *
	 * When pwq->nr_active >= max_active, new work item is queued to
	 * pwq->inactive_works instead of pool->worklist and marked with
	 * WORK_STRUCT_INACTIVE.
	 *
	 * All work items marked with WORK_STRUCT_INACTIVE do not participate
	 * in pwq->nr_active and all work items in pwq->inactive_works are
	 * marked with WORK_STRUCT_INACTIVE.  But not all WORK_STRUCT_INACTIVE
	 * work items are in pwq->inactive_works.  Some of them are ready to
	 * run in pool->worklist or worker->scheduled.  Those work items are
	 * only struct wq_barrier which is used for flush_work() and should
	 * not participate in pwq->nr_active.  A non-barrier work item is
	 * marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
	 */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	inactive_works;	/* L: inactive works */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	u64			stats[PWQ_NR_STATS];

	/*
	 * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
	 * and pwq_release_workfn() for details. pool_workqueue itself is also
	 * RCU protected so that the first pwq can be determined without
	 * grabbing wq->mutex.
	 */
	struct kthread_work	release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

struct wq_device;

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* MD: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	char			*lock_name;
	struct lock_class_key	key;
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is RCU protected to allow walking
	 * the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
};

static struct kmem_cache *pwq_cache;

/*
 * Each pod type describes how CPUs should be grouped for unbound workqueues.
 * See the comment above workqueue_attrs->affn_scope.
 */
struct wq_pod_type {
	int			nr_pods;	/* number of pods */
	cpumask_var_t		*pod_cpus;	/* pod -> cpus */
	int			*pod_node;	/* pod -> node */
	int			*cpu_pod;	/* cpu -> pod */
};

static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_DFL;

static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
	[WQ_AFFN_CPU]			= "cpu",
	[WQ_AFFN_SMT]			= "smt",
	[WQ_AFFN_CACHE]			= "cache",
	[WQ_AFFN_NUMA]			= "numa",
	[WQ_AFFN_SYSTEM]		= "system",
};

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
 * The actual value is initialized in wq_cpu_intensive_thresh_init().
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;			/* can kworkers be created yet? */

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_pod_attrs_buf;
static cpumask_var_t wq_update_pod_cpumask_buf;

static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing? */

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* to further constrain wq_unbound_cpumask by the cmdline parameter */
static struct cpumask wq_cmdline_cpumask __initdata;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
 * process context while holding a pool lock. Bounce to a dedicated kthread
 * worker to avoid A-A deadlocks.
 */
static struct kthread_worker *pwq_release_worker;

struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);

static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU or wq_pool_mutex should be held")

#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			 !lockdep_is_held(&wq->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU, wq->mutex or wq_pool_mutex should be held")

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with wq_pool_attach_mutex.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
		else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,		\
				 lockdep_is_held(&(wq->mutex)))

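/*
 * Illustrative sketch (not part of the original file): walking a workqueue's
 * pwqs with the macro above is typically done under RCU when wq->mutex is not
 * held, e.g.
 *
 *	rcu_read_lock();
 *	for_each_pwq(pwq, wq)
 *		pr_info("%s: pwq with %d active work items\n",
 *			wq->name, pwq->nr_active);
 *	rcu_read_unlock();
 *
 * The pr_info() body is made up for illustration; the state-dumping helpers
 * declared earlier (show_pwq() and friends) are expected to follow the same
 * locking pattern.
 */
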
#ifdef CONFIG_DEBUG_OBJECTS_WORK

static const struct debug_obj_descr work_debug_descr;

static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

static bool work_is_static_object(void *addr)
{
	struct work_struct *work = addr;

	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

static const struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.is_static_object = work_is_static_object,
	.fixup_init	= work_fixup_init,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

void destroy_delayed_work_on_stack(struct delayed_work *work)
{
	destroy_timer_on_stack(&work->timer);
	debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
	int ret;

	lockdep_assert_held(&wq_pool_mutex);

	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
			GFP_KERNEL);
	if (ret >= 0) {
		pool->id = ret;
		return 0;
	}
	return ret;
}

static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(unsigned long work_data)
{
	return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
 * is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 * and clear_work_data() can be used to set the pwq, pool or clear
 * work->data.  These functions should only be called while the work is
 * owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 *
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
 * but stay off timer and worklist for arbitrarily long and nobody should
 * try to steal the PENDING bit.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data,
				 unsigned long flags)
{
	WARN_ON_ONCE(!work_pending(work));
	atomic_long_set(&work->data, data | flags | work_static(work));
}

static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
			 unsigned long extra_flags)
{
	set_work_data(work, (unsigned long)pwq,
		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}

static void set_work_pool_and_keep_pending(struct work_struct *work,
					   int pool_id)
{
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
		      WORK_STRUCT_PENDING);
}

static void set_work_pool_and_clear_pending(struct work_struct *work,
					    int pool_id)
{
	/*
	 * The following wmb is paired with the implied mb in
	 * test_and_set_bit(PENDING) and ensures all updates to @work made
	 * here are visible to and precede any updates by the next PENDING
	 * owner.
	 */
	smp_wmb();
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
	/*
	 * The following mb guarantees that previous clear of a PENDING bit
	 * will not be reordered with any speculative LOADS or STORES from
	 * work->current_func, which is executed afterwards.  This possible
	 * reordering can lead to a missed execution on attempt to queue
	 * the same @work.  E.g. consider this case:
	 *
	 *   CPU#0                         CPU#1
	 *   ----------------------------  --------------------------------
	 *
	 * 1  STORE event_indicated
	 * 2  queue_work_on() {
	 * 3    test_and_set_bit(PENDING)
	 * 4 }                             set_..._and_clear_pending() {
	 * 5                                 set_work_data() # clear bit
	 * 6                                 smp_mb()
	 * 7                               work->current_func() {
	 * 8                                 LOAD event_indicated
	 *                                 }
	 *
	 * Without an explicit full barrier speculative LOAD on line 8 can
	 * be executed before CPU#0 does STORE on line 1.  If that happens,
	 * CPU#0 observes the PENDING bit is still set and new execution of
	 * a @work is not queued in the hope that CPU#1 will eventually
	 * finish the queued @work.  Meanwhile CPU#1 does not see
	 * event_indicated is set, because speculative LOAD was executed
	 * before actual STORE.
	 */
	smp_mb();
}

static void clear_work_data(struct work_struct *work)
{
	smp_wmb();	/* see set_work_pool_and_clear_pending() */
	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}

static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
	return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
}

static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return work_struct_pwq(data);
	else
		return NULL;
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and read access is
 * allowed under the RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);
	int pool_id;

	assert_rcu_or_pool_mutex();

	if (data & WORK_STRUCT_PWQ)
		return work_struct_pwq(data)->pool;

	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
	if (pool_id == WORK_OFFQ_POOL_NONE)
		return NULL;

	return idr_find(&worker_pool_idr, pool_id);
}

/**
 * get_work_pool_id - return the worker pool ID a given work is associated with
 * @work: the work item of interest
 *
 * Return: The worker_pool ID @work was last associated with.
 * %WORK_OFFQ_POOL_NONE if none.
 */
static int get_work_pool_id(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return work_struct_pwq(data)->pool->id;

	return data >> WORK_OFFQ_POOL_SHIFT;
}

static void mark_work_canceling(struct work_struct *work)
{
	unsigned long pool_id = get_work_pool_id(work);

	pool_id <<= WORK_OFFQ_POOL_SHIFT;
	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}

static bool work_is_canceling(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}

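/*
 * Hedged illustration (not part of the original file) of how the helpers
 * above interpret the work->data word:
 *
 *	unsigned long data = atomic_long_read(&work->data);
 *
 *	if (data & WORK_STRUCT_PWQ)
 *		pwq = work_struct_pwq(data);		- currently queued
 *	else
 *		pool_id = data >> WORK_OFFQ_POOL_SHIFT;	- last pool ID, or
 *							  WORK_OFFQ_POOL_NONE
 *
 * This merely restates get_work_pwq() and get_work_pool_id(); callers should
 * use those helpers rather than open-coding the decoding.
 */
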
/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

static bool __need_more_worker(struct worker_pool *pool)
{
	return !pool->nr_running;
}

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) && __need_more_worker(pool);
}

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
	return pool->nr_idle;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
	return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = pool->flags & POOL_MANAGER_ACTIVE;
	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

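/*
 * Worked example (illustrative, not from the original file): with
 * MAX_IDLE_WORKERS_RATIO == 4, a pool with nr_idle == 3 (manager included)
 * and nr_busy == 2 gives (3 - 2) * 4 == 4 >= 2, so too_many_workers()
 * returns true and idle reaping becomes relevant; with the same three idle
 * workers but nr_busy == 5, 4 >= 5 is false and all of them are kept.
 */
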
/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;

	lockdep_assert_held(&pool->lock);

	/* If transitioning into NOT_RUNNING, adjust nr_running. */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		pool->nr_running--;
	}

	worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;
	unsigned int oflags = worker->flags;

	lockdep_assert_held(&pool->lock);

	worker->flags &= ~flags;

	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
	 * of multiple flags, not a single flag.
	 */
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
			pool->nr_running++;
}

/* Return the first idle worker.  Called with pool->lock held. */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
	if (unlikely(list_empty(&pool->idle_list)))
		return NULL;

	return list_first_entry(&pool->idle_list, struct worker, entry);
}

/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * @worker is entering idle state.  Update stats and idle timer if
 * necessary.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_enter_idle(struct worker *worker)
{
	struct worker_pool *pool = worker->pool;

	if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
	    WARN_ON_ONCE(!list_empty(&worker->entry) &&
			 (worker->hentry.next || worker->hentry.pprev)))
		return;

	/* can't use worker_set_flags(), also called from create_worker() */
	worker->flags |= WORKER_IDLE;
	pool->nr_idle++;
	worker->last_active = jiffies;

	/* idle_list is LIFO */
	list_add(&worker->entry, &pool->idle_list);

	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);

	/* Sanity check nr_running. */
	WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}

/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * @worker is leaving idle state.  Update stats.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_leave_idle(struct worker *worker)
{
	struct worker_pool *pool = worker->pool;

	if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
		return;
	worker_clr_flags(worker, WORKER_IDLE);
	pool->nr_idle--;
	list_del_init(&worker->entry);
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address and work function to avoid
 * false positives.  Note that this isn't complete as one may construct a
 * work function which can introduce dependency onto itself through a
 * recycled work item.  Well, if somebody wants to shoot oneself in the
 * foot that badly, there's only so much we can do, and if such deadlock
 * actually occurs, it should be easy to locate the culprit work function.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 *
 * Return:
 * Pointer to worker which is executing @work if found, %NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct worker_pool *pool,
						 struct work_struct *work)
{
	struct worker *worker;

	hash_for_each_possible(pool->busy_hash, worker, hentry,
			       (unsigned long)work)
		if (worker->current_work == work &&
		    worker->current_func == work->func)
			return worker;

	return NULL;
}

/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head.  Work series to
 * be scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor.
 *
 * If @nextp is not NULL, it's updated to point to the next work of
 * the last scheduled work.  This allows move_linked_works() to be
 * nested inside outer list_for_each_entry_safe().
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
			      struct work_struct **nextp)
{
	struct work_struct *n;

	/*
	 * Linked worklist will always end before the end of the list,
	 * use NULL for list head.
	 */
	list_for_each_entry_safe_from(work, n, NULL, entry) {
		list_move_tail(&work->entry, head);
		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
			break;
	}

	/*
	 * If we're already inside safe list traversal and have moved
	 * multiple works to the scheduled queue, the next position
	 * needs to be updated.
	 */
	if (nextp)
		*nextp = n;
}

/**
 * wake_up_worker - wake up an idle worker
 * @pool: worker pool to wake worker from
 *
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_idle_worker(pool);

	if (likely(worker))
		wake_up_process(worker->task);
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and reports them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
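/*
 * Illustrative note (not from the original file): because of the
 * module_param_named() above, the threshold can be overridden at boot with
 * e.g. "workqueue.cpu_intensive_thresh_us=20000" (20ms), and the reporting
 * below is compiled in only when CONFIG_WQ_CPU_INTENSIVE_REPORT is enabled;
 * it fires on the 4th, 8th, 16th, ... violation of a given work function.
 */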
#define WCI_MAX_ENTS 128

struct wci_ent {
	work_func_t		func;
	atomic64_t		cnt;
	struct hlist_node	hash_node;
};

static struct wci_ent wci_ents[WCI_MAX_ENTS];
static int wci_nr_ents;
static DEFINE_RAW_SPINLOCK(wci_lock);
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

static struct wci_ent *wci_find_ent(work_func_t func)
{
	struct wci_ent *ent;

	hash_for_each_possible_rcu(wci_hash, ent, hash_node,
				   (unsigned long)func) {
		if (ent->func == func)
			return ent;
	}
	return NULL;
}

static void wq_cpu_intensive_report(work_func_t func)
{
	struct wci_ent *ent;

restart:
	ent = wci_find_ent(func);
	if (ent) {
		u64 cnt;

		/*
		 * Start reporting from the fourth time and back off
		 * exponentially.
		 */
		cnt = atomic64_inc_return_relaxed(&ent->cnt);
		if (cnt >= 4 && is_power_of_2(cnt))
			printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
					ent->func, wq_cpu_intensive_thresh_us,
					atomic64_read(&ent->cnt));
		return;
	}

	/*
	 * @func is a new violation. Allocate a new entry for it. If wci_ents[]
	 * is exhausted, something went really wrong and we probably made enough
	 * noise already.
	 */
	if (wci_nr_ents >= WCI_MAX_ENTS)
		return;

	raw_spin_lock(&wci_lock);

	if (wci_nr_ents >= WCI_MAX_ENTS) {
		raw_spin_unlock(&wci_lock);
		return;
	}

	if (wci_find_ent(func)) {
		raw_spin_unlock(&wci_lock);
		goto restart;
	}

	ent = &wci_ents[wci_nr_ents++];
	ent->func = func;
	atomic64_set(&ent->cnt, 1);
	hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

	raw_spin_unlock(&wci_lock);
}

#else	/* CONFIG_WQ_CPU_INTENSIVE_REPORT */
static void wq_cpu_intensive_report(work_func_t func) {}
#endif	/* CONFIG_WQ_CPU_INTENSIVE_REPORT */

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
 *
 * This function is called when a worker returns from schedule()
 */
void wq_worker_running(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

	if (!READ_ONCE(worker->sleeping))
		return;

	/*
	 * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
	 * and the nr_running increment below, we may ruin the nr_running reset
	 * and leave with an unexpected pool->nr_running == 1 on the newly unbound
	 * pool. Protect against such race.
	 */
	preempt_disable();
	if (!(worker->flags & WORKER_NOT_RUNNING))
		worker->pool->nr_running++;
	preempt_enable();

	/*
	 * CPU intensive auto-detection cares about how long a work item hogged
	 * CPU without sleeping. Reset the starting timestamp on wakeup.
	 */
	worker->current_at = worker->task->se.sum_exec_runtime;

	WRITE_ONCE(worker->sleeping, 0);
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called from schedule() when a busy worker is
 * going to sleep.
 */
void wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);
	struct worker_pool *pool;

	/*
	 * Rescuers, which may not have all the fields set up like normal
	 * workers, also reach here, let's not access anything before
	 * checking NOT_RUNNING.
	 */
	if (worker->flags & WORKER_NOT_RUNNING)
		return;

	pool = worker->pool;

	/* Return if preempted before wq_worker_running() was reached */
	if (READ_ONCE(worker->sleeping))
		return;

	WRITE_ONCE(worker->sleeping, 1);
	raw_spin_lock_irq(&pool->lock);

	/*
	 * Recheck in case unbind_workers() preempted us. We don't
	 * want to decrement nr_running after the worker is unbound
	 * and nr_running has been reset.
	 */
	if (worker->flags & WORKER_NOT_RUNNING) {
		raw_spin_unlock_irq(&pool->lock);
		return;
	}

	pool->nr_running--;
	if (need_more_worker(pool)) {
		worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
		wake_up_worker(pool);
	}
	raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from scheduler_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 */
void wq_worker_tick(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);
	struct pool_workqueue *pwq = worker->current_pwq;
	struct worker_pool *pool = worker->pool;

	if (!pwq)
		return;

	pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

	if (!wq_cpu_intensive_thresh_us)
		return;

	/*
	 * If the current worker is concurrency managed and hogged the CPU for
	 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
	 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
	 *
	 * @worker->sleeping being set means that @worker is in the process of
	 * switching out voluntarily and won't be contributing to
	 * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
	 * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
	 * double decrements. The task is releasing the CPU anyway. Let's skip.
	 * We probably want to make this prettier in the future.
	 */
	if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
	    worker->task->se.sum_exec_runtime - worker->current_at <
	    wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
		return;

	raw_spin_lock(&pool->lock);

	worker_set_flags(worker, WORKER_CPU_INTENSIVE);
	wq_cpu_intensive_report(worker->current_func);
	pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

	if (need_more_worker(pool)) {
		pwq->stats[PWQ_STAT_CM_WAKEUP]++;
		wake_up_worker(pool);
	}

	raw_spin_unlock(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
 *
 * Determine the last function a worker executed.  This is called from
 * the scheduler to get a worker's last known identity.
 *
 * CONTEXT:
 * raw_spin_lock_irq(rq->lock)
 *
 * This function is called during schedule() when a kworker is going
 * to sleep.  It's used by psi to identify aggregation workers during
 * dequeuing, to allow periodic aggregation to shut off when that
 * worker is the last task in the system or cgroup to go to sleep.
 *
 * As this function doesn't involve any workqueue-related locking, it
 * only returns stable values when called from inside the scheduler's
 * queuing and dequeuing paths, when @task, which must be a kworker,
 * is guaranteed to not be processing any works.
 *
 * Return:
 * The last work function %current executed as a worker, NULL if it
 * hasn't executed any work yet.
 */
work_func_t wq_worker_last_func(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

	return worker->last_func;
}

/**
 * get_pwq - get an extra reference on the specified pool_workqueue
 * @pwq: pool_workqueue to get
 *
 * Obtain an extra reference on @pwq.  The caller should guarantee that
 * @pwq has positive refcnt and be holding the matching pool->lock.
 */
static void get_pwq(struct pool_workqueue *pwq)
{
	lockdep_assert_held(&pwq->pool->lock);
	WARN_ON_ONCE(pwq->refcnt <= 0);
	pwq->refcnt++;
}

/**
 * put_pwq - put a pool_workqueue reference
 * @pwq: pool_workqueue to put
 *
 * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its
 * destruction.  The caller should be holding the matching pool->lock.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
	lockdep_assert_held(&pwq->pool->lock);
	if (likely(--pwq->refcnt))
		return;
	/*
	 * @pwq can't be released under pool->lock, bounce to a dedicated
	 * kthread_worker to avoid A-A deadlocks.
	 */
	kthread_queue_work(pwq_release_worker, &pwq->release_work);
}

/**
 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
 * @pwq: pool_workqueue to put (can be %NULL)
 *
 * put_pwq() with locking.  This function also allows %NULL @pwq.
 */
static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
	if (pwq) {
		/*
		 * As both pwqs and pools are RCU protected, the
		 * following lock operations are safe.
		 */
		raw_spin_lock_irq(&pwq->pool->lock);
		put_pwq(pwq);
		raw_spin_unlock_irq(&pwq->pool->lock);
	}
}

static void pwq_activate_inactive_work(struct work_struct *work)
{
	struct pool_workqueue *pwq = get_work_pwq(work);

	trace_workqueue_activate_work(work);
	if (list_empty(&pwq->pool->worklist))
		pwq->pool->watchdog_ts = jiffies;
	move_linked_works(work, &pwq->pool->worklist, NULL);
	__clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
	pwq->nr_active++;
}

static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
{
	struct work_struct *work = list_first_entry(&pwq->inactive_works,
						    struct work_struct, entry);

	pwq_activate_inactive_work(work);
}

/**
 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
 * @pwq: pwq of interest
 * @work_data: work_data of work which left the queue
 *
 * A work either has completed or is removed from the pending queue,
 * decrement nr_in_flight of its pwq and handle workqueue flushing.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
	int color = get_work_color(work_data);

	if (!(work_data & WORK_STRUCT_INACTIVE)) {
		pwq->nr_active--;
		if (!list_empty(&pwq->inactive_works)) {
			/* one down, submit an inactive one */
			if (pwq->nr_active < pwq->max_active)
				pwq_activate_first_inactive(pwq);
		}
	}

	pwq->nr_in_flight[color]--;

	/* is flush in progress and are we at the flushing tip? */
	if (likely(pwq->flush_color != color))
		goto out_put;

	/* are there still in-flight works? */
	if (pwq->nr_in_flight[color])
		goto out_put;

	/* this pwq is done, clear flush_color */
	pwq->flush_color = -1;

	/*
	 * If this was the last pwq, wake up the first flusher.  It
	 * will handle the rest.
	 */
	if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
		complete(&pwq->wq->first_flusher->done);
out_put:
	put_pwq(pwq);
}

/**
 * try_to_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @is_dwork: @work is a delayed_work
 * @flags: place to store irq state
 *
 * Try to grab PENDING bit of @work.  This function can handle @work in any
 * stable state - idle, on timer or on worklist.
 *
 * Return:
 *
 *  ========	================================================================
 *  1		if @work was pending and we successfully stole PENDING
 *  0		if @work was idle and we claimed PENDING
 *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry
 *  -ENOENT	if someone else is canceling @work, this state may persist
 *		for arbitrarily long
 *  ========	================================================================
 *
 * Note:
 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
 * interrupted while holding PENDING and @work off queue, irq must be
 * disabled on entry.  This, combined with delayed_work->timer being
 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
 *
 * On successful return, >= 0, irq is disabled and the caller is
 * responsible for releasing it using local_irq_restore(*@flags).
 *
 * This function is safe to call from any context including IRQ handler.
 */
static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
			       unsigned long *flags)
{
	struct worker_pool *pool;
	struct pool_workqueue *pwq;

	local_irq_save(*flags);

	/* try to steal the timer if it exists */
	if (is_dwork) {
		struct delayed_work *dwork = to_delayed_work(work);

		/*
		 * dwork->timer is irqsafe.  If del_timer() fails, it's
		 * guaranteed that the timer is not queued anywhere and not
		 * running on the local CPU.
		 */
		if (likely(del_timer(&dwork->timer)))
			return 1;
	}

	/* try to claim PENDING the normal way */
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
		return 0;

	rcu_read_lock();
	/*
	 * The queueing is in progress, or it is already queued. Try to
	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
	 */
	pool = get_work_pool(work);
	if (!pool)
		goto fail;

	raw_spin_lock(&pool->lock);
	/*
	 * work->data is guaranteed to point to pwq only while the work
	 * item is queued on pwq->wq, and both updating work->data to point
	 * to pwq on queueing and to pool on dequeueing are done under
	 * pwq->pool->lock.  This in turn guarantees that, if work->data
	 * points to pwq which is associated with a locked pool, the work
	 * item is currently queued on that pool.
	 */
	pwq = get_work_pwq(work);
	if (pwq && pwq->pool == pool) {
		debug_work_deactivate(work);

		/*
		 * A cancelable inactive work item must be in the
		 * pwq->inactive_works since a queued barrier can't be
		 * canceled (see the comments in insert_wq_barrier()).
		 *
		 * An inactive work item cannot be grabbed directly because
		 * it might have linked barrier work items which, if left
		 * on the inactive_works list, will confuse pwq->nr_active
		 * management later on and cause stall.  Make sure the work
		 * item is activated before grabbing.
		 */
		if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
			pwq_activate_inactive_work(work);

		list_del_init(&work->entry);
		pwq_dec_nr_in_flight(pwq, *work_data_bits(work));

		/* work->data points to pwq iff queued, point to pool */
		set_work_pool_and_keep_pending(work, pool->id);

		raw_spin_unlock(&pool->lock);
		rcu_read_unlock();
		return 1;
	}
	raw_spin_unlock(&pool->lock);
fail:
	rcu_read_unlock();
	local_irq_restore(*flags);
	if (work_is_canceling(work))
		return -ENOENT;
	cpu_relax();
	return -EAGAIN;
}

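/*
 * Usage sketch (illustrative, not part of the original file): a canceling
 * path would typically busy-retry only the -EAGAIN case, roughly:
 *
 *	unsigned long irq_flags;
 *	int ret;
 *
 *	do {
 *		ret = try_to_grab_pending(work, is_dwork, &irq_flags);
 *	} while (unlikely(ret == -EAGAIN));
 *
 * On ret >= 0 the caller owns PENDING with IRQs disabled and must eventually
 * call local_irq_restore(irq_flags); -ENOENT means somebody else is already
 * canceling @work and has to be waited for instead.  The variable names here
 * are made up for the sketch.
 */
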
/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
 * work_struct flags.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
			struct list_head *head, unsigned int extra_flags)
{
	debug_work_activate(work);

	/* record the work call stack in order to print it in KASAN reports */
	kasan_record_aux_stack_noalloc(work);

	/* we own @work, set data and link */
	set_work_pwq(work, pwq, extra_flags);
	list_add_tail(&work->entry, head);
	get_pwq(pwq);
}

/*
 * Test whether @work is being queued from another work executing on the
 * same workqueue.
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
	struct worker *worker;

	worker = current_wq_worker();
	/*
	 * Return %true iff I'm a worker executing a work item on @wq.  If
	 * I'm @worker, it's safe to dereference it without locking.
	 */
	return worker && worker->current_pwq->wq == wq;
}

/*
 * When queueing an unbound work item to a wq, prefer local CPU if allowed
 * by wq_unbound_cpumask.  Otherwise, round robin among the allowed ones to
 * avoid perturbing sensitive tasks.
 */
static int wq_select_unbound_cpu(int cpu)
{
	int new_cpu;

	if (likely(!wq_debug_force_rr_cpu)) {
		if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
			return cpu;
	} else {
		pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
	}

	if (cpumask_empty(wq_unbound_cpumask))
		return cpu;

	new_cpu = __this_cpu_read(wq_rr_cpu_last);
	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
	if (unlikely(new_cpu >= nr_cpu_ids)) {
		new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
		if (unlikely(new_cpu >= nr_cpu_ids))
			return cpu;
	}
	__this_cpu_write(wq_rr_cpu_last, new_cpu);

	return new_cpu;
}

static void __queue_work(int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct pool_workqueue *pwq;
	struct worker_pool *last_pool, *pool;
	unsigned int work_flags;
	unsigned int req_cpu = cpu;

	/*
	 * While a work item is PENDING && off queue, a task trying to
	 * steal the PENDING will busy-loop waiting for it to either get
	 * queued or lose PENDING.  Grabbing PENDING and queueing should
	 * happen with IRQ disabled.
	 */
	lockdep_assert_irqs_disabled();


	/*
	 * For a draining wq, only works from the same workqueue are
	 * allowed. The __WQ_DESTROYING helps to spot the issue that
	 * queues a new work item to a wq after destroy_workqueue(wq).
	 */
	if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
		     WARN_ON_ONCE(!is_chained_work(wq))))
		return;
	rcu_read_lock();
retry:
	/* pwq which will be used unless @work is executing elsewhere */
	if (req_cpu == WORK_CPU_UNBOUND) {
		if (wq->flags & WQ_UNBOUND)
			cpu = wq_select_unbound_cpu(raw_smp_processor_id());
		else
			cpu = raw_smp_processor_id();
	}

	pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
	pool = pwq->pool;

	/*
	 * If @work was previously on a different pool, it might still be
	 * running there, in which case the work needs to be queued on that
	 * pool to guarantee non-reentrancy.
	 */
	last_pool = get_work_pool(work);
	if (last_pool && last_pool != pool) {
		struct worker *worker;

		raw_spin_lock(&last_pool->lock);

		worker = find_worker_executing_work(last_pool, work);

		if (worker && worker->current_pwq->wq == wq) {
			pwq = worker->current_pwq;
			pool = pwq->pool;
			WARN_ON_ONCE(pool != last_pool);
		} else {
			/* meh... not running there, queue here */
			raw_spin_unlock(&last_pool->lock);
			raw_spin_lock(&pool->lock);
		}
	} else {
		raw_spin_lock(&pool->lock);
	}

	/*
	 * pwq is determined and locked. For unbound pools, we could have raced
	 * with pwq release and it could already be dead. If its refcnt is zero,
	 * repeat pwq selection. Note that unbound pwqs never die without
	 * another pwq replacing it in cpu_pwq or while work items are executing
	 * on it, so the retrying is guaranteed to make forward-progress.
1709 */ 1710 if (unlikely(!pwq->refcnt)) { 1711 if (wq->flags & WQ_UNBOUND) { 1712 raw_spin_unlock(&pool->lock); 1713 cpu_relax(); 1714 goto retry; 1715 } 1716 /* oops */ 1717 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", 1718 wq->name, cpu); 1719 } 1720 1721 /* pwq determined, queue */ 1722 trace_workqueue_queue_work(req_cpu, pwq, work); 1723 1724 if (WARN_ON(!list_empty(&work->entry))) 1725 goto out; 1726 1727 pwq->nr_in_flight[pwq->work_color]++; 1728 work_flags = work_color_to_flags(pwq->work_color); 1729 1730 if (likely(pwq->nr_active < pwq->max_active)) { 1731 if (list_empty(&pool->worklist)) 1732 pool->watchdog_ts = jiffies; 1733 1734 trace_workqueue_activate_work(work); 1735 pwq->nr_active++; 1736 insert_work(pwq, work, &pool->worklist, work_flags); 1737 1738 if (__need_more_worker(pool)) 1739 wake_up_worker(pool); 1740 } else { 1741 work_flags |= WORK_STRUCT_INACTIVE; 1742 insert_work(pwq, work, &pwq->inactive_works, work_flags); 1743 } 1744 1745 out: 1746 raw_spin_unlock(&pool->lock); 1747 rcu_read_unlock(); 1748 } 1749 1750 /** 1751 * queue_work_on - queue work on specific cpu 1752 * @cpu: CPU number to execute work on 1753 * @wq: workqueue to use 1754 * @work: work to queue 1755 * 1756 * We queue the work to a specific CPU; the caller must ensure it 1757 * can't go away. Callers that fail to ensure that the specified 1758 * CPU cannot go away will execute on a randomly chosen CPU. 1759 * But note well that callers specifying a CPU that never has been 1760 * online will get a splat. 1761 * 1762 * Return: %false if @work was already on a queue, %true otherwise. 1763 */ 1764 bool queue_work_on(int cpu, struct workqueue_struct *wq, 1765 struct work_struct *work) 1766 { 1767 bool ret = false; 1768 unsigned long flags; 1769 1770 local_irq_save(flags); 1771 1772 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1773 __queue_work(cpu, wq, work); 1774 ret = true; 1775 } 1776 1777 local_irq_restore(flags); 1778 return ret; 1779 } 1780 EXPORT_SYMBOL(queue_work_on); 1781 1782 /** 1783 * select_numa_node_cpu - Select a CPU based on NUMA node 1784 * @node: NUMA node ID that we want to select a CPU from 1785 * 1786 * This function will attempt to find a "random" cpu available on a given 1787 * node. If there are no CPUs available on the given node it will return 1788 * WORK_CPU_UNBOUND indicating that we should just schedule to any 1789 * available CPU if we need to schedule this work. 1790 */ 1791 static int select_numa_node_cpu(int node) 1792 { 1793 int cpu; 1794 1795 /* Delay binding to CPU if node is not valid or online */ 1796 if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) 1797 return WORK_CPU_UNBOUND; 1798 1799 /* Use local node/cpu if we are already there */ 1800 cpu = raw_smp_processor_id(); 1801 if (node == cpu_to_node(cpu)) 1802 return cpu; 1803 1804 /* Use "random" otherwise known as "first" online CPU of node */ 1805 cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); 1806 1807 /* If CPU is valid return that, otherwise just defer */ 1808 return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; 1809 } 1810 1811 /** 1812 * queue_work_node - queue work on a "random" cpu for a given NUMA node 1813 * @node: NUMA node that we are targeting the work for 1814 * @wq: workqueue to use 1815 * @work: work to queue 1816 * 1817 * We queue the work to a "random" CPU within a given NUMA node. The basic 1818 * idea here is to provide a way to somehow associate work with a given 1819 * NUMA node.
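 *
 * As a purely illustrative sketch (my_unbound_wq, my_work_fn and nid are
 * hypothetical names, not defined in this file), a caller targeting a
 * particular NUMA node might do:
 *
 *   static void my_work_fn(struct work_struct *work);
 *   static DECLARE_WORK(my_work, my_work_fn);
 *
 *   queue_work_node(nid, my_unbound_wq, &my_work);
 *
 * where my_unbound_wq was created with WQ_UNBOUND (see the warning in the
 * function body below).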
1820 * 1821 * This function will only make a best effort attempt at getting this onto 1822 * the right NUMA node. If no node is requested or the requested node is 1823 * offline then we just fall back to standard queue_work behavior. 1824 * 1825 * Currently the "random" CPU ends up being the first available CPU in the 1826 * intersection of cpu_online_mask and the cpumask of the node, unless we 1827 * are running on the node. In that case we just use the current CPU. 1828 * 1829 * Return: %false if @work was already on a queue, %true otherwise. 1830 */ 1831 bool queue_work_node(int node, struct workqueue_struct *wq, 1832 struct work_struct *work) 1833 { 1834 unsigned long flags; 1835 bool ret = false; 1836 1837 /* 1838 * This current implementation is specific to unbound workqueues. 1839 * Specifically we only return the first available CPU for a given 1840 * node instead of cycling through individual CPUs within the node. 1841 * 1842 * If this is used with a per-cpu workqueue then the logic in 1843 * workqueue_select_cpu_near would need to be updated to allow for 1844 * some round robin type logic. 1845 */ 1846 WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); 1847 1848 local_irq_save(flags); 1849 1850 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1851 int cpu = select_numa_node_cpu(node); 1852 1853 __queue_work(cpu, wq, work); 1854 ret = true; 1855 } 1856 1857 local_irq_restore(flags); 1858 return ret; 1859 } 1860 EXPORT_SYMBOL_GPL(queue_work_node); 1861 1862 void delayed_work_timer_fn(struct timer_list *t) 1863 { 1864 struct delayed_work *dwork = from_timer(dwork, t, timer); 1865 1866 /* should have been called from irqsafe timer with irq already off */ 1867 __queue_work(dwork->cpu, dwork->wq, &dwork->work); 1868 } 1869 EXPORT_SYMBOL(delayed_work_timer_fn); 1870 1871 static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 1872 struct delayed_work *dwork, unsigned long delay) 1873 { 1874 struct timer_list *timer = &dwork->timer; 1875 struct work_struct *work = &dwork->work; 1876 1877 WARN_ON_ONCE(!wq); 1878 WARN_ON_ONCE(timer->function != delayed_work_timer_fn); 1879 WARN_ON_ONCE(timer_pending(timer)); 1880 WARN_ON_ONCE(!list_empty(&work->entry)); 1881 1882 /* 1883 * If @delay is 0, queue @dwork->work immediately. This is for 1884 * both optimization and correctness. The earliest @timer can 1885 * expire is on the closest next tick and delayed_work users depend 1886 * on that there's no such delay when @delay is 0. 1887 */ 1888 if (!delay) { 1889 __queue_work(cpu, wq, &dwork->work); 1890 return; 1891 } 1892 1893 dwork->wq = wq; 1894 dwork->cpu = cpu; 1895 timer->expires = jiffies + delay; 1896 1897 if (unlikely(cpu != WORK_CPU_UNBOUND)) 1898 add_timer_on(timer, cpu); 1899 else 1900 add_timer(timer); 1901 } 1902 1903 /** 1904 * queue_delayed_work_on - queue work on specific CPU after delay 1905 * @cpu: CPU number to execute work on 1906 * @wq: workqueue to use 1907 * @dwork: work to queue 1908 * @delay: number of jiffies to wait before queueing 1909 * 1910 * Return: %false if @work was already on a queue, %true otherwise. If 1911 * @delay is zero and @dwork is idle, it will be scheduled for immediate 1912 * execution. 
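 *
 * A minimal usage sketch, for illustration only (my_wq and my_dwork_fn are
 * hypothetical names, not part of this file). The call below asks for
 * my_dwork_fn() to run on CPU 1 roughly one second from now:
 *
 *   static void my_dwork_fn(struct work_struct *work);
 *   static DECLARE_DELAYED_WORK(my_dwork, my_dwork_fn);
 *
 *   queue_delayed_work_on(1, my_wq, &my_dwork, HZ);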
1913 */ 1914 bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 1915 struct delayed_work *dwork, unsigned long delay) 1916 { 1917 struct work_struct *work = &dwork->work; 1918 bool ret = false; 1919 unsigned long flags; 1920 1921 /* read the comment in __queue_work() */ 1922 local_irq_save(flags); 1923 1924 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1925 __queue_delayed_work(cpu, wq, dwork, delay); 1926 ret = true; 1927 } 1928 1929 local_irq_restore(flags); 1930 return ret; 1931 } 1932 EXPORT_SYMBOL(queue_delayed_work_on); 1933 1934 /** 1935 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1936 * @cpu: CPU number to execute work on 1937 * @wq: workqueue to use 1938 * @dwork: work to queue 1939 * @delay: number of jiffies to wait before queueing 1940 * 1941 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, 1942 * modify @dwork's timer so that it expires after @delay. If @delay is 1943 * zero, @work is guaranteed to be scheduled immediately regardless of its 1944 * current state. 1945 * 1946 * Return: %false if @dwork was idle and queued, %true if @dwork was 1947 * pending and its timer was modified. 1948 * 1949 * This function is safe to call from any context including IRQ handler. 1950 * See try_to_grab_pending() for details. 1951 */ 1952 bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, 1953 struct delayed_work *dwork, unsigned long delay) 1954 { 1955 unsigned long flags; 1956 int ret; 1957 1958 do { 1959 ret = try_to_grab_pending(&dwork->work, true, &flags); 1960 } while (unlikely(ret == -EAGAIN)); 1961 1962 if (likely(ret >= 0)) { 1963 __queue_delayed_work(cpu, wq, dwork, delay); 1964 local_irq_restore(flags); 1965 } 1966 1967 /* -ENOENT from try_to_grab_pending() becomes %true */ 1968 return ret; 1969 } 1970 EXPORT_SYMBOL_GPL(mod_delayed_work_on); 1971 1972 static void rcu_work_rcufn(struct rcu_head *rcu) 1973 { 1974 struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); 1975 1976 /* read the comment in __queue_work() */ 1977 local_irq_disable(); 1978 __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); 1979 local_irq_enable(); 1980 } 1981 1982 /** 1983 * queue_rcu_work - queue work after a RCU grace period 1984 * @wq: workqueue to use 1985 * @rwork: work to queue 1986 * 1987 * Return: %false if @rwork was already pending, %true otherwise. Note 1988 * that a full RCU grace period is guaranteed only after a %true return. 1989 * While @rwork is guaranteed to be executed after a %false return, the 1990 * execution may happen before a full RCU grace period has passed. 
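 *
 * Illustrative sketch only (my_rwork and my_rwork_fn are hypothetical,
 * not defined in this file); the callback runs in process context after
 * an RCU grace period has elapsed:
 *
 *   static void my_rwork_fn(struct work_struct *work);
 *   static struct rcu_work my_rwork;
 *
 *   INIT_RCU_WORK(&my_rwork, my_rwork_fn);
 *   queue_rcu_work(system_wq, &my_rwork);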
1991 */ 1992 bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) 1993 { 1994 struct work_struct *work = &rwork->work; 1995 1996 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1997 rwork->wq = wq; 1998 call_rcu_hurry(&rwork->rcu, rcu_work_rcufn); 1999 return true; 2000 } 2001 2002 return false; 2003 } 2004 EXPORT_SYMBOL(queue_rcu_work); 2005 2006 static struct worker *alloc_worker(int node) 2007 { 2008 struct worker *worker; 2009 2010 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); 2011 if (worker) { 2012 INIT_LIST_HEAD(&worker->entry); 2013 INIT_LIST_HEAD(&worker->scheduled); 2014 INIT_LIST_HEAD(&worker->node); 2015 /* on creation a worker is in !idle && prep state */ 2016 worker->flags = WORKER_PREP; 2017 } 2018 return worker; 2019 } 2020 2021 /** 2022 * worker_attach_to_pool() - attach a worker to a pool 2023 * @worker: worker to be attached 2024 * @pool: the target pool 2025 * 2026 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and 2027 * cpu-binding of @worker are kept coordinated with the pool across 2028 * cpu-[un]hotplugs. 2029 */ 2030 static void worker_attach_to_pool(struct worker *worker, 2031 struct worker_pool *pool) 2032 { 2033 mutex_lock(&wq_pool_attach_mutex); 2034 2035 /* 2036 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains 2037 * stable across this function. See the comments above the flag 2038 * definition for details. 2039 */ 2040 if (pool->flags & POOL_DISASSOCIATED) 2041 worker->flags |= WORKER_UNBOUND; 2042 else 2043 kthread_set_per_cpu(worker->task, pool->cpu); 2044 2045 if (worker->rescue_wq) 2046 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 2047 2048 list_add_tail(&worker->node, &pool->workers); 2049 worker->pool = pool; 2050 2051 mutex_unlock(&wq_pool_attach_mutex); 2052 } 2053 2054 /** 2055 * worker_detach_from_pool() - detach a worker from its pool 2056 * @worker: worker which is attached to its pool 2057 * 2058 * Undo the attaching which had been done in worker_attach_to_pool(). The 2059 * caller worker shouldn't access the pool after being detached unless it 2060 * holds another reference to the pool. 2061 */ 2062 static void worker_detach_from_pool(struct worker *worker) 2063 { 2064 struct worker_pool *pool = worker->pool; 2065 struct completion *detach_completion = NULL; 2066 2067 mutex_lock(&wq_pool_attach_mutex); 2068 2069 kthread_set_per_cpu(worker->task, -1); 2070 list_del(&worker->node); 2071 worker->pool = NULL; 2072 2073 if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) 2074 detach_completion = pool->detach_completion; 2075 mutex_unlock(&wq_pool_attach_mutex); 2076 2077 /* clear leftover flags without pool->lock after it is detached */ 2078 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); 2079 2080 if (detach_completion) 2081 complete(detach_completion); 2082 } 2083 2084 /** 2085 * create_worker - create a new workqueue worker 2086 * @pool: pool the new worker will belong to 2087 * 2088 * Create and start a new worker which is attached to @pool. 2089 * 2090 * CONTEXT: 2091 * Might sleep. Does GFP_KERNEL allocations. 2092 * 2093 * Return: 2094 * Pointer to the newly created worker.
2095 */ 2096 static struct worker *create_worker(struct worker_pool *pool) 2097 { 2098 struct worker *worker; 2099 int id; 2100 char id_buf[16]; 2101 2102 /* ID is needed to determine kthread name */ 2103 id = ida_alloc(&pool->worker_ida, GFP_KERNEL); 2104 if (id < 0) { 2105 pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", 2106 ERR_PTR(id)); 2107 return NULL; 2108 } 2109 2110 worker = alloc_worker(pool->node); 2111 if (!worker) { 2112 pr_err_once("workqueue: Failed to allocate a worker\n"); 2113 goto fail; 2114 } 2115 2116 worker->id = id; 2117 2118 if (pool->cpu >= 0) 2119 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, 2120 pool->attrs->nice < 0 ? "H" : ""); 2121 else 2122 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); 2123 2124 worker->task = kthread_create_on_node(worker_thread, worker, pool->node, 2125 "kworker/%s", id_buf); 2126 if (IS_ERR(worker->task)) { 2127 if (PTR_ERR(worker->task) == -EINTR) { 2128 pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", 2129 id_buf); 2130 } else { 2131 pr_err_once("workqueue: Failed to create a worker thread: %pe", 2132 worker->task); 2133 } 2134 goto fail; 2135 } 2136 2137 set_user_nice(worker->task, pool->attrs->nice); 2138 kthread_bind_mask(worker->task, pool->attrs->cpumask); 2139 2140 /* successful, attach the worker to the pool */ 2141 worker_attach_to_pool(worker, pool); 2142 2143 /* start the newly created worker */ 2144 raw_spin_lock_irq(&pool->lock); 2145 worker->pool->nr_workers++; 2146 worker_enter_idle(worker); 2147 wake_up_process(worker->task); 2148 raw_spin_unlock_irq(&pool->lock); 2149 2150 return worker; 2151 2152 fail: 2153 ida_free(&pool->worker_ida, id); 2154 kfree(worker); 2155 return NULL; 2156 } 2157 2158 static void unbind_worker(struct worker *worker) 2159 { 2160 lockdep_assert_held(&wq_pool_attach_mutex); 2161 2162 kthread_set_per_cpu(worker->task, -1); 2163 if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) 2164 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); 2165 else 2166 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); 2167 } 2168 2169 static void wake_dying_workers(struct list_head *cull_list) 2170 { 2171 struct worker *worker, *tmp; 2172 2173 list_for_each_entry_safe(worker, tmp, cull_list, entry) { 2174 list_del_init(&worker->entry); 2175 unbind_worker(worker); 2176 /* 2177 * If the worker was somehow already running, then it had to be 2178 * in pool->idle_list when set_worker_dying() happened or we 2179 * wouldn't have gotten here. 2180 * 2181 * Thus, the worker must either have observed the WORKER_DIE 2182 * flag, or have set its state to TASK_IDLE. Either way, the 2183 * below will be observed by the worker and is safe to do 2184 * outside of pool->lock. 2185 */ 2186 wake_up_process(worker->task); 2187 } 2188 } 2189 2190 /** 2191 * set_worker_dying - Tag a worker for destruction 2192 * @worker: worker to be destroyed 2193 * @list: transfer worker away from its pool->idle_list and into list 2194 * 2195 * Tag @worker for destruction and adjust @pool stats accordingly. The worker 2196 * should be idle. 2197 * 2198 * CONTEXT: 2199 * raw_spin_lock_irq(pool->lock). 
2200 */ 2201 static void set_worker_dying(struct worker *worker, struct list_head *list) 2202 { 2203 struct worker_pool *pool = worker->pool; 2204 2205 lockdep_assert_held(&pool->lock); 2206 lockdep_assert_held(&wq_pool_attach_mutex); 2207 2208 /* sanity check frenzy */ 2209 if (WARN_ON(worker->current_work) || 2210 WARN_ON(!list_empty(&worker->scheduled)) || 2211 WARN_ON(!(worker->flags & WORKER_IDLE))) 2212 return; 2213 2214 pool->nr_workers--; 2215 pool->nr_idle--; 2216 2217 worker->flags |= WORKER_DIE; 2218 2219 list_move(&worker->entry, list); 2220 list_move(&worker->node, &pool->dying_workers); 2221 } 2222 2223 /** 2224 * idle_worker_timeout - check if some idle workers can now be deleted. 2225 * @t: The pool's idle_timer that just expired 2226 * 2227 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in 2228 * worker_leave_idle(), as a worker flicking between idle and active while its 2229 * pool is at the too_many_workers() tipping point would cause too much timer 2230 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let 2231 * it expire and re-evaluate things from there. 2232 */ 2233 static void idle_worker_timeout(struct timer_list *t) 2234 { 2235 struct worker_pool *pool = from_timer(pool, t, idle_timer); 2236 bool do_cull = false; 2237 2238 if (work_pending(&pool->idle_cull_work)) 2239 return; 2240 2241 raw_spin_lock_irq(&pool->lock); 2242 2243 if (too_many_workers(pool)) { 2244 struct worker *worker; 2245 unsigned long expires; 2246 2247 /* idle_list is kept in LIFO order, check the last one */ 2248 worker = list_entry(pool->idle_list.prev, struct worker, entry); 2249 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 2250 do_cull = !time_before(jiffies, expires); 2251 2252 if (!do_cull) 2253 mod_timer(&pool->idle_timer, expires); 2254 } 2255 raw_spin_unlock_irq(&pool->lock); 2256 2257 if (do_cull) 2258 queue_work(system_unbound_wq, &pool->idle_cull_work); 2259 } 2260 2261 /** 2262 * idle_cull_fn - cull workers that have been idle for too long. 2263 * @work: the pool's work for handling these idle workers 2264 * 2265 * This goes through a pool's idle workers and gets rid of those that have been 2266 * idle for at least IDLE_WORKER_TIMEOUT. 2267 * 2268 * We don't want to disturb isolated CPUs because of a pcpu kworker being 2269 * culled, so this also resets worker affinity. This requires a sleepable 2270 * context, hence the split between timer callback and work item. 2271 */ 2272 static void idle_cull_fn(struct work_struct *work) 2273 { 2274 struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); 2275 LIST_HEAD(cull_list); 2276 2277 /* 2278 * Grabbing wq_pool_attach_mutex here ensures an already-running worker 2279 * cannot proceed beyond worker_detach_from_pool() in its self-destruct 2280 * path. This is required as a previously-preempted worker could run after 2281 * set_worker_dying() has happened but before wake_dying_workers() did.
2282 */ 2283 mutex_lock(&wq_pool_attach_mutex); 2284 raw_spin_lock_irq(&pool->lock); 2285 2286 while (too_many_workers(pool)) { 2287 struct worker *worker; 2288 unsigned long expires; 2289 2290 worker = list_entry(pool->idle_list.prev, struct worker, entry); 2291 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 2292 2293 if (time_before(jiffies, expires)) { 2294 mod_timer(&pool->idle_timer, expires); 2295 break; 2296 } 2297 2298 set_worker_dying(worker, &cull_list); 2299 } 2300 2301 raw_spin_unlock_irq(&pool->lock); 2302 wake_dying_workers(&cull_list); 2303 mutex_unlock(&wq_pool_attach_mutex); 2304 } 2305 2306 static void send_mayday(struct work_struct *work) 2307 { 2308 struct pool_workqueue *pwq = get_work_pwq(work); 2309 struct workqueue_struct *wq = pwq->wq; 2310 2311 lockdep_assert_held(&wq_mayday_lock); 2312 2313 if (!wq->rescuer) 2314 return; 2315 2316 /* mayday mayday mayday */ 2317 if (list_empty(&pwq->mayday_node)) { 2318 /* 2319 * If @pwq is for an unbound wq, its base ref may be put at 2320 * any time due to an attribute change. Pin @pwq until the 2321 * rescuer is done with it. 2322 */ 2323 get_pwq(pwq); 2324 list_add_tail(&pwq->mayday_node, &wq->maydays); 2325 wake_up_process(wq->rescuer->task); 2326 pwq->stats[PWQ_STAT_MAYDAY]++; 2327 } 2328 } 2329 2330 static void pool_mayday_timeout(struct timer_list *t) 2331 { 2332 struct worker_pool *pool = from_timer(pool, t, mayday_timer); 2333 struct work_struct *work; 2334 2335 raw_spin_lock_irq(&pool->lock); 2336 raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */ 2337 2338 if (need_to_create_worker(pool)) { 2339 /* 2340 * We've been trying to create a new worker but 2341 * haven't been successful. We might be hitting an 2342 * allocation deadlock. Send distress signals to 2343 * rescuers. 2344 */ 2345 list_for_each_entry(work, &pool->worklist, entry) 2346 send_mayday(work); 2347 } 2348 2349 raw_spin_unlock(&wq_mayday_lock); 2350 raw_spin_unlock_irq(&pool->lock); 2351 2352 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 2353 } 2354 2355 /** 2356 * maybe_create_worker - create a new worker if necessary 2357 * @pool: pool to create a new worker for 2358 * 2359 * Create a new worker for @pool if necessary. @pool is guaranteed to 2360 * have at least one idle worker on return from this function. If 2361 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 2362 * sent to all rescuers with works scheduled on @pool to resolve 2363 * possible allocation deadlock. 2364 * 2365 * On return, need_to_create_worker() is guaranteed to be %false and 2366 * may_start_working() %true. 2367 * 2368 * LOCKING: 2369 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 2370 * multiple times. Does GFP_KERNEL allocations. Called only from 2371 * manager. 
2372 */ 2373 static void maybe_create_worker(struct worker_pool *pool) 2374 __releases(&pool->lock) 2375 __acquires(&pool->lock) 2376 { 2377 restart: 2378 raw_spin_unlock_irq(&pool->lock); 2379 2380 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 2381 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 2382 2383 while (true) { 2384 if (create_worker(pool) || !need_to_create_worker(pool)) 2385 break; 2386 2387 schedule_timeout_interruptible(CREATE_COOLDOWN); 2388 2389 if (!need_to_create_worker(pool)) 2390 break; 2391 } 2392 2393 del_timer_sync(&pool->mayday_timer); 2394 raw_spin_lock_irq(&pool->lock); 2395 /* 2396 * This is necessary even after a new worker was just successfully 2397 * created as @pool->lock was dropped and the new worker might have 2398 * already become busy. 2399 */ 2400 if (need_to_create_worker(pool)) 2401 goto restart; 2402 } 2403 2404 /** 2405 * manage_workers - manage worker pool 2406 * @worker: self 2407 * 2408 * Assume the manager role and manage the worker pool @worker belongs 2409 * to. At any given time, there can be only zero or one manager per 2410 * pool. The exclusion is handled automatically by this function. 2411 * 2412 * The caller can safely start processing works on false return. On 2413 * true return, it's guaranteed that need_to_create_worker() is false 2414 * and may_start_working() is true. 2415 * 2416 * CONTEXT: 2417 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 2418 * multiple times. Does GFP_KERNEL allocations. 2419 * 2420 * Return: 2421 * %false if the pool doesn't need management and the caller can safely 2422 * start processing works, %true if management function was performed and 2423 * the conditions that the caller verified before calling the function may 2424 * no longer be true. 2425 */ 2426 static bool manage_workers(struct worker *worker) 2427 { 2428 struct worker_pool *pool = worker->pool; 2429 2430 if (pool->flags & POOL_MANAGER_ACTIVE) 2431 return false; 2432 2433 pool->flags |= POOL_MANAGER_ACTIVE; 2434 pool->manager = worker; 2435 2436 maybe_create_worker(pool); 2437 2438 pool->manager = NULL; 2439 pool->flags &= ~POOL_MANAGER_ACTIVE; 2440 rcuwait_wake_up(&manager_wait); 2441 return true; 2442 } 2443 2444 /** 2445 * process_one_work - process single work 2446 * @worker: self 2447 * @work: work to process 2448 * 2449 * Process @work. This function contains all the logic necessary to 2450 * process a single work including synchronization against and 2451 * interaction with other workers on the same cpu, queueing and 2452 * flushing. As long as the context requirement is met, any worker can 2453 * call this function to process a work. 2454 * 2455 * CONTEXT: 2456 * raw_spin_lock_irq(pool->lock) which is released and regrabbed. 2457 */ 2458 static void process_one_work(struct worker *worker, struct work_struct *work) 2459 __releases(&pool->lock) 2460 __acquires(&pool->lock) 2461 { 2462 struct pool_workqueue *pwq = get_work_pwq(work); 2463 struct worker_pool *pool = worker->pool; 2464 unsigned long work_data; 2465 struct worker *collision; 2466 #ifdef CONFIG_LOCKDEP 2467 /* 2468 * It is permissible to free the struct work_struct from 2469 * inside the function that is called from it, this we need to 2470 * take into account for lockdep too. To avoid bogus "held 2471 * lock freed" warnings as well as problems when looking into 2472 * work->lockdep_map, make a copy and use that here.
2473 */ 2474 struct lockdep_map lockdep_map; 2475 2476 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 2477 #endif 2478 /* ensure we're on the correct CPU */ 2479 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && 2480 raw_smp_processor_id() != pool->cpu); 2481 2482 /* 2483 * A single work shouldn't be executed concurrently by 2484 * multiple workers on a single cpu. Check whether anyone is 2485 * already processing the work. If so, defer the work to the 2486 * currently executing one. 2487 */ 2488 collision = find_worker_executing_work(pool, work); 2489 if (unlikely(collision)) { 2490 move_linked_works(work, &collision->scheduled, NULL); 2491 return; 2492 } 2493 2494 /* claim and dequeue */ 2495 debug_work_deactivate(work); 2496 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); 2497 worker->current_work = work; 2498 worker->current_func = work->func; 2499 worker->current_pwq = pwq; 2500 worker->current_at = worker->task->se.sum_exec_runtime; 2501 work_data = *work_data_bits(work); 2502 worker->current_color = get_work_color(work_data); 2503 2504 /* 2505 * Record wq name for cmdline and debug reporting, may get 2506 * overridden through set_worker_desc(). 2507 */ 2508 strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); 2509 2510 list_del_init(&work->entry); 2511 2512 /* 2513 * CPU intensive works don't participate in concurrency management. 2514 * They're the scheduler's responsibility. This takes @worker out 2515 * of concurrency management and the next code block will chain 2516 * execution of the pending work items. 2517 */ 2518 if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) 2519 worker_set_flags(worker, WORKER_CPU_INTENSIVE); 2520 2521 /* 2522 * Wake up another worker if necessary. The condition is always 2523 * false for normal per-cpu workers since nr_running would always 2524 * be >= 1 at this point. This is used to chain execution of the 2525 * pending work items for WORKER_NOT_RUNNING workers such as the 2526 * UNBOUND and CPU_INTENSIVE ones. 2527 */ 2528 if (need_more_worker(pool)) 2529 wake_up_worker(pool); 2530 2531 /* 2532 * Record the last pool and clear PENDING which should be the last 2533 * update to @work. Also, do this inside @pool->lock so that 2534 * PENDING and queued state changes happen together while IRQ is 2535 * disabled. 2536 */ 2537 set_work_pool_and_clear_pending(work, pool->id); 2538 2539 raw_spin_unlock_irq(&pool->lock); 2540 2541 lock_map_acquire(&pwq->wq->lockdep_map); 2542 lock_map_acquire(&lockdep_map); 2543 /* 2544 * Strictly speaking we should mark the invariant state without holding 2545 * any locks, that is, before these two lock_map_acquire()'s. 2546 * 2547 * However, that would result in: 2548 * 2549 * A(W1) 2550 * WFC(C) 2551 * A(W1) 2552 * C(C) 2553 * 2554 * Which would create W1->C->W1 dependencies, even though there is no 2555 * actual deadlock possible. There are two solutions, using a 2556 * read-recursive acquire on the work(queue) 'locks', but this will then 2557 * hit the lockdep limitation on recursive locks, or simply discard 2558 * these locks. 2559 * 2560 * AFAICT there is no possible deadlock scenario between the 2561 * flush_work() and complete() primitives (except for single-threaded 2562 * workqueues), so hiding them isn't a problem. 2563 */ 2564 lockdep_invariant_state(true); 2565 pwq->stats[PWQ_STAT_STARTED]++; 2566 trace_workqueue_execute_start(work); 2567 worker->current_func(work); 2568 /* 2569 * While we must be careful to not use "work" after this, the trace 2570 * point will only record its address. 
2571 */ 2572 trace_workqueue_execute_end(work, worker->current_func); 2573 pwq->stats[PWQ_STAT_COMPLETED]++; 2574 lock_map_release(&lockdep_map); 2575 lock_map_release(&pwq->wq->lockdep_map); 2576 2577 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2578 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2579 " last function: %ps\n", 2580 current->comm, preempt_count(), task_pid_nr(current), 2581 worker->current_func); 2582 debug_show_held_locks(current); 2583 dump_stack(); 2584 } 2585 2586 /* 2587 * The following prevents a kworker from hogging CPU on !PREEMPTION 2588 * kernels, where a requeueing work item waiting for something to 2589 * happen could deadlock with stop_machine as such work item could 2590 * indefinitely requeue itself while all other CPUs are trapped in 2591 * stop_machine. At the same time, report a quiescent RCU state so 2592 * the same condition doesn't freeze RCU. 2593 */ 2594 cond_resched(); 2595 2596 raw_spin_lock_irq(&pool->lock); 2597 2598 /* 2599 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked 2600 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than 2601 * wq_cpu_intensive_thresh_us. Clear it. 2602 */ 2603 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2604 2605 /* tag the worker for identification in schedule() */ 2606 worker->last_func = worker->current_func; 2607 2608 /* we're done with it, release */ 2609 hash_del(&worker->hentry); 2610 worker->current_work = NULL; 2611 worker->current_func = NULL; 2612 worker->current_pwq = NULL; 2613 worker->current_color = INT_MAX; 2614 pwq_dec_nr_in_flight(pwq, work_data); 2615 } 2616 2617 /** 2618 * process_scheduled_works - process scheduled works 2619 * @worker: self 2620 * 2621 * Process all scheduled works. Please note that the scheduled list 2622 * may change while processing a work, so this function repeatedly 2623 * fetches a work from the top and executes it. 2624 * 2625 * CONTEXT: 2626 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed 2627 * multiple times. 2628 */ 2629 static void process_scheduled_works(struct worker *worker) 2630 { 2631 struct work_struct *work; 2632 bool first = true; 2633 2634 while ((work = list_first_entry_or_null(&worker->scheduled, 2635 struct work_struct, entry))) { 2636 if (first) { 2637 worker->pool->watchdog_ts = jiffies; 2638 first = false; 2639 } 2640 process_one_work(worker, work); 2641 } 2642 } 2643 2644 static void set_pf_worker(bool val) 2645 { 2646 mutex_lock(&wq_pool_attach_mutex); 2647 if (val) 2648 current->flags |= PF_WQ_WORKER; 2649 else 2650 current->flags &= ~PF_WQ_WORKER; 2651 mutex_unlock(&wq_pool_attach_mutex); 2652 } 2653 2654 /** 2655 * worker_thread - the worker thread function 2656 * @__worker: self 2657 * 2658 * The worker thread function. All workers belong to a worker_pool - 2659 * either a per-cpu one or dynamic unbound one. These workers process all 2660 * work items regardless of their specific target workqueue. The only 2661 * exception is work items which belong to workqueues with a rescuer which 2662 * will be explained in rescuer_thread(). 2663 * 2664 * Return: 0 2665 */ 2666 static int worker_thread(void *__worker) 2667 { 2668 struct worker *worker = __worker; 2669 struct worker_pool *pool = worker->pool; 2670 2671 /* tell the scheduler that this is a workqueue worker */ 2672 set_pf_worker(true); 2673 woke_up: 2674 raw_spin_lock_irq(&pool->lock); 2675 2676 /* am I supposed to die? 
*/ 2677 if (unlikely(worker->flags & WORKER_DIE)) { 2678 raw_spin_unlock_irq(&pool->lock); 2679 set_pf_worker(false); 2680 2681 set_task_comm(worker->task, "kworker/dying"); 2682 ida_free(&pool->worker_ida, worker->id); 2683 worker_detach_from_pool(worker); 2684 WARN_ON_ONCE(!list_empty(&worker->entry)); 2685 kfree(worker); 2686 return 0; 2687 } 2688 2689 worker_leave_idle(worker); 2690 recheck: 2691 /* no more worker necessary? */ 2692 if (!need_more_worker(pool)) 2693 goto sleep; 2694 2695 /* do we need to manage? */ 2696 if (unlikely(!may_start_working(pool)) && manage_workers(worker)) 2697 goto recheck; 2698 2699 /* 2700 * ->scheduled list can only be filled while a worker is 2701 * preparing to process a work or actually processing it. 2702 * Make sure nobody diddled with it while I was sleeping. 2703 */ 2704 WARN_ON_ONCE(!list_empty(&worker->scheduled)); 2705 2706 /* 2707 * Finish PREP stage. We're guaranteed to have at least one idle 2708 * worker or that someone else has already assumed the manager 2709 * role. This is where @worker starts participating in concurrency 2710 * management if applicable and concurrency management is restored 2711 * after being rebound. See rebind_workers() for details. 2712 */ 2713 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); 2714 2715 do { 2716 struct work_struct *work = 2717 list_first_entry(&pool->worklist, 2718 struct work_struct, entry); 2719 2720 move_linked_works(work, &worker->scheduled, NULL); 2721 process_scheduled_works(worker); 2722 } while (keep_working(pool)); 2723 2724 worker_set_flags(worker, WORKER_PREP); 2725 sleep: 2726 /* 2727 * pool->lock is held and there's no work to process and no need to 2728 * manage, sleep. Workers are woken up only while holding 2729 * pool->lock or from local cpu, so setting the current state 2730 * before releasing pool->lock is enough to prevent losing any 2731 * event. 2732 */ 2733 worker_enter_idle(worker); 2734 __set_current_state(TASK_IDLE); 2735 raw_spin_unlock_irq(&pool->lock); 2736 schedule(); 2737 goto woke_up; 2738 } 2739 2740 /** 2741 * rescuer_thread - the rescuer thread function 2742 * @__rescuer: self 2743 * 2744 * Workqueue rescuer thread function. There's one rescuer for each 2745 * workqueue which has WQ_MEM_RECLAIM set. 2746 * 2747 * Regular work processing on a pool may block trying to create a new 2748 * worker which uses GFP_KERNEL allocation which has slight chance of 2749 * developing into deadlock if some works currently on the same queue 2750 * need to be processed to satisfy the GFP_KERNEL allocation. This is 2751 * the problem rescuer solves. 2752 * 2753 * When such condition is possible, the pool summons rescuers of all 2754 * workqueues which have works queued on the pool and let them process 2755 * those works so that forward progress can be guaranteed. 2756 * 2757 * This should happen rarely. 2758 * 2759 * Return: 0 2760 */ 2761 static int rescuer_thread(void *__rescuer) 2762 { 2763 struct worker *rescuer = __rescuer; 2764 struct workqueue_struct *wq = rescuer->rescue_wq; 2765 struct list_head *scheduled = &rescuer->scheduled; 2766 bool should_stop; 2767 2768 set_user_nice(current, RESCUER_NICE_LEVEL); 2769 2770 /* 2771 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it 2772 * doesn't participate in concurrency management. 
2773 */ 2774 set_pf_worker(true); 2775 repeat: 2776 set_current_state(TASK_IDLE); 2777 2778 /* 2779 * By the time the rescuer is requested to stop, the workqueue 2780 * shouldn't have any work pending, but @wq->maydays may still have 2781 * pwq(s) queued. This can happen by non-rescuer workers consuming 2782 * all the work items before the rescuer got to them. Go through 2783 * @wq->maydays processing before acting on should_stop so that the 2784 * list is always empty on exit. 2785 */ 2786 should_stop = kthread_should_stop(); 2787 2788 /* see whether any pwq is asking for help */ 2789 raw_spin_lock_irq(&wq_mayday_lock); 2790 2791 while (!list_empty(&wq->maydays)) { 2792 struct pool_workqueue *pwq = list_first_entry(&wq->maydays, 2793 struct pool_workqueue, mayday_node); 2794 struct worker_pool *pool = pwq->pool; 2795 struct work_struct *work, *n; 2796 2797 __set_current_state(TASK_RUNNING); 2798 list_del_init(&pwq->mayday_node); 2799 2800 raw_spin_unlock_irq(&wq_mayday_lock); 2801 2802 worker_attach_to_pool(rescuer, pool); 2803 2804 raw_spin_lock_irq(&pool->lock); 2805 2806 /* 2807 * Slurp in all works issued via this workqueue and 2808 * process'em. 2809 */ 2810 WARN_ON_ONCE(!list_empty(scheduled)); 2811 list_for_each_entry_safe(work, n, &pool->worklist, entry) { 2812 if (get_work_pwq(work) == pwq) { 2813 move_linked_works(work, scheduled, &n); 2814 pwq->stats[PWQ_STAT_RESCUED]++; 2815 } 2816 } 2817 2818 if (!list_empty(scheduled)) { 2819 process_scheduled_works(rescuer); 2820 2821 /* 2822 * The above execution of rescued work items could 2823 * have created more to rescue through 2824 * pwq_activate_first_inactive() or chained 2825 * queueing. Let's put @pwq back on mayday list so 2826 * that such back-to-back work items, which may be 2827 * being used to relieve memory pressure, don't 2828 * incur MAYDAY_INTERVAL delay inbetween. 2829 */ 2830 if (pwq->nr_active && need_to_create_worker(pool)) { 2831 raw_spin_lock(&wq_mayday_lock); 2832 /* 2833 * Queue iff we aren't racing destruction 2834 * and somebody else hasn't queued it already. 2835 */ 2836 if (wq->rescuer && list_empty(&pwq->mayday_node)) { 2837 get_pwq(pwq); 2838 list_add_tail(&pwq->mayday_node, &wq->maydays); 2839 } 2840 raw_spin_unlock(&wq_mayday_lock); 2841 } 2842 } 2843 2844 /* 2845 * Put the reference grabbed by send_mayday(). @pool won't 2846 * go away while we're still attached to it. 2847 */ 2848 put_pwq(pwq); 2849 2850 /* 2851 * Leave this pool. If need_more_worker() is %true, notify a 2852 * regular worker; otherwise, we end up with 0 concurrency 2853 * and stalling the execution. 2854 */ 2855 if (need_more_worker(pool)) 2856 wake_up_worker(pool); 2857 2858 raw_spin_unlock_irq(&pool->lock); 2859 2860 worker_detach_from_pool(rescuer); 2861 2862 raw_spin_lock_irq(&wq_mayday_lock); 2863 } 2864 2865 raw_spin_unlock_irq(&wq_mayday_lock); 2866 2867 if (should_stop) { 2868 __set_current_state(TASK_RUNNING); 2869 set_pf_worker(false); 2870 return 0; 2871 } 2872 2873 /* rescuers should never participate in concurrency management */ 2874 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2875 schedule(); 2876 goto repeat; 2877 } 2878 2879 /** 2880 * check_flush_dependency - check for flush dependency sanity 2881 * @target_wq: workqueue being flushed 2882 * @target_work: work item being flushed (NULL for workqueue flushes) 2883 * 2884 * %current is trying to flush the whole @target_wq or @target_work on it. 
2885 * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not 2886 * reclaiming memory or running on a workqueue which doesn't have 2887 * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to 2888 * a deadlock. 2889 */ 2890 static void check_flush_dependency(struct workqueue_struct *target_wq, 2891 struct work_struct *target_work) 2892 { 2893 work_func_t target_func = target_work ? target_work->func : NULL; 2894 struct worker *worker; 2895 2896 if (target_wq->flags & WQ_MEM_RECLAIM) 2897 return; 2898 2899 worker = current_wq_worker(); 2900 2901 WARN_ONCE(current->flags & PF_MEMALLOC, 2902 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", 2903 current->pid, current->comm, target_wq->name, target_func); 2904 WARN_ONCE(worker && ((worker->current_pwq->wq->flags & 2905 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), 2906 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", 2907 worker->current_pwq->wq->name, worker->current_func, 2908 target_wq->name, target_func); 2909 } 2910 2911 struct wq_barrier { 2912 struct work_struct work; 2913 struct completion done; 2914 struct task_struct *task; /* purely informational */ 2915 }; 2916 2917 static void wq_barrier_func(struct work_struct *work) 2918 { 2919 struct wq_barrier *barr = container_of(work, struct wq_barrier, work); 2920 complete(&barr->done); 2921 } 2922 2923 /** 2924 * insert_wq_barrier - insert a barrier work 2925 * @pwq: pwq to insert barrier into 2926 * @barr: wq_barrier to insert 2927 * @target: target work to attach @barr to 2928 * @worker: worker currently executing @target, NULL if @target is not executing 2929 * 2930 * @barr is linked to @target such that @barr is completed only after 2931 * @target finishes execution. Please note that the ordering 2932 * guarantee is observed only with respect to @target and on the local 2933 * cpu. 2934 * 2935 * Currently, a queued barrier can't be canceled. This is because 2936 * try_to_grab_pending() can't determine whether the work to be 2937 * grabbed is at the head of the queue and thus can't clear LINKED 2938 * flag of the previous work while there must be a valid next work 2939 * after a work with LINKED flag set. 2940 * 2941 * Note that when @worker is non-NULL, @target may be modified 2942 * underneath us, so we can't reliably determine pwq from @target. 2943 * 2944 * CONTEXT: 2945 * raw_spin_lock_irq(pool->lock). 2946 */ 2947 static void insert_wq_barrier(struct pool_workqueue *pwq, 2948 struct wq_barrier *barr, 2949 struct work_struct *target, struct worker *worker) 2950 { 2951 unsigned int work_flags = 0; 2952 unsigned int work_color; 2953 struct list_head *head; 2954 2955 /* 2956 * debugobject calls are safe here even with pool->lock locked 2957 * as we know for sure that this will not trigger any of the 2958 * checks and call back into the fixup functions where we 2959 * might deadlock. 2960 */ 2961 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); 2962 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); 2963 2964 init_completion_map(&barr->done, &target->lockdep_map); 2965 2966 barr->task = current; 2967 2968 /* The barrier work item does not participate in pwq->nr_active. */ 2969 work_flags |= WORK_STRUCT_INACTIVE; 2970 2971 /* 2972 * If @target is currently being executed, schedule the 2973 * barrier to the worker; otherwise, put it after @target. 
2974 */ 2975 if (worker) { 2976 head = worker->scheduled.next; 2977 work_color = worker->current_color; 2978 } else { 2979 unsigned long *bits = work_data_bits(target); 2980 2981 head = target->entry.next; 2982 /* there can already be other linked works, inherit and set */ 2983 work_flags |= *bits & WORK_STRUCT_LINKED; 2984 work_color = get_work_color(*bits); 2985 __set_bit(WORK_STRUCT_LINKED_BIT, bits); 2986 } 2987 2988 pwq->nr_in_flight[work_color]++; 2989 work_flags |= work_color_to_flags(work_color); 2990 2991 insert_work(pwq, &barr->work, head, work_flags); 2992 } 2993 2994 /** 2995 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing 2996 * @wq: workqueue being flushed 2997 * @flush_color: new flush color, < 0 for no-op 2998 * @work_color: new work color, < 0 for no-op 2999 * 3000 * Prepare pwqs for workqueue flushing. 3001 * 3002 * If @flush_color is non-negative, flush_color on all pwqs should be 3003 * -1. If no pwq has in-flight commands at the specified color, all 3004 * pwq->flush_color's stay at -1 and %false is returned. If any pwq 3005 * has in flight commands, its pwq->flush_color is set to 3006 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq 3007 * wakeup logic is armed and %true is returned. 3008 * 3009 * The caller should have initialized @wq->first_flusher prior to 3010 * calling this function with non-negative @flush_color. If 3011 * @flush_color is negative, no flush color update is done and %false 3012 * is returned. 3013 * 3014 * If @work_color is non-negative, all pwqs should have the same 3015 * work_color which is previous to @work_color and all will be 3016 * advanced to @work_color. 3017 * 3018 * CONTEXT: 3019 * mutex_lock(wq->mutex). 3020 * 3021 * Return: 3022 * %true if @flush_color >= 0 and there's something to flush. %false 3023 * otherwise. 3024 */ 3025 static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, 3026 int flush_color, int work_color) 3027 { 3028 bool wait = false; 3029 struct pool_workqueue *pwq; 3030 3031 if (flush_color >= 0) { 3032 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); 3033 atomic_set(&wq->nr_pwqs_to_flush, 1); 3034 } 3035 3036 for_each_pwq(pwq, wq) { 3037 struct worker_pool *pool = pwq->pool; 3038 3039 raw_spin_lock_irq(&pool->lock); 3040 3041 if (flush_color >= 0) { 3042 WARN_ON_ONCE(pwq->flush_color != -1); 3043 3044 if (pwq->nr_in_flight[flush_color]) { 3045 pwq->flush_color = flush_color; 3046 atomic_inc(&wq->nr_pwqs_to_flush); 3047 wait = true; 3048 } 3049 } 3050 3051 if (work_color >= 0) { 3052 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); 3053 pwq->work_color = work_color; 3054 } 3055 3056 raw_spin_unlock_irq(&pool->lock); 3057 } 3058 3059 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) 3060 complete(&wq->first_flusher->done); 3061 3062 return wait; 3063 } 3064 3065 /** 3066 * __flush_workqueue - ensure that any scheduled work has run to completion. 3067 * @wq: workqueue to flush 3068 * 3069 * This function sleeps until all work items which were queued on entry 3070 * have finished execution, but it is not livelocked by new incoming ones. 
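 *
 * Most users reach this through the flush_workqueue() wrapper. A minimal,
 * illustrative sketch (my_wq and my_work are hypothetical names):
 *
 *   queue_work(my_wq, &my_work);
 *   flush_workqueue(my_wq);
 *
 * On return from the flush, my_work and every other item that was queued
 * on my_wq before the flush started have finished executing.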
3071 */ 3072 void __flush_workqueue(struct workqueue_struct *wq) 3073 { 3074 struct wq_flusher this_flusher = { 3075 .list = LIST_HEAD_INIT(this_flusher.list), 3076 .flush_color = -1, 3077 .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), 3078 }; 3079 int next_color; 3080 3081 if (WARN_ON(!wq_online)) 3082 return; 3083 3084 lock_map_acquire(&wq->lockdep_map); 3085 lock_map_release(&wq->lockdep_map); 3086 3087 mutex_lock(&wq->mutex); 3088 3089 /* 3090 * Start-to-wait phase 3091 */ 3092 next_color = work_next_color(wq->work_color); 3093 3094 if (next_color != wq->flush_color) { 3095 /* 3096 * Color space is not full. The current work_color 3097 * becomes our flush_color and work_color is advanced 3098 * by one. 3099 */ 3100 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); 3101 this_flusher.flush_color = wq->work_color; 3102 wq->work_color = next_color; 3103 3104 if (!wq->first_flusher) { 3105 /* no flush in progress, become the first flusher */ 3106 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); 3107 3108 wq->first_flusher = &this_flusher; 3109 3110 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, 3111 wq->work_color)) { 3112 /* nothing to flush, done */ 3113 wq->flush_color = next_color; 3114 wq->first_flusher = NULL; 3115 goto out_unlock; 3116 } 3117 } else { 3118 /* wait in queue */ 3119 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); 3120 list_add_tail(&this_flusher.list, &wq->flusher_queue); 3121 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 3122 } 3123 } else { 3124 /* 3125 * Oops, color space is full, wait on overflow queue. 3126 * The next flush completion will assign us 3127 * flush_color and transfer to flusher_queue. 3128 */ 3129 list_add_tail(&this_flusher.list, &wq->flusher_overflow); 3130 } 3131 3132 check_flush_dependency(wq, NULL); 3133 3134 mutex_unlock(&wq->mutex); 3135 3136 wait_for_completion(&this_flusher.done); 3137 3138 /* 3139 * Wake-up-and-cascade phase 3140 * 3141 * First flushers are responsible for cascading flushes and 3142 * handling overflow. Non-first flushers can simply return. 3143 */ 3144 if (READ_ONCE(wq->first_flusher) != &this_flusher) 3145 return; 3146 3147 mutex_lock(&wq->mutex); 3148 3149 /* we might have raced, check again with mutex held */ 3150 if (wq->first_flusher != &this_flusher) 3151 goto out_unlock; 3152 3153 WRITE_ONCE(wq->first_flusher, NULL); 3154 3155 WARN_ON_ONCE(!list_empty(&this_flusher.list)); 3156 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); 3157 3158 while (true) { 3159 struct wq_flusher *next, *tmp; 3160 3161 /* complete all the flushers sharing the current flush color */ 3162 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { 3163 if (next->flush_color != wq->flush_color) 3164 break; 3165 list_del_init(&next->list); 3166 complete(&next->done); 3167 } 3168 3169 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && 3170 wq->flush_color != work_next_color(wq->work_color)); 3171 3172 /* this flush_color is finished, advance by one */ 3173 wq->flush_color = work_next_color(wq->flush_color); 3174 3175 /* one color has been freed, handle overflow queue */ 3176 if (!list_empty(&wq->flusher_overflow)) { 3177 /* 3178 * Assign the same color to all overflowed 3179 * flushers, advance work_color and append to 3180 * flusher_queue. This is the start-to-wait 3181 * phase for these overflowed flushers. 
3182 */ 3183 list_for_each_entry(tmp, &wq->flusher_overflow, list) 3184 tmp->flush_color = wq->work_color; 3185 3186 wq->work_color = work_next_color(wq->work_color); 3187 3188 list_splice_tail_init(&wq->flusher_overflow, 3189 &wq->flusher_queue); 3190 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 3191 } 3192 3193 if (list_empty(&wq->flusher_queue)) { 3194 WARN_ON_ONCE(wq->flush_color != wq->work_color); 3195 break; 3196 } 3197 3198 /* 3199 * Need to flush more colors. Make the next flusher 3200 * the new first flusher and arm pwqs. 3201 */ 3202 WARN_ON_ONCE(wq->flush_color == wq->work_color); 3203 WARN_ON_ONCE(wq->flush_color != next->flush_color); 3204 3205 list_del_init(&next->list); 3206 wq->first_flusher = next; 3207 3208 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) 3209 break; 3210 3211 /* 3212 * Meh... this color is already done, clear first 3213 * flusher and repeat cascading. 3214 */ 3215 wq->first_flusher = NULL; 3216 } 3217 3218 out_unlock: 3219 mutex_unlock(&wq->mutex); 3220 } 3221 EXPORT_SYMBOL(__flush_workqueue); 3222 3223 /** 3224 * drain_workqueue - drain a workqueue 3225 * @wq: workqueue to drain 3226 * 3227 * Wait until the workqueue becomes empty. While draining is in progress, 3228 * only chain queueing is allowed. IOW, only currently pending or running 3229 * work items on @wq can queue further work items on it. @wq is flushed 3230 * repeatedly until it becomes empty. The number of flushes is determined 3231 * by the depth of chaining and should be relatively small. Whine if it 3232 * takes too long. 3233 */ 3234 void drain_workqueue(struct workqueue_struct *wq) 3235 { 3236 unsigned int flush_cnt = 0; 3237 struct pool_workqueue *pwq; 3238 3239 /* 3240 * __queue_work() needs to test whether there are drainers; it is much 3241 * hotter than drain_workqueue() and already looks at @wq->flags. 3242 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
3243 */ 3244 mutex_lock(&wq->mutex); 3245 if (!wq->nr_drainers++) 3246 wq->flags |= __WQ_DRAINING; 3247 mutex_unlock(&wq->mutex); 3248 reflush: 3249 __flush_workqueue(wq); 3250 3251 mutex_lock(&wq->mutex); 3252 3253 for_each_pwq(pwq, wq) { 3254 bool drained; 3255 3256 raw_spin_lock_irq(&pwq->pool->lock); 3257 drained = !pwq->nr_active && list_empty(&pwq->inactive_works); 3258 raw_spin_unlock_irq(&pwq->pool->lock); 3259 3260 if (drained) 3261 continue; 3262 3263 if (++flush_cnt == 10 || 3264 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 3265 pr_warn("workqueue %s: %s() isn't complete after %u tries\n", 3266 wq->name, __func__, flush_cnt); 3267 3268 mutex_unlock(&wq->mutex); 3269 goto reflush; 3270 } 3271 3272 if (!--wq->nr_drainers) 3273 wq->flags &= ~__WQ_DRAINING; 3274 mutex_unlock(&wq->mutex); 3275 } 3276 EXPORT_SYMBOL_GPL(drain_workqueue); 3277 3278 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 3279 bool from_cancel) 3280 { 3281 struct worker *worker = NULL; 3282 struct worker_pool *pool; 3283 struct pool_workqueue *pwq; 3284 3285 might_sleep(); 3286 3287 rcu_read_lock(); 3288 pool = get_work_pool(work); 3289 if (!pool) { 3290 rcu_read_unlock(); 3291 return false; 3292 } 3293 3294 raw_spin_lock_irq(&pool->lock); 3295 /* see the comment in try_to_grab_pending() with the same code */ 3296 pwq = get_work_pwq(work); 3297 if (pwq) { 3298 if (unlikely(pwq->pool != pool)) 3299 goto already_gone; 3300 } else { 3301 worker = find_worker_executing_work(pool, work); 3302 if (!worker) 3303 goto already_gone; 3304 pwq = worker->current_pwq; 3305 } 3306 3307 check_flush_dependency(pwq->wq, work); 3308 3309 insert_wq_barrier(pwq, barr, work, worker); 3310 raw_spin_unlock_irq(&pool->lock); 3311 3312 /* 3313 * Force a lock recursion deadlock when using flush_work() inside a 3314 * single-threaded or rescuer equipped workqueue. 3315 * 3316 * For single threaded workqueues the deadlock happens when the work 3317 * is after the work issuing the flush_work(). For rescuer equipped 3318 * workqueues the deadlock happens when the rescuer stalls, blocking 3319 * forward progress. 3320 */ 3321 if (!from_cancel && 3322 (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) { 3323 lock_map_acquire(&pwq->wq->lockdep_map); 3324 lock_map_release(&pwq->wq->lockdep_map); 3325 } 3326 rcu_read_unlock(); 3327 return true; 3328 already_gone: 3329 raw_spin_unlock_irq(&pool->lock); 3330 rcu_read_unlock(); 3331 return false; 3332 } 3333 3334 static bool __flush_work(struct work_struct *work, bool from_cancel) 3335 { 3336 struct wq_barrier barr; 3337 3338 if (WARN_ON(!wq_online)) 3339 return false; 3340 3341 if (WARN_ON(!work->func)) 3342 return false; 3343 3344 lock_map_acquire(&work->lockdep_map); 3345 lock_map_release(&work->lockdep_map); 3346 3347 if (start_flush_work(work, &barr, from_cancel)) { 3348 wait_for_completion(&barr.done); 3349 destroy_work_on_stack(&barr.work); 3350 return true; 3351 } else { 3352 return false; 3353 } 3354 } 3355 3356 /** 3357 * flush_work - wait for a work to finish executing the last queueing instance 3358 * @work: the work to flush 3359 * 3360 * Wait until @work has finished execution. @work is guaranteed to be idle 3361 * on return if it hasn't been requeued since flush started. 3362 * 3363 * Return: 3364 * %true if flush_work() waited for the work to finish execution, 3365 * %false if it was already idle. 
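 *
 * A minimal illustrative sketch (my_work is a hypothetical work item
 * owned by the caller):
 *
 *   schedule_work(&my_work);
 *   flush_work(&my_work);
 *
 * After the flush returns, that queueing instance of my_work has finished
 * executing, though the item may have been requeued since.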
3366 */ 3367 bool flush_work(struct work_struct *work) 3368 { 3369 return __flush_work(work, false); 3370 } 3371 EXPORT_SYMBOL_GPL(flush_work); 3372 3373 struct cwt_wait { 3374 wait_queue_entry_t wait; 3375 struct work_struct *work; 3376 }; 3377 3378 static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 3379 { 3380 struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); 3381 3382 if (cwait->work != key) 3383 return 0; 3384 return autoremove_wake_function(wait, mode, sync, key); 3385 } 3386 3387 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) 3388 { 3389 static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); 3390 unsigned long flags; 3391 int ret; 3392 3393 do { 3394 ret = try_to_grab_pending(work, is_dwork, &flags); 3395 /* 3396 * If someone else is already canceling, wait for it to 3397 * finish. flush_work() doesn't work for PREEMPT_NONE 3398 * because we may get scheduled between @work's completion 3399 * and the other canceling task resuming and clearing 3400 * CANCELING - flush_work() will return false immediately 3401 * as @work is no longer busy, try_to_grab_pending() will 3402 * return -ENOENT as @work is still being canceled and the 3403 * other canceling task won't be able to clear CANCELING as 3404 * we're hogging the CPU. 3405 * 3406 * Let's wait for completion using a waitqueue. As this 3407 * may lead to the thundering herd problem, use a custom 3408 * wake function which matches @work along with exclusive 3409 * wait and wakeup. 3410 */ 3411 if (unlikely(ret == -ENOENT)) { 3412 struct cwt_wait cwait; 3413 3414 init_wait(&cwait.wait); 3415 cwait.wait.func = cwt_wakefn; 3416 cwait.work = work; 3417 3418 prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, 3419 TASK_UNINTERRUPTIBLE); 3420 if (work_is_canceling(work)) 3421 schedule(); 3422 finish_wait(&cancel_waitq, &cwait.wait); 3423 } 3424 } while (unlikely(ret < 0)); 3425 3426 /* tell other tasks trying to grab @work to back off */ 3427 mark_work_canceling(work); 3428 local_irq_restore(flags); 3429 3430 /* 3431 * This allows canceling during early boot. We know that @work 3432 * isn't executing. 3433 */ 3434 if (wq_online) 3435 __flush_work(work, true); 3436 3437 clear_work_data(work); 3438 3439 /* 3440 * Paired with prepare_to_wait() above so that either 3441 * waitqueue_active() is visible here or !work_is_canceling() is 3442 * visible there. 3443 */ 3444 smp_mb(); 3445 if (waitqueue_active(&cancel_waitq)) 3446 __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); 3447 3448 return ret; 3449 } 3450 3451 /** 3452 * cancel_work_sync - cancel a work and wait for it to finish 3453 * @work: the work to cancel 3454 * 3455 * Cancel @work and wait for its execution to finish. This function 3456 * can be used even if the work re-queues itself or migrates to 3457 * another workqueue. On return from this function, @work is 3458 * guaranteed to be not pending or executing on any CPU. 3459 * 3460 * cancel_work_sync(&delayed_work->work) must not be used for 3461 * delayed_work's. Use cancel_delayed_work_sync() instead. 3462 * 3463 * The caller must ensure that the workqueue on which @work was last 3464 * queued can't be destroyed before this function returns. 3465 * 3466 * Return: 3467 * %true if @work was pending, %false otherwise. 
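 *
 * A typical, illustrative teardown sketch (my_work and my_dev are
 * hypothetical names for a work item embedded in a driver-private object):
 *
 *   cancel_work_sync(&my_dev->my_work);
 *   kfree(my_dev);
 *
 * i.e. the containing object is freed only once the work can no longer run.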
3468 */ 3469 bool cancel_work_sync(struct work_struct *work) 3470 { 3471 return __cancel_work_timer(work, false); 3472 } 3473 EXPORT_SYMBOL_GPL(cancel_work_sync); 3474 3475 /** 3476 * flush_delayed_work - wait for a dwork to finish executing the last queueing 3477 * @dwork: the delayed work to flush 3478 * 3479 * Delayed timer is cancelled and the pending work is queued for 3480 * immediate execution. Like flush_work(), this function only 3481 * considers the last queueing instance of @dwork. 3482 * 3483 * Return: 3484 * %true if flush_work() waited for the work to finish execution, 3485 * %false if it was already idle. 3486 */ 3487 bool flush_delayed_work(struct delayed_work *dwork) 3488 { 3489 local_irq_disable(); 3490 if (del_timer_sync(&dwork->timer)) 3491 __queue_work(dwork->cpu, dwork->wq, &dwork->work); 3492 local_irq_enable(); 3493 return flush_work(&dwork->work); 3494 } 3495 EXPORT_SYMBOL(flush_delayed_work); 3496 3497 /** 3498 * flush_rcu_work - wait for a rwork to finish executing the last queueing 3499 * @rwork: the rcu work to flush 3500 * 3501 * Return: 3502 * %true if flush_rcu_work() waited for the work to finish execution, 3503 * %false if it was already idle. 3504 */ 3505 bool flush_rcu_work(struct rcu_work *rwork) 3506 { 3507 if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { 3508 rcu_barrier(); 3509 flush_work(&rwork->work); 3510 return true; 3511 } else { 3512 return flush_work(&rwork->work); 3513 } 3514 } 3515 EXPORT_SYMBOL(flush_rcu_work); 3516 3517 static bool __cancel_work(struct work_struct *work, bool is_dwork) 3518 { 3519 unsigned long flags; 3520 int ret; 3521 3522 do { 3523 ret = try_to_grab_pending(work, is_dwork, &flags); 3524 } while (unlikely(ret == -EAGAIN)); 3525 3526 if (unlikely(ret < 0)) 3527 return false; 3528 3529 set_work_pool_and_clear_pending(work, get_work_pool_id(work)); 3530 local_irq_restore(flags); 3531 return ret; 3532 } 3533 3534 /* 3535 * See cancel_delayed_work() 3536 */ 3537 bool cancel_work(struct work_struct *work) 3538 { 3539 return __cancel_work(work, false); 3540 } 3541 EXPORT_SYMBOL(cancel_work); 3542 3543 /** 3544 * cancel_delayed_work - cancel a delayed work 3545 * @dwork: delayed_work to cancel 3546 * 3547 * Kill off a pending delayed_work. 3548 * 3549 * Return: %true if @dwork was pending and canceled; %false if it wasn't 3550 * pending. 3551 * 3552 * Note: 3553 * The work callback function may still be running on return, unless 3554 * it returns %true and the work doesn't re-arm itself. Explicitly flush or 3555 * use cancel_delayed_work_sync() to wait on it. 3556 * 3557 * This function is safe to call from any context including IRQ handler. 3558 */ 3559 bool cancel_delayed_work(struct delayed_work *dwork) 3560 { 3561 return __cancel_work(&dwork->work, true); 3562 } 3563 EXPORT_SYMBOL(cancel_delayed_work); 3564 3565 /** 3566 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish 3567 * @dwork: the delayed work cancel 3568 * 3569 * This is cancel_work_sync() for delayed works. 3570 * 3571 * Return: 3572 * %true if @dwork was pending, %false otherwise. 
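 *
 * A minimal self-rearming poller sketch (illustrative; foo_dev, foo_poll_fn()
 * and the 500ms period are hypothetical):
 *
 *	static void foo_poll_fn(struct work_struct *work)
 *	{
 *		struct foo_dev *foo = container_of(to_delayed_work(work),
 *						   struct foo_dev, poll_dwork);
 *		// ... poll the hardware ...
 *		queue_delayed_work(system_wq, &foo->poll_dwork,
 *				   msecs_to_jiffies(500));
 *	}
 *
 *	// setup
 *	INIT_DELAYED_WORK(&foo->poll_dwork, foo_poll_fn);
 *	queue_delayed_work(system_wq, &foo->poll_dwork, msecs_to_jiffies(500));
 *
 *	// teardown: also stops the self-rearming above
 *	cancel_delayed_work_sync(&foo->poll_dwork);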
3573 */ 3574 bool cancel_delayed_work_sync(struct delayed_work *dwork) 3575 { 3576 return __cancel_work_timer(&dwork->work, true); 3577 } 3578 EXPORT_SYMBOL(cancel_delayed_work_sync); 3579 3580 /** 3581 * schedule_on_each_cpu - execute a function synchronously on each online CPU 3582 * @func: the function to call 3583 * 3584 * schedule_on_each_cpu() executes @func on each online CPU using the 3585 * system workqueue and blocks until all CPUs have completed. 3586 * schedule_on_each_cpu() is very slow. 3587 * 3588 * Return: 3589 * 0 on success, -errno on failure. 3590 */ 3591 int schedule_on_each_cpu(work_func_t func) 3592 { 3593 int cpu; 3594 struct work_struct __percpu *works; 3595 3596 works = alloc_percpu(struct work_struct); 3597 if (!works) 3598 return -ENOMEM; 3599 3600 cpus_read_lock(); 3601 3602 for_each_online_cpu(cpu) { 3603 struct work_struct *work = per_cpu_ptr(works, cpu); 3604 3605 INIT_WORK(work, func); 3606 schedule_work_on(cpu, work); 3607 } 3608 3609 for_each_online_cpu(cpu) 3610 flush_work(per_cpu_ptr(works, cpu)); 3611 3612 cpus_read_unlock(); 3613 free_percpu(works); 3614 return 0; 3615 } 3616 3617 /** 3618 * execute_in_process_context - reliably execute the routine with user context 3619 * @fn: the function to execute 3620 * @ew: guaranteed storage for the execute work structure (must 3621 * be available when the work executes) 3622 * 3623 * Executes the function immediately if process context is available, 3624 * otherwise schedules the function for delayed execution. 3625 * 3626 * Return: 0 - function was executed 3627 * 1 - function was scheduled for execution 3628 */ 3629 int execute_in_process_context(work_func_t fn, struct execute_work *ew) 3630 { 3631 if (!in_interrupt()) { 3632 fn(&ew->work); 3633 return 0; 3634 } 3635 3636 INIT_WORK(&ew->work, fn); 3637 schedule_work(&ew->work); 3638 3639 return 1; 3640 } 3641 EXPORT_SYMBOL_GPL(execute_in_process_context); 3642 3643 /** 3644 * free_workqueue_attrs - free a workqueue_attrs 3645 * @attrs: workqueue_attrs to free 3646 * 3647 * Undo alloc_workqueue_attrs(). 3648 */ 3649 void free_workqueue_attrs(struct workqueue_attrs *attrs) 3650 { 3651 if (attrs) { 3652 free_cpumask_var(attrs->cpumask); 3653 kfree(attrs); 3654 } 3655 } 3656 3657 /** 3658 * alloc_workqueue_attrs - allocate a workqueue_attrs 3659 * 3660 * Allocate a new workqueue_attrs, initialize with default settings and 3661 * return it. 3662 * 3663 * Return: The allocated new workqueue_attr on success. %NULL on failure. 3664 */ 3665 struct workqueue_attrs *alloc_workqueue_attrs(void) 3666 { 3667 struct workqueue_attrs *attrs; 3668 3669 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); 3670 if (!attrs) 3671 goto fail; 3672 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) 3673 goto fail; 3674 3675 cpumask_copy(attrs->cpumask, cpu_possible_mask); 3676 attrs->affn_scope = wq_affn_dfl; 3677 return attrs; 3678 fail: 3679 free_workqueue_attrs(attrs); 3680 return NULL; 3681 } 3682 3683 static void copy_workqueue_attrs(struct workqueue_attrs *to, 3684 const struct workqueue_attrs *from) 3685 { 3686 to->nice = from->nice; 3687 cpumask_copy(to->cpumask, from->cpumask); 3688 3689 /* 3690 * Unlike hash and equality test, copying shouldn't ignore wq-only 3691 * fields as copying is used for both pool and wq attrs. Instead, 3692 * get_unbound_pool() explicitly clears the fields. 3693 */ 3694 to->affn_scope = from->affn_scope; 3695 to->ordered = from->ordered; 3696 } 3697 3698 /* 3699 * Some attrs fields are workqueue-only. Clear them for worker_pool's. 
See the 3700 * comments in 'struct workqueue_attrs' definition. 3701 */ 3702 static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) 3703 { 3704 attrs->affn_scope = WQ_AFFN_NR_TYPES; 3705 attrs->ordered = false; 3706 } 3707 3708 /* hash value of the content of @attr */ 3709 static u32 wqattrs_hash(const struct workqueue_attrs *attrs) 3710 { 3711 u32 hash = 0; 3712 3713 hash = jhash_1word(attrs->nice, hash); 3714 hash = jhash(cpumask_bits(attrs->cpumask), 3715 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); 3716 return hash; 3717 } 3718 3719 /* content equality test */ 3720 static bool wqattrs_equal(const struct workqueue_attrs *a, 3721 const struct workqueue_attrs *b) 3722 { 3723 if (a->nice != b->nice) 3724 return false; 3725 if (!cpumask_equal(a->cpumask, b->cpumask)) 3726 return false; 3727 return true; 3728 } 3729 3730 /* Update @attrs with actually available CPUs */ 3731 static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, 3732 const cpumask_t *unbound_cpumask) 3733 { 3734 /* 3735 * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If 3736 * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to 3737 * @unbound_cpumask. 3738 */ 3739 cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); 3740 if (unlikely(cpumask_empty(attrs->cpumask))) 3741 cpumask_copy(attrs->cpumask, unbound_cpumask); 3742 } 3743 3744 /* find wq_pod_type to use for @attrs */ 3745 static const struct wq_pod_type * 3746 wqattrs_pod_type(const struct workqueue_attrs *attrs) 3747 { 3748 struct wq_pod_type *pt = &wq_pod_types[attrs->affn_scope]; 3749 3750 if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && 3751 likely(pt->nr_pods)) 3752 return pt; 3753 3754 /* 3755 * Before workqueue_init_topology(), only SYSTEM is available which is 3756 * initialized in workqueue_init_early(). 3757 */ 3758 pt = &wq_pod_types[WQ_AFFN_SYSTEM]; 3759 BUG_ON(!pt->nr_pods); 3760 return pt; 3761 } 3762 3763 /** 3764 * init_worker_pool - initialize a newly zalloc'd worker_pool 3765 * @pool: worker_pool to initialize 3766 * 3767 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. 3768 * 3769 * Return: 0 on success, -errno on failure. Even on failure, all fields 3770 * inside @pool proper are initialized and put_unbound_pool() can be called 3771 * on @pool safely to release it. 
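 *
 * A minimal caller sketch (illustrative; it simply mirrors the pattern used
 * by get_unbound_pool() below and relies on the guarantee above):
 *
 *	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
 *	if (!pool || init_worker_pool(pool) < 0)
 *		goto fail;
 *	...
 *  fail:
 *	if (pool)
 *		put_unbound_pool(pool);	// safe even if init failed midway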
3772 */ 3773 static int init_worker_pool(struct worker_pool *pool) 3774 { 3775 raw_spin_lock_init(&pool->lock); 3776 pool->id = -1; 3777 pool->cpu = -1; 3778 pool->node = NUMA_NO_NODE; 3779 pool->flags |= POOL_DISASSOCIATED; 3780 pool->watchdog_ts = jiffies; 3781 INIT_LIST_HEAD(&pool->worklist); 3782 INIT_LIST_HEAD(&pool->idle_list); 3783 hash_init(pool->busy_hash); 3784 3785 timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); 3786 INIT_WORK(&pool->idle_cull_work, idle_cull_fn); 3787 3788 timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); 3789 3790 INIT_LIST_HEAD(&pool->workers); 3791 INIT_LIST_HEAD(&pool->dying_workers); 3792 3793 ida_init(&pool->worker_ida); 3794 INIT_HLIST_NODE(&pool->hash_node); 3795 pool->refcnt = 1; 3796 3797 /* shouldn't fail above this point */ 3798 pool->attrs = alloc_workqueue_attrs(); 3799 if (!pool->attrs) 3800 return -ENOMEM; 3801 3802 wqattrs_clear_for_pool(pool->attrs); 3803 3804 return 0; 3805 } 3806 3807 #ifdef CONFIG_LOCKDEP 3808 static void wq_init_lockdep(struct workqueue_struct *wq) 3809 { 3810 char *lock_name; 3811 3812 lockdep_register_key(&wq->key); 3813 lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); 3814 if (!lock_name) 3815 lock_name = wq->name; 3816 3817 wq->lock_name = lock_name; 3818 lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); 3819 } 3820 3821 static void wq_unregister_lockdep(struct workqueue_struct *wq) 3822 { 3823 lockdep_unregister_key(&wq->key); 3824 } 3825 3826 static void wq_free_lockdep(struct workqueue_struct *wq) 3827 { 3828 if (wq->lock_name != wq->name) 3829 kfree(wq->lock_name); 3830 } 3831 #else 3832 static void wq_init_lockdep(struct workqueue_struct *wq) 3833 { 3834 } 3835 3836 static void wq_unregister_lockdep(struct workqueue_struct *wq) 3837 { 3838 } 3839 3840 static void wq_free_lockdep(struct workqueue_struct *wq) 3841 { 3842 } 3843 #endif 3844 3845 static void rcu_free_wq(struct rcu_head *rcu) 3846 { 3847 struct workqueue_struct *wq = 3848 container_of(rcu, struct workqueue_struct, rcu); 3849 3850 wq_free_lockdep(wq); 3851 free_percpu(wq->cpu_pwq); 3852 free_workqueue_attrs(wq->unbound_attrs); 3853 kfree(wq); 3854 } 3855 3856 static void rcu_free_pool(struct rcu_head *rcu) 3857 { 3858 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 3859 3860 ida_destroy(&pool->worker_ida); 3861 free_workqueue_attrs(pool->attrs); 3862 kfree(pool); 3863 } 3864 3865 /** 3866 * put_unbound_pool - put a worker_pool 3867 * @pool: worker_pool to put 3868 * 3869 * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU 3870 * safe manner. get_unbound_pool() calls this function on its failure path 3871 * and this function should be able to release pools which went through, 3872 * successfully or not, init_worker_pool(). 3873 * 3874 * Should be called with wq_pool_mutex held. 3875 */ 3876 static void put_unbound_pool(struct worker_pool *pool) 3877 { 3878 DECLARE_COMPLETION_ONSTACK(detach_completion); 3879 struct worker *worker; 3880 LIST_HEAD(cull_list); 3881 3882 lockdep_assert_held(&wq_pool_mutex); 3883 3884 if (--pool->refcnt) 3885 return; 3886 3887 /* sanity checks */ 3888 if (WARN_ON(!(pool->cpu < 0)) || 3889 WARN_ON(!list_empty(&pool->worklist))) 3890 return; 3891 3892 /* release id and unhash */ 3893 if (pool->id >= 0) 3894 idr_remove(&worker_pool_idr, pool->id); 3895 hash_del(&pool->hash_node); 3896 3897 /* 3898 * Become the manager and destroy all workers. This prevents 3899 * @pool's workers from blocking on attach_mutex. 
We're the last 3900 * manager and @pool gets freed with the flag set. 3901 * 3902 * Having a concurrent manager is quite unlikely to happen as we can 3903 * only get here with 3904 * pwq->refcnt == pool->refcnt == 0 3905 * which implies no work queued to the pool, which implies no worker can 3906 * become the manager. However a worker could have taken the role of 3907 * manager before the refcnts dropped to 0, since maybe_create_worker() 3908 * drops pool->lock 3909 */ 3910 while (true) { 3911 rcuwait_wait_event(&manager_wait, 3912 !(pool->flags & POOL_MANAGER_ACTIVE), 3913 TASK_UNINTERRUPTIBLE); 3914 3915 mutex_lock(&wq_pool_attach_mutex); 3916 raw_spin_lock_irq(&pool->lock); 3917 if (!(pool->flags & POOL_MANAGER_ACTIVE)) { 3918 pool->flags |= POOL_MANAGER_ACTIVE; 3919 break; 3920 } 3921 raw_spin_unlock_irq(&pool->lock); 3922 mutex_unlock(&wq_pool_attach_mutex); 3923 } 3924 3925 while ((worker = first_idle_worker(pool))) 3926 set_worker_dying(worker, &cull_list); 3927 WARN_ON(pool->nr_workers || pool->nr_idle); 3928 raw_spin_unlock_irq(&pool->lock); 3929 3930 wake_dying_workers(&cull_list); 3931 3932 if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) 3933 pool->detach_completion = &detach_completion; 3934 mutex_unlock(&wq_pool_attach_mutex); 3935 3936 if (pool->detach_completion) 3937 wait_for_completion(pool->detach_completion); 3938 3939 /* shut down the timers */ 3940 del_timer_sync(&pool->idle_timer); 3941 cancel_work_sync(&pool->idle_cull_work); 3942 del_timer_sync(&pool->mayday_timer); 3943 3944 /* RCU protected to allow dereferences from get_work_pool() */ 3945 call_rcu(&pool->rcu, rcu_free_pool); 3946 } 3947 3948 /** 3949 * get_unbound_pool - get a worker_pool with the specified attributes 3950 * @attrs: the attributes of the worker_pool to get 3951 * 3952 * Obtain a worker_pool which has the same attributes as @attrs, bump the 3953 * reference count and return it. If there already is a matching 3954 * worker_pool, it will be used; otherwise, this function attempts to 3955 * create a new one. 3956 * 3957 * Should be called with wq_pool_mutex held. 3958 * 3959 * Return: On success, a worker_pool with the same attributes as @attrs. 3960 * On failure, %NULL. 3961 */ 3962 static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) 3963 { 3964 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; 3965 u32 hash = wqattrs_hash(attrs); 3966 struct worker_pool *pool; 3967 int pod, node = NUMA_NO_NODE; 3968 3969 lockdep_assert_held(&wq_pool_mutex); 3970 3971 /* do we already have a matching pool? 
*/ 3972 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { 3973 if (wqattrs_equal(pool->attrs, attrs)) { 3974 pool->refcnt++; 3975 return pool; 3976 } 3977 } 3978 3979 /* If cpumask is contained inside a NUMA pod, that's our NUMA node */ 3980 for (pod = 0; pod < pt->nr_pods; pod++) { 3981 if (cpumask_subset(attrs->cpumask, pt->pod_cpus[pod])) { 3982 node = pt->pod_node[pod]; 3983 break; 3984 } 3985 } 3986 3987 /* nope, create a new one */ 3988 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); 3989 if (!pool || init_worker_pool(pool) < 0) 3990 goto fail; 3991 3992 pool->node = node; 3993 copy_workqueue_attrs(pool->attrs, attrs); 3994 wqattrs_clear_for_pool(pool->attrs); 3995 3996 if (worker_pool_assign_id(pool) < 0) 3997 goto fail; 3998 3999 /* create and start the initial worker */ 4000 if (wq_online && !create_worker(pool)) 4001 goto fail; 4002 4003 /* install */ 4004 hash_add(unbound_pool_hash, &pool->hash_node, hash); 4005 4006 return pool; 4007 fail: 4008 if (pool) 4009 put_unbound_pool(pool); 4010 return NULL; 4011 } 4012 4013 static void rcu_free_pwq(struct rcu_head *rcu) 4014 { 4015 kmem_cache_free(pwq_cache, 4016 container_of(rcu, struct pool_workqueue, rcu)); 4017 } 4018 4019 /* 4020 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero 4021 * refcnt and needs to be destroyed. 4022 */ 4023 static void pwq_release_workfn(struct kthread_work *work) 4024 { 4025 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, 4026 release_work); 4027 struct workqueue_struct *wq = pwq->wq; 4028 struct worker_pool *pool = pwq->pool; 4029 bool is_last = false; 4030 4031 /* 4032 * When @pwq is not linked, it doesn't hold any reference to the 4033 * @wq, and @wq is invalid to access. 4034 */ 4035 if (!list_empty(&pwq->pwqs_node)) { 4036 mutex_lock(&wq->mutex); 4037 list_del_rcu(&pwq->pwqs_node); 4038 is_last = list_empty(&wq->pwqs); 4039 mutex_unlock(&wq->mutex); 4040 } 4041 4042 if (wq->flags & WQ_UNBOUND) { 4043 mutex_lock(&wq_pool_mutex); 4044 put_unbound_pool(pool); 4045 mutex_unlock(&wq_pool_mutex); 4046 } 4047 4048 call_rcu(&pwq->rcu, rcu_free_pwq); 4049 4050 /* 4051 * If we're the last pwq going away, @wq is already dead and no one 4052 * is gonna access it anymore. Schedule RCU free. 4053 */ 4054 if (is_last) { 4055 wq_unregister_lockdep(wq); 4056 call_rcu(&wq->rcu, rcu_free_wq); 4057 } 4058 } 4059 4060 /** 4061 * pwq_adjust_max_active - update a pwq's max_active to the current setting 4062 * @pwq: target pool_workqueue 4063 * 4064 * If @pwq isn't freezing, set @pwq->max_active to the associated 4065 * workqueue's saved_max_active and activate inactive work items 4066 * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. 4067 */ 4068 static void pwq_adjust_max_active(struct pool_workqueue *pwq) 4069 { 4070 struct workqueue_struct *wq = pwq->wq; 4071 bool freezable = wq->flags & WQ_FREEZABLE; 4072 unsigned long flags; 4073 4074 /* for @wq->saved_max_active */ 4075 lockdep_assert_held(&wq->mutex); 4076 4077 /* fast exit for non-freezable wqs */ 4078 if (!freezable && pwq->max_active == wq->saved_max_active) 4079 return; 4080 4081 /* this function can be called during early boot w/ irq disabled */ 4082 raw_spin_lock_irqsave(&pwq->pool->lock, flags); 4083 4084 /* 4085 * During [un]freezing, the caller is responsible for ensuring that 4086 * this function is called at least once after @workqueue_freezing 4087 * is updated and visible. 
4088 */ 4089 if (!freezable || !workqueue_freezing) { 4090 bool kick = false; 4091 4092 pwq->max_active = wq->saved_max_active; 4093 4094 while (!list_empty(&pwq->inactive_works) && 4095 pwq->nr_active < pwq->max_active) { 4096 pwq_activate_first_inactive(pwq); 4097 kick = true; 4098 } 4099 4100 /* 4101 * Need to kick a worker after thawed or an unbound wq's 4102 * max_active is bumped. In realtime scenarios, always kicking a 4103 * worker will cause interference on the isolated cpu cores, so 4104 * let's kick iff work items were activated. 4105 */ 4106 if (kick) 4107 wake_up_worker(pwq->pool); 4108 } else { 4109 pwq->max_active = 0; 4110 } 4111 4112 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); 4113 } 4114 4115 /* initialize newly allocated @pwq which is associated with @wq and @pool */ 4116 static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, 4117 struct worker_pool *pool) 4118 { 4119 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); 4120 4121 memset(pwq, 0, sizeof(*pwq)); 4122 4123 pwq->pool = pool; 4124 pwq->wq = wq; 4125 pwq->flush_color = -1; 4126 pwq->refcnt = 1; 4127 INIT_LIST_HEAD(&pwq->inactive_works); 4128 INIT_LIST_HEAD(&pwq->pwqs_node); 4129 INIT_LIST_HEAD(&pwq->mayday_node); 4130 kthread_init_work(&pwq->release_work, pwq_release_workfn); 4131 } 4132 4133 /* sync @pwq with the current state of its associated wq and link it */ 4134 static void link_pwq(struct pool_workqueue *pwq) 4135 { 4136 struct workqueue_struct *wq = pwq->wq; 4137 4138 lockdep_assert_held(&wq->mutex); 4139 4140 /* may be called multiple times, ignore if already linked */ 4141 if (!list_empty(&pwq->pwqs_node)) 4142 return; 4143 4144 /* set the matching work_color */ 4145 pwq->work_color = wq->work_color; 4146 4147 /* sync max_active to the current setting */ 4148 pwq_adjust_max_active(pwq); 4149 4150 /* link in @pwq */ 4151 list_add_rcu(&pwq->pwqs_node, &wq->pwqs); 4152 } 4153 4154 /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ 4155 static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, 4156 const struct workqueue_attrs *attrs) 4157 { 4158 struct worker_pool *pool; 4159 struct pool_workqueue *pwq; 4160 4161 lockdep_assert_held(&wq_pool_mutex); 4162 4163 pool = get_unbound_pool(attrs); 4164 if (!pool) 4165 return NULL; 4166 4167 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); 4168 if (!pwq) { 4169 put_unbound_pool(pool); 4170 return NULL; 4171 } 4172 4173 init_pwq(pwq, wq, pool); 4174 return pwq; 4175 } 4176 4177 /** 4178 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod 4179 * @attrs: the wq_attrs of the default pwq of the target workqueue 4180 * @cpu: the target CPU 4181 * @cpu_going_down: if >= 0, the CPU to consider as offline 4182 * @cpumask: outarg, the resulting cpumask 4183 * 4184 * Calculate the cpumask a workqueue with @attrs should use on @pod. If 4185 * @cpu_going_down is >= 0, that cpu is considered offline during calculation. 4186 * The result is stored in @cpumask. 4187 * 4188 * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled 4189 * and @pod has online CPUs requested by @attrs, the returned cpumask is the 4190 * intersection of the possible CPUs of @pod and @attrs->cpumask. 4191 * 4192 * The caller is responsible for ensuring that the cpumask of @pod stays stable. 
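 *
 * Worked example with assumed (not real) numbers: if the pod containing @cpu
 * spans CPUs 0-3, CPUs 0-5 are online, @attrs->cpumask is 2-5 and CPU 3 is
 * @cpu_going_down, the online intersection is {2}, which is non-empty, so
 * @cpumask ends up as the possible intersection 2-3. Had @attrs->cpumask
 * been 4-5 instead, every intersection would be empty and @cpumask would
 * fall back to the whole of @attrs->cpumask.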
4193 */ 4194 static void wq_calc_pod_cpumask(const struct workqueue_attrs *attrs, int cpu, 4195 int cpu_going_down, cpumask_t *cpumask) 4196 { 4197 const struct wq_pod_type *pt = wqattrs_pod_type(attrs); 4198 int pod = pt->cpu_pod[cpu]; 4199 4200 /* does @pod have any online CPUs @attrs wants? */ 4201 cpumask_and(cpumask, pt->pod_cpus[pod], attrs->cpumask); 4202 cpumask_and(cpumask, cpumask, cpu_online_mask); 4203 if (cpu_going_down >= 0) 4204 cpumask_clear_cpu(cpu_going_down, cpumask); 4205 4206 if (cpumask_empty(cpumask)) { 4207 cpumask_copy(cpumask, attrs->cpumask); 4208 return; 4209 } 4210 4211 /* yeap, return possible CPUs in @pod that @attrs wants */ 4212 cpumask_and(cpumask, attrs->cpumask, pt->pod_cpus[pod]); 4213 4214 if (cpumask_empty(cpumask)) 4215 pr_warn_once("WARNING: workqueue cpumask: online intersect > " 4216 "possible intersect\n"); 4217 } 4218 4219 /* install @pwq into @wq's cpu_pwq and return the old pwq */ 4220 static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, 4221 int cpu, struct pool_workqueue *pwq) 4222 { 4223 struct pool_workqueue *old_pwq; 4224 4225 lockdep_assert_held(&wq_pool_mutex); 4226 lockdep_assert_held(&wq->mutex); 4227 4228 /* link_pwq() can handle duplicate calls */ 4229 link_pwq(pwq); 4230 4231 old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); 4232 rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); 4233 return old_pwq; 4234 } 4235 4236 /* context to store the prepared attrs & pwqs before applying */ 4237 struct apply_wqattrs_ctx { 4238 struct workqueue_struct *wq; /* target workqueue */ 4239 struct workqueue_attrs *attrs; /* attrs to apply */ 4240 struct list_head list; /* queued for batching commit */ 4241 struct pool_workqueue *dfl_pwq; 4242 struct pool_workqueue *pwq_tbl[]; 4243 }; 4244 4245 /* free the resources after success or abort */ 4246 static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) 4247 { 4248 if (ctx) { 4249 int cpu; 4250 4251 for_each_possible_cpu(cpu) 4252 put_pwq_unlocked(ctx->pwq_tbl[cpu]); 4253 put_pwq_unlocked(ctx->dfl_pwq); 4254 4255 free_workqueue_attrs(ctx->attrs); 4256 4257 kfree(ctx); 4258 } 4259 } 4260 4261 /* allocate the attrs and pwqs for later installation */ 4262 static struct apply_wqattrs_ctx * 4263 apply_wqattrs_prepare(struct workqueue_struct *wq, 4264 const struct workqueue_attrs *attrs, 4265 const cpumask_var_t unbound_cpumask) 4266 { 4267 struct apply_wqattrs_ctx *ctx; 4268 struct workqueue_attrs *new_attrs, *tmp_attrs; 4269 int cpu; 4270 4271 lockdep_assert_held(&wq_pool_mutex); 4272 4273 if (WARN_ON(attrs->affn_scope < 0 || 4274 attrs->affn_scope >= WQ_AFFN_NR_TYPES)) 4275 return ERR_PTR(-EINVAL); 4276 4277 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); 4278 4279 new_attrs = alloc_workqueue_attrs(); 4280 tmp_attrs = alloc_workqueue_attrs(); 4281 if (!ctx || !new_attrs || !tmp_attrs) 4282 goto out_free; 4283 4284 /* 4285 * If something goes wrong during CPU up/down, we'll fall back to 4286 * the default pwq covering whole @attrs->cpumask. Always create 4287 * it even if we don't use it immediately. 4288 */ 4289 copy_workqueue_attrs(new_attrs, attrs); 4290 wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); 4291 ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); 4292 if (!ctx->dfl_pwq) 4293 goto out_free; 4294 4295 /* 4296 * We may create multiple pwqs with differing cpumasks. Make a copy of 4297 * @new_attrs which will be modified and used to obtain pools. 
4298 */ 4299 copy_workqueue_attrs(tmp_attrs, new_attrs); 4300 4301 for_each_possible_cpu(cpu) { 4302 if (new_attrs->ordered) { 4303 ctx->dfl_pwq->refcnt++; 4304 ctx->pwq_tbl[cpu] = ctx->dfl_pwq; 4305 } else { 4306 wq_calc_pod_cpumask(new_attrs, cpu, -1, tmp_attrs->cpumask); 4307 ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs); 4308 if (!ctx->pwq_tbl[cpu]) 4309 goto out_free; 4310 } 4311 } 4312 4313 /* save the user configured attrs and sanitize it. */ 4314 copy_workqueue_attrs(new_attrs, attrs); 4315 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); 4316 ctx->attrs = new_attrs; 4317 4318 ctx->wq = wq; 4319 free_workqueue_attrs(tmp_attrs); 4320 return ctx; 4321 4322 out_free: 4323 free_workqueue_attrs(tmp_attrs); 4324 free_workqueue_attrs(new_attrs); 4325 apply_wqattrs_cleanup(ctx); 4326 return ERR_PTR(-ENOMEM); 4327 } 4328 4329 /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ 4330 static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) 4331 { 4332 int cpu; 4333 4334 /* all pwqs have been created successfully, let's install'em */ 4335 mutex_lock(&ctx->wq->mutex); 4336 4337 copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); 4338 4339 /* save the previous pwq and install the new one */ 4340 for_each_possible_cpu(cpu) 4341 ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, 4342 ctx->pwq_tbl[cpu]); 4343 4344 /* @dfl_pwq might not have been used, ensure it's linked */ 4345 link_pwq(ctx->dfl_pwq); 4346 swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); 4347 4348 mutex_unlock(&ctx->wq->mutex); 4349 } 4350 4351 static void apply_wqattrs_lock(void) 4352 { 4353 /* CPUs should stay stable across pwq creations and installations */ 4354 cpus_read_lock(); 4355 mutex_lock(&wq_pool_mutex); 4356 } 4357 4358 static void apply_wqattrs_unlock(void) 4359 { 4360 mutex_unlock(&wq_pool_mutex); 4361 cpus_read_unlock(); 4362 } 4363 4364 static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, 4365 const struct workqueue_attrs *attrs) 4366 { 4367 struct apply_wqattrs_ctx *ctx; 4368 4369 /* only unbound workqueues can change attributes */ 4370 if (WARN_ON(!(wq->flags & WQ_UNBOUND))) 4371 return -EINVAL; 4372 4373 /* creating multiple pwqs breaks ordering guarantee */ 4374 if (!list_empty(&wq->pwqs)) { 4375 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) 4376 return -EINVAL; 4377 4378 wq->flags &= ~__WQ_ORDERED; 4379 } 4380 4381 ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); 4382 if (IS_ERR(ctx)) 4383 return PTR_ERR(ctx); 4384 4385 /* the ctx has been prepared successfully, let's commit it */ 4386 apply_wqattrs_commit(ctx); 4387 apply_wqattrs_cleanup(ctx); 4388 4389 return 0; 4390 } 4391 4392 /** 4393 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue 4394 * @wq: the target workqueue 4395 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() 4396 * 4397 * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps 4398 * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that 4399 * work items are affine to the pod it was issued on. Older pwqs are released as 4400 * in-flight work items finish. Note that a work item which repeatedly requeues 4401 * itself back-to-back will stay on its current pwq. 4402 * 4403 * Performs GFP_KERNEL allocations. 4404 * 4405 * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). 4406 * 4407 * Return: 0 on success and -errno on failure. 
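 *
 * A minimal usage sketch (illustrative; foo_wq is a hypothetical workqueue
 * created with WQ_UNBOUND, the nice value and cpumask are arbitrary and
 * error handling is trimmed):
 *
 *	struct workqueue_attrs *attrs = alloc_workqueue_attrs();
 *
 *	attrs->nice = -10;
 *	cpumask_copy(attrs->cpumask, cpumask_of(2));
 *	cpus_read_lock();
 *	ret = apply_workqueue_attrs(foo_wq, attrs);
 *	cpus_read_unlock();
 *	free_workqueue_attrs(attrs);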
4408 */ 4409 int apply_workqueue_attrs(struct workqueue_struct *wq, 4410 const struct workqueue_attrs *attrs) 4411 { 4412 int ret; 4413 4414 lockdep_assert_cpus_held(); 4415 4416 mutex_lock(&wq_pool_mutex); 4417 ret = apply_workqueue_attrs_locked(wq, attrs); 4418 mutex_unlock(&wq_pool_mutex); 4419 4420 return ret; 4421 } 4422 4423 /** 4424 * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug 4425 * @wq: the target workqueue 4426 * @cpu: the CPU to update pool association for 4427 * @hotplug_cpu: the CPU coming up or going down 4428 * @online: whether @cpu is coming up or going down 4429 * 4430 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and 4431 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of 4432 * @wq accordingly. 4433 * 4434 * 4435 * If pod affinity can't be adjusted due to memory allocation failure, it falls 4436 * back to @wq->dfl_pwq which may not be optimal but is always correct. 4437 * 4438 * Note that when the last allowed CPU of a pod goes offline for a workqueue 4439 * with a cpumask spanning multiple pods, the workers which were already 4440 * executing the work items for the workqueue will lose their CPU affinity and 4441 * may execute on any CPU. This is similar to how per-cpu workqueues behave on 4442 * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's 4443 * responsibility to flush the work item from CPU_DOWN_PREPARE. 4444 */ 4445 static void wq_update_pod(struct workqueue_struct *wq, int cpu, 4446 int hotplug_cpu, bool online) 4447 { 4448 int off_cpu = online ? -1 : hotplug_cpu; 4449 struct pool_workqueue *old_pwq = NULL, *pwq; 4450 struct workqueue_attrs *target_attrs; 4451 cpumask_t *cpumask; 4452 4453 lockdep_assert_held(&wq_pool_mutex); 4454 4455 if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) 4456 return; 4457 4458 /* 4459 * We don't wanna alloc/free wq_attrs for each wq for each CPU. 4460 * Let's use a preallocated one. The following buf is protected by 4461 * CPU hotplug exclusion. 4462 */ 4463 target_attrs = wq_update_pod_attrs_buf; 4464 cpumask = wq_update_pod_cpumask_buf; 4465 4466 copy_workqueue_attrs(target_attrs, wq->unbound_attrs); 4467 wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); 4468 4469 /* nothing to do if the target cpumask matches the current pwq */ 4470 wq_calc_pod_cpumask(target_attrs, cpu, off_cpu, cpumask); 4471 pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), 4472 lockdep_is_held(&wq_pool_mutex)); 4473 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) 4474 return; 4475 4476 /* create a new pwq */ 4477 cpumask_copy(target_attrs->cpumask, cpumask); 4478 pwq = alloc_unbound_pwq(wq, target_attrs); 4479 if (!pwq) { 4480 pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", 4481 wq->name); 4482 goto use_dfl_pwq; 4483 } 4484 4485 /* Install the new pwq. 
*/ 4486 mutex_lock(&wq->mutex); 4487 old_pwq = install_unbound_pwq(wq, cpu, pwq); 4488 goto out_unlock; 4489 4490 use_dfl_pwq: 4491 mutex_lock(&wq->mutex); 4492 raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); 4493 get_pwq(wq->dfl_pwq); 4494 raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); 4495 old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); 4496 out_unlock: 4497 mutex_unlock(&wq->mutex); 4498 put_pwq_unlocked(old_pwq); 4499 } 4500 4501 static int alloc_and_link_pwqs(struct workqueue_struct *wq) 4502 { 4503 bool highpri = wq->flags & WQ_HIGHPRI; 4504 int cpu, ret; 4505 4506 wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); 4507 if (!wq->cpu_pwq) 4508 goto enomem; 4509 4510 if (!(wq->flags & WQ_UNBOUND)) { 4511 for_each_possible_cpu(cpu) { 4512 struct pool_workqueue **pwq_p = 4513 per_cpu_ptr(wq->cpu_pwq, cpu); 4514 struct worker_pool *pool = 4515 &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); 4516 4517 *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, 4518 pool->node); 4519 if (!*pwq_p) 4520 goto enomem; 4521 4522 init_pwq(*pwq_p, wq, pool); 4523 4524 mutex_lock(&wq->mutex); 4525 link_pwq(*pwq_p); 4526 mutex_unlock(&wq->mutex); 4527 } 4528 return 0; 4529 } 4530 4531 cpus_read_lock(); 4532 if (wq->flags & __WQ_ORDERED) { 4533 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); 4534 /* there should only be single pwq for ordering guarantee */ 4535 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || 4536 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), 4537 "ordering guarantee broken for workqueue %s\n", wq->name); 4538 } else { 4539 ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); 4540 } 4541 cpus_read_unlock(); 4542 4543 return ret; 4544 4545 enomem: 4546 if (wq->cpu_pwq) { 4547 for_each_possible_cpu(cpu) 4548 kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); 4549 free_percpu(wq->cpu_pwq); 4550 wq->cpu_pwq = NULL; 4551 } 4552 return -ENOMEM; 4553 } 4554 4555 static int wq_clamp_max_active(int max_active, unsigned int flags, 4556 const char *name) 4557 { 4558 if (max_active < 1 || max_active > WQ_MAX_ACTIVE) 4559 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", 4560 max_active, name, 1, WQ_MAX_ACTIVE); 4561 4562 return clamp_val(max_active, 1, WQ_MAX_ACTIVE); 4563 } 4564 4565 /* 4566 * Workqueues which may be used during memory reclaim should have a rescuer 4567 * to guarantee forward progress. 4568 */ 4569 static int init_rescuer(struct workqueue_struct *wq) 4570 { 4571 struct worker *rescuer; 4572 int ret; 4573 4574 if (!(wq->flags & WQ_MEM_RECLAIM)) 4575 return 0; 4576 4577 rescuer = alloc_worker(NUMA_NO_NODE); 4578 if (!rescuer) { 4579 pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", 4580 wq->name); 4581 return -ENOMEM; 4582 } 4583 4584 rescuer->rescue_wq = wq; 4585 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); 4586 if (IS_ERR(rescuer->task)) { 4587 ret = PTR_ERR(rescuer->task); 4588 pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", 4589 wq->name, ERR_PTR(ret)); 4590 kfree(rescuer); 4591 return ret; 4592 } 4593 4594 wq->rescuer = rescuer; 4595 kthread_bind_mask(rescuer->task, cpu_possible_mask); 4596 wake_up_process(rescuer->task); 4597 4598 return 0; 4599 } 4600 4601 __printf(1, 4) 4602 struct workqueue_struct *alloc_workqueue(const char *fmt, 4603 unsigned int flags, 4604 int max_active, ...) 
4605 { 4606 va_list args; 4607 struct workqueue_struct *wq; 4608 struct pool_workqueue *pwq; 4609 4610 /* 4611 * Unbound && max_active == 1 used to imply ordered, which is no longer 4612 * the case on many machines due to per-pod pools. While 4613 * alloc_ordered_workqueue() is the right way to create an ordered 4614 * workqueue, keep the previous behavior to avoid subtle breakages. 4615 */ 4616 if ((flags & WQ_UNBOUND) && max_active == 1) 4617 flags |= __WQ_ORDERED; 4618 4619 /* see the comment above the definition of WQ_POWER_EFFICIENT */ 4620 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) 4621 flags |= WQ_UNBOUND; 4622 4623 /* allocate wq and format name */ 4624 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 4625 if (!wq) 4626 return NULL; 4627 4628 if (flags & WQ_UNBOUND) { 4629 wq->unbound_attrs = alloc_workqueue_attrs(); 4630 if (!wq->unbound_attrs) 4631 goto err_free_wq; 4632 } 4633 4634 va_start(args, max_active); 4635 vsnprintf(wq->name, sizeof(wq->name), fmt, args); 4636 va_end(args); 4637 4638 max_active = max_active ?: WQ_DFL_ACTIVE; 4639 max_active = wq_clamp_max_active(max_active, flags, wq->name); 4640 4641 /* init wq */ 4642 wq->flags = flags; 4643 wq->saved_max_active = max_active; 4644 mutex_init(&wq->mutex); 4645 atomic_set(&wq->nr_pwqs_to_flush, 0); 4646 INIT_LIST_HEAD(&wq->pwqs); 4647 INIT_LIST_HEAD(&wq->flusher_queue); 4648 INIT_LIST_HEAD(&wq->flusher_overflow); 4649 INIT_LIST_HEAD(&wq->maydays); 4650 4651 wq_init_lockdep(wq); 4652 INIT_LIST_HEAD(&wq->list); 4653 4654 if (alloc_and_link_pwqs(wq) < 0) 4655 goto err_unreg_lockdep; 4656 4657 if (wq_online && init_rescuer(wq) < 0) 4658 goto err_destroy; 4659 4660 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) 4661 goto err_destroy; 4662 4663 /* 4664 * wq_pool_mutex protects global freeze state and workqueues list. 4665 * Grab it, adjust max_active and add the new @wq to workqueues 4666 * list. 4667 */ 4668 mutex_lock(&wq_pool_mutex); 4669 4670 mutex_lock(&wq->mutex); 4671 for_each_pwq(pwq, wq) 4672 pwq_adjust_max_active(pwq); 4673 mutex_unlock(&wq->mutex); 4674 4675 list_add_tail_rcu(&wq->list, &workqueues); 4676 4677 mutex_unlock(&wq_pool_mutex); 4678 4679 return wq; 4680 4681 err_unreg_lockdep: 4682 wq_unregister_lockdep(wq); 4683 wq_free_lockdep(wq); 4684 err_free_wq: 4685 free_workqueue_attrs(wq->unbound_attrs); 4686 kfree(wq); 4687 return NULL; 4688 err_destroy: 4689 destroy_workqueue(wq); 4690 return NULL; 4691 } 4692 EXPORT_SYMBOL_GPL(alloc_workqueue); 4693 4694 static bool pwq_busy(struct pool_workqueue *pwq) 4695 { 4696 int i; 4697 4698 for (i = 0; i < WORK_NR_COLORS; i++) 4699 if (pwq->nr_in_flight[i]) 4700 return true; 4701 4702 if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) 4703 return true; 4704 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) 4705 return true; 4706 4707 return false; 4708 } 4709 4710 /** 4711 * destroy_workqueue - safely terminate a workqueue 4712 * @wq: target workqueue 4713 * 4714 * Safely destroy a workqueue. All work currently pending will be done first. 4715 */ 4716 void destroy_workqueue(struct workqueue_struct *wq) 4717 { 4718 struct pool_workqueue *pwq; 4719 int cpu; 4720 4721 /* 4722 * Remove it from sysfs first so that sanity check failure doesn't 4723 * lead to sysfs name conflicts. 
4724 */ 4725 workqueue_sysfs_unregister(wq); 4726 4727 /* mark the workqueue destruction is in progress */ 4728 mutex_lock(&wq->mutex); 4729 wq->flags |= __WQ_DESTROYING; 4730 mutex_unlock(&wq->mutex); 4731 4732 /* drain it before proceeding with destruction */ 4733 drain_workqueue(wq); 4734 4735 /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ 4736 if (wq->rescuer) { 4737 struct worker *rescuer = wq->rescuer; 4738 4739 /* this prevents new queueing */ 4740 raw_spin_lock_irq(&wq_mayday_lock); 4741 wq->rescuer = NULL; 4742 raw_spin_unlock_irq(&wq_mayday_lock); 4743 4744 /* rescuer will empty maydays list before exiting */ 4745 kthread_stop(rescuer->task); 4746 kfree(rescuer); 4747 } 4748 4749 /* 4750 * Sanity checks - grab all the locks so that we wait for all 4751 * in-flight operations which may do put_pwq(). 4752 */ 4753 mutex_lock(&wq_pool_mutex); 4754 mutex_lock(&wq->mutex); 4755 for_each_pwq(pwq, wq) { 4756 raw_spin_lock_irq(&pwq->pool->lock); 4757 if (WARN_ON(pwq_busy(pwq))) { 4758 pr_warn("%s: %s has the following busy pwq\n", 4759 __func__, wq->name); 4760 show_pwq(pwq); 4761 raw_spin_unlock_irq(&pwq->pool->lock); 4762 mutex_unlock(&wq->mutex); 4763 mutex_unlock(&wq_pool_mutex); 4764 show_one_workqueue(wq); 4765 return; 4766 } 4767 raw_spin_unlock_irq(&pwq->pool->lock); 4768 } 4769 mutex_unlock(&wq->mutex); 4770 4771 /* 4772 * wq list is used to freeze wq, remove from list after 4773 * flushing is complete in case freeze races us. 4774 */ 4775 list_del_rcu(&wq->list); 4776 mutex_unlock(&wq_pool_mutex); 4777 4778 /* 4779 * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq 4780 * to put the base refs. @wq will be auto-destroyed from the last 4781 * pwq_put. RCU read lock prevents @wq from going away from under us. 4782 */ 4783 rcu_read_lock(); 4784 4785 for_each_possible_cpu(cpu) { 4786 pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); 4787 RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); 4788 put_pwq_unlocked(pwq); 4789 } 4790 4791 put_pwq_unlocked(wq->dfl_pwq); 4792 wq->dfl_pwq = NULL; 4793 4794 rcu_read_unlock(); 4795 } 4796 EXPORT_SYMBOL_GPL(destroy_workqueue); 4797 4798 /** 4799 * workqueue_set_max_active - adjust max_active of a workqueue 4800 * @wq: target workqueue 4801 * @max_active: new max_active value. 4802 * 4803 * Set max_active of @wq to @max_active. 4804 * 4805 * CONTEXT: 4806 * Don't call from IRQ context. 4807 */ 4808 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) 4809 { 4810 struct pool_workqueue *pwq; 4811 4812 /* disallow meddling with max_active for ordered workqueues */ 4813 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) 4814 return; 4815 4816 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); 4817 4818 mutex_lock(&wq->mutex); 4819 4820 wq->flags &= ~__WQ_ORDERED; 4821 wq->saved_max_active = max_active; 4822 4823 for_each_pwq(pwq, wq) 4824 pwq_adjust_max_active(pwq); 4825 4826 mutex_unlock(&wq->mutex); 4827 } 4828 EXPORT_SYMBOL_GPL(workqueue_set_max_active); 4829 4830 /** 4831 * current_work - retrieve %current task's work struct 4832 * 4833 * Determine if %current task is a workqueue worker and what it's working on. 4834 * Useful to find out the context that the %current task is running in. 4835 * 4836 * Return: work struct if %current task is a workqueue worker, %NULL otherwise. 4837 */ 4838 struct work_struct *current_work(void) 4839 { 4840 struct worker *worker = current_wq_worker(); 4841 4842 return worker ? 
worker->current_work : NULL; 4843 } 4844 EXPORT_SYMBOL(current_work); 4845 4846 /** 4847 * current_is_workqueue_rescuer - is %current workqueue rescuer? 4848 * 4849 * Determine whether %current is a workqueue rescuer. Can be used from 4850 * work functions to determine whether it's being run off the rescuer task. 4851 * 4852 * Return: %true if %current is a workqueue rescuer. %false otherwise. 4853 */ 4854 bool current_is_workqueue_rescuer(void) 4855 { 4856 struct worker *worker = current_wq_worker(); 4857 4858 return worker && worker->rescue_wq; 4859 } 4860 4861 /** 4862 * workqueue_congested - test whether a workqueue is congested 4863 * @cpu: CPU in question 4864 * @wq: target workqueue 4865 * 4866 * Test whether @wq's cpu workqueue for @cpu is congested. There is 4867 * no synchronization around this function and the test result is 4868 * unreliable and only useful as advisory hints or for debugging. 4869 * 4870 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. 4871 * 4872 * With the exception of ordered workqueues, all workqueues have per-cpu 4873 * pool_workqueues, each with its own congested state. A workqueue being 4874 * congested on one CPU doesn't mean that the workqueue is contested on any 4875 * other CPUs. 4876 * 4877 * Return: 4878 * %true if congested, %false otherwise. 4879 */ 4880 bool workqueue_congested(int cpu, struct workqueue_struct *wq) 4881 { 4882 struct pool_workqueue *pwq; 4883 bool ret; 4884 4885 rcu_read_lock(); 4886 preempt_disable(); 4887 4888 if (cpu == WORK_CPU_UNBOUND) 4889 cpu = smp_processor_id(); 4890 4891 pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); 4892 ret = !list_empty(&pwq->inactive_works); 4893 4894 preempt_enable(); 4895 rcu_read_unlock(); 4896 4897 return ret; 4898 } 4899 EXPORT_SYMBOL_GPL(workqueue_congested); 4900 4901 /** 4902 * work_busy - test whether a work is currently pending or running 4903 * @work: the work to be tested 4904 * 4905 * Test whether @work is currently pending or running. There is no 4906 * synchronization around this function and the test result is 4907 * unreliable and only useful as advisory hints or for debugging. 4908 * 4909 * Return: 4910 * OR'd bitmask of WORK_BUSY_* bits. 4911 */ 4912 unsigned int work_busy(struct work_struct *work) 4913 { 4914 struct worker_pool *pool; 4915 unsigned long flags; 4916 unsigned int ret = 0; 4917 4918 if (work_pending(work)) 4919 ret |= WORK_BUSY_PENDING; 4920 4921 rcu_read_lock(); 4922 pool = get_work_pool(work); 4923 if (pool) { 4924 raw_spin_lock_irqsave(&pool->lock, flags); 4925 if (find_worker_executing_work(pool, work)) 4926 ret |= WORK_BUSY_RUNNING; 4927 raw_spin_unlock_irqrestore(&pool->lock, flags); 4928 } 4929 rcu_read_unlock(); 4930 4931 return ret; 4932 } 4933 EXPORT_SYMBOL_GPL(work_busy); 4934 4935 /** 4936 * set_worker_desc - set description for the current work item 4937 * @fmt: printf-style format string 4938 * @...: arguments for the format string 4939 * 4940 * This function can be called by a running work function to describe what 4941 * the work item is about. If the worker task gets dumped, this 4942 * information will be printed out together to help debugging. The 4943 * description can be at most WORKER_DESC_LEN including the trailing '\0'. 4944 */ 4945 void set_worker_desc(const char *fmt, ...) 
4946 { 4947 struct worker *worker = current_wq_worker(); 4948 va_list args; 4949 4950 if (worker) { 4951 va_start(args, fmt); 4952 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); 4953 va_end(args); 4954 } 4955 } 4956 EXPORT_SYMBOL_GPL(set_worker_desc); 4957 4958 /** 4959 * print_worker_info - print out worker information and description 4960 * @log_lvl: the log level to use when printing 4961 * @task: target task 4962 * 4963 * If @task is a worker and currently executing a work item, print out the 4964 * name of the workqueue being serviced and worker description set with 4965 * set_worker_desc() by the currently executing work item. 4966 * 4967 * This function can be safely called on any task as long as the 4968 * task_struct itself is accessible. While safe, this function isn't 4969 * synchronized and may print out mixups or garbages of limited length. 4970 */ 4971 void print_worker_info(const char *log_lvl, struct task_struct *task) 4972 { 4973 work_func_t *fn = NULL; 4974 char name[WQ_NAME_LEN] = { }; 4975 char desc[WORKER_DESC_LEN] = { }; 4976 struct pool_workqueue *pwq = NULL; 4977 struct workqueue_struct *wq = NULL; 4978 struct worker *worker; 4979 4980 if (!(task->flags & PF_WQ_WORKER)) 4981 return; 4982 4983 /* 4984 * This function is called without any synchronization and @task 4985 * could be in any state. Be careful with dereferences. 4986 */ 4987 worker = kthread_probe_data(task); 4988 4989 /* 4990 * Carefully copy the associated workqueue's workfn, name and desc. 4991 * Keep the original last '\0' in case the original is garbage. 4992 */ 4993 copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn)); 4994 copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq)); 4995 copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq)); 4996 copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1); 4997 copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1); 4998 4999 if (fn || name[0] || desc[0]) { 5000 printk("%sWorkqueue: %s %ps", log_lvl, name, fn); 5001 if (strcmp(name, desc)) 5002 pr_cont(" (%s)", desc); 5003 pr_cont("\n"); 5004 } 5005 } 5006 5007 static void pr_cont_pool_info(struct worker_pool *pool) 5008 { 5009 pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); 5010 if (pool->node != NUMA_NO_NODE) 5011 pr_cont(" node=%d", pool->node); 5012 pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); 5013 } 5014 5015 struct pr_cont_work_struct { 5016 bool comma; 5017 work_func_t func; 5018 long ctr; 5019 }; 5020 5021 static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) 5022 { 5023 if (!pcwsp->ctr) 5024 goto out_record; 5025 if (func == pcwsp->func) { 5026 pcwsp->ctr++; 5027 return; 5028 } 5029 if (pcwsp->ctr == 1) 5030 pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); 5031 else 5032 pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); 5033 pcwsp->ctr = 0; 5034 out_record: 5035 if ((long)func == -1L) 5036 return; 5037 pcwsp->comma = comma; 5038 pcwsp->func = func; 5039 pcwsp->ctr = 1; 5040 } 5041 5042 static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) 5043 { 5044 if (work->func == wq_barrier_func) { 5045 struct wq_barrier *barr; 5046 5047 barr = container_of(work, struct wq_barrier, work); 5048 5049 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); 5050 pr_cont("%s BAR(%d)", comma ? 
"," : "", 5051 task_pid_nr(barr->task)); 5052 } else { 5053 if (!comma) 5054 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); 5055 pr_cont_work_flush(comma, work->func, pcwsp); 5056 } 5057 } 5058 5059 static void show_pwq(struct pool_workqueue *pwq) 5060 { 5061 struct pr_cont_work_struct pcws = { .ctr = 0, }; 5062 struct worker_pool *pool = pwq->pool; 5063 struct work_struct *work; 5064 struct worker *worker; 5065 bool has_in_flight = false, has_pending = false; 5066 int bkt; 5067 5068 pr_info(" pwq %d:", pool->id); 5069 pr_cont_pool_info(pool); 5070 5071 pr_cont(" active=%d/%d refcnt=%d%s\n", 5072 pwq->nr_active, pwq->max_active, pwq->refcnt, 5073 !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); 5074 5075 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 5076 if (worker->current_pwq == pwq) { 5077 has_in_flight = true; 5078 break; 5079 } 5080 } 5081 if (has_in_flight) { 5082 bool comma = false; 5083 5084 pr_info(" in-flight:"); 5085 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 5086 if (worker->current_pwq != pwq) 5087 continue; 5088 5089 pr_cont("%s %d%s:%ps", comma ? "," : "", 5090 task_pid_nr(worker->task), 5091 worker->rescue_wq ? "(RESCUER)" : "", 5092 worker->current_func); 5093 list_for_each_entry(work, &worker->scheduled, entry) 5094 pr_cont_work(false, work, &pcws); 5095 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 5096 comma = true; 5097 } 5098 pr_cont("\n"); 5099 } 5100 5101 list_for_each_entry(work, &pool->worklist, entry) { 5102 if (get_work_pwq(work) == pwq) { 5103 has_pending = true; 5104 break; 5105 } 5106 } 5107 if (has_pending) { 5108 bool comma = false; 5109 5110 pr_info(" pending:"); 5111 list_for_each_entry(work, &pool->worklist, entry) { 5112 if (get_work_pwq(work) != pwq) 5113 continue; 5114 5115 pr_cont_work(comma, work, &pcws); 5116 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); 5117 } 5118 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 5119 pr_cont("\n"); 5120 } 5121 5122 if (!list_empty(&pwq->inactive_works)) { 5123 bool comma = false; 5124 5125 pr_info(" inactive:"); 5126 list_for_each_entry(work, &pwq->inactive_works, entry) { 5127 pr_cont_work(comma, work, &pcws); 5128 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); 5129 } 5130 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); 5131 pr_cont("\n"); 5132 } 5133 } 5134 5135 /** 5136 * show_one_workqueue - dump state of specified workqueue 5137 * @wq: workqueue whose state will be printed 5138 */ 5139 void show_one_workqueue(struct workqueue_struct *wq) 5140 { 5141 struct pool_workqueue *pwq; 5142 bool idle = true; 5143 unsigned long flags; 5144 5145 for_each_pwq(pwq, wq) { 5146 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { 5147 idle = false; 5148 break; 5149 } 5150 } 5151 if (idle) /* Nothing to print for idle workqueue */ 5152 return; 5153 5154 pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); 5155 5156 for_each_pwq(pwq, wq) { 5157 raw_spin_lock_irqsave(&pwq->pool->lock, flags); 5158 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { 5159 /* 5160 * Defer printing to avoid deadlocks in console 5161 * drivers that queue work while holding locks 5162 * also taken in their write paths. 5163 */ 5164 printk_deferred_enter(); 5165 show_pwq(pwq); 5166 printk_deferred_exit(); 5167 } 5168 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); 5169 /* 5170 * We could be printing a lot from atomic context, e.g. 5171 * sysrq-t -> show_all_workqueues(). Avoid triggering 5172 * hard lockup. 
5173 */ 5174 touch_nmi_watchdog(); 5175 } 5176 5177 } 5178 5179 /** 5180 * show_one_worker_pool - dump state of specified worker pool 5181 * @pool: worker pool whose state will be printed 5182 */ 5183 static void show_one_worker_pool(struct worker_pool *pool) 5184 { 5185 struct worker *worker; 5186 bool first = true; 5187 unsigned long flags; 5188 unsigned long hung = 0; 5189 5190 raw_spin_lock_irqsave(&pool->lock, flags); 5191 if (pool->nr_workers == pool->nr_idle) 5192 goto next_pool; 5193 5194 /* How long the first pending work is waiting for a worker. */ 5195 if (!list_empty(&pool->worklist)) 5196 hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; 5197 5198 /* 5199 * Defer printing to avoid deadlocks in console drivers that 5200 * queue work while holding locks also taken in their write 5201 * paths. 5202 */ 5203 printk_deferred_enter(); 5204 pr_info("pool %d:", pool->id); 5205 pr_cont_pool_info(pool); 5206 pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); 5207 if (pool->manager) 5208 pr_cont(" manager: %d", 5209 task_pid_nr(pool->manager->task)); 5210 list_for_each_entry(worker, &pool->idle_list, entry) { 5211 pr_cont(" %s%d", first ? "idle: " : "", 5212 task_pid_nr(worker->task)); 5213 first = false; 5214 } 5215 pr_cont("\n"); 5216 printk_deferred_exit(); 5217 next_pool: 5218 raw_spin_unlock_irqrestore(&pool->lock, flags); 5219 /* 5220 * We could be printing a lot from atomic context, e.g. 5221 * sysrq-t -> show_all_workqueues(). Avoid triggering 5222 * hard lockup. 5223 */ 5224 touch_nmi_watchdog(); 5225 5226 } 5227 5228 /** 5229 * show_all_workqueues - dump workqueue state 5230 * 5231 * Called from a sysrq handler and prints out all busy workqueues and pools. 5232 */ 5233 void show_all_workqueues(void) 5234 { 5235 struct workqueue_struct *wq; 5236 struct worker_pool *pool; 5237 int pi; 5238 5239 rcu_read_lock(); 5240 5241 pr_info("Showing busy workqueues and worker pools:\n"); 5242 5243 list_for_each_entry_rcu(wq, &workqueues, list) 5244 show_one_workqueue(wq); 5245 5246 for_each_pool(pool, pi) 5247 show_one_worker_pool(pool); 5248 5249 rcu_read_unlock(); 5250 } 5251 5252 /** 5253 * show_freezable_workqueues - dump freezable workqueue state 5254 * 5255 * Called from try_to_freeze_tasks() and prints out all freezable workqueues 5256 * still busy. 5257 */ 5258 void show_freezable_workqueues(void) 5259 { 5260 struct workqueue_struct *wq; 5261 5262 rcu_read_lock(); 5263 5264 pr_info("Showing freezable workqueues that are still busy:\n"); 5265 5266 list_for_each_entry_rcu(wq, &workqueues, list) { 5267 if (!(wq->flags & WQ_FREEZABLE)) 5268 continue; 5269 show_one_workqueue(wq); 5270 } 5271 5272 rcu_read_unlock(); 5273 } 5274 5275 /* used to show worker information through /proc/PID/{comm,stat,status} */ 5276 void wq_worker_comm(char *buf, size_t size, struct task_struct *task) 5277 { 5278 int off; 5279 5280 /* always show the actual comm */ 5281 off = strscpy(buf, task->comm, size); 5282 if (off < 0) 5283 return; 5284 5285 /* stabilize PF_WQ_WORKER and worker pool association */ 5286 mutex_lock(&wq_pool_attach_mutex); 5287 5288 if (task->flags & PF_WQ_WORKER) { 5289 struct worker *worker = kthread_data(task); 5290 struct worker_pool *pool = worker->pool; 5291 5292 if (pool) { 5293 raw_spin_lock_irq(&pool->lock); 5294 /* 5295 * ->desc tracks information (wq name or 5296 * set_worker_desc()) for the latest execution. If 5297 * current, prepend '+', otherwise '-'. 
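 *
 * For example (illustrative, assuming a worker bound to CPU 2 whose latest
 * work item came from the system "events" workqueue), this yields
 * "kworker/2:1+events" in /proc/PID/comm while that item is running and
 * "kworker/2:1-events" once the worker has gone idle again.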
5298 */ 5299 if (worker->desc[0] != '\0') { 5300 if (worker->current_work) 5301 scnprintf(buf + off, size - off, "+%s", 5302 worker->desc); 5303 else 5304 scnprintf(buf + off, size - off, "-%s", 5305 worker->desc); 5306 } 5307 raw_spin_unlock_irq(&pool->lock); 5308 } 5309 } 5310 5311 mutex_unlock(&wq_pool_attach_mutex); 5312 } 5313 5314 #ifdef CONFIG_SMP 5315 5316 /* 5317 * CPU hotplug. 5318 * 5319 * There are two challenges in supporting CPU hotplug. Firstly, there 5320 * are a lot of assumptions on strong associations among work, pwq and 5321 * pool which make migrating pending and scheduled works very 5322 * difficult to implement without impacting hot paths. Secondly, 5323 * worker pools serve mix of short, long and very long running works making 5324 * blocked draining impractical. 5325 * 5326 * This is solved by allowing the pools to be disassociated from the CPU 5327 * running as an unbound one and allowing it to be reattached later if the 5328 * cpu comes back online. 5329 */ 5330 5331 static void unbind_workers(int cpu) 5332 { 5333 struct worker_pool *pool; 5334 struct worker *worker; 5335 5336 for_each_cpu_worker_pool(pool, cpu) { 5337 mutex_lock(&wq_pool_attach_mutex); 5338 raw_spin_lock_irq(&pool->lock); 5339 5340 /* 5341 * We've blocked all attach/detach operations. Make all workers 5342 * unbound and set DISASSOCIATED. Before this, all workers 5343 * must be on the cpu. After this, they may become diasporas. 5344 * And the preemption disabled section in their sched callbacks 5345 * are guaranteed to see WORKER_UNBOUND since the code here 5346 * is on the same cpu. 5347 */ 5348 for_each_pool_worker(worker, pool) 5349 worker->flags |= WORKER_UNBOUND; 5350 5351 pool->flags |= POOL_DISASSOCIATED; 5352 5353 /* 5354 * The handling of nr_running in sched callbacks are disabled 5355 * now. Zap nr_running. After this, nr_running stays zero and 5356 * need_more_worker() and keep_working() are always true as 5357 * long as the worklist is not empty. This pool now behaves as 5358 * an unbound (in terms of concurrency management) pool which 5359 * are served by workers tied to the pool. 5360 */ 5361 pool->nr_running = 0; 5362 5363 /* 5364 * With concurrency management just turned off, a busy 5365 * worker blocking could lead to lengthy stalls. Kick off 5366 * unbound chain execution of currently pending work items. 5367 */ 5368 wake_up_worker(pool); 5369 5370 raw_spin_unlock_irq(&pool->lock); 5371 5372 for_each_pool_worker(worker, pool) 5373 unbind_worker(worker); 5374 5375 mutex_unlock(&wq_pool_attach_mutex); 5376 } 5377 } 5378 5379 /** 5380 * rebind_workers - rebind all workers of a pool to the associated CPU 5381 * @pool: pool of interest 5382 * 5383 * @pool->cpu is coming online. Rebind all workers to the CPU. 5384 */ 5385 static void rebind_workers(struct worker_pool *pool) 5386 { 5387 struct worker *worker; 5388 5389 lockdep_assert_held(&wq_pool_attach_mutex); 5390 5391 /* 5392 * Restore CPU affinity of all workers. As all idle workers should 5393 * be on the run-queue of the associated CPU before any local 5394 * wake-ups for concurrency management happen, restore CPU affinity 5395 * of all workers first and then clear UNBOUND. As we're called 5396 * from CPU_ONLINE, the following shouldn't fail. 
5397 */ 5398 for_each_pool_worker(worker, pool) { 5399 kthread_set_per_cpu(worker->task, pool->cpu); 5400 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 5401 pool->attrs->cpumask) < 0); 5402 } 5403 5404 raw_spin_lock_irq(&pool->lock); 5405 5406 pool->flags &= ~POOL_DISASSOCIATED; 5407 5408 for_each_pool_worker(worker, pool) { 5409 unsigned int worker_flags = worker->flags; 5410 5411 /* 5412 * We want to clear UNBOUND but can't directly call 5413 * worker_clr_flags() or adjust nr_running. Atomically 5414 * replace UNBOUND with another NOT_RUNNING flag REBOUND. 5415 * @worker will clear REBOUND using worker_clr_flags() when 5416 * it initiates the next execution cycle thus restoring 5417 * concurrency management. Note that when or whether 5418 * @worker clears REBOUND doesn't affect correctness. 5419 * 5420 * WRITE_ONCE() is necessary because @worker->flags may be 5421 * tested without holding any lock in 5422 * wq_worker_running(). Without it, NOT_RUNNING test may 5423 * fail incorrectly leading to premature concurrency 5424 * management operations. 5425 */ 5426 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); 5427 worker_flags |= WORKER_REBOUND; 5428 worker_flags &= ~WORKER_UNBOUND; 5429 WRITE_ONCE(worker->flags, worker_flags); 5430 } 5431 5432 raw_spin_unlock_irq(&pool->lock); 5433 } 5434 5435 /** 5436 * restore_unbound_workers_cpumask - restore cpumask of unbound workers 5437 * @pool: unbound pool of interest 5438 * @cpu: the CPU which is coming up 5439 * 5440 * An unbound pool may end up with a cpumask which doesn't have any online 5441 * CPUs. When a worker of such pool get scheduled, the scheduler resets 5442 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any 5443 * online CPU before, cpus_allowed of all its workers should be restored. 5444 */ 5445 static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) 5446 { 5447 static cpumask_t cpumask; 5448 struct worker *worker; 5449 5450 lockdep_assert_held(&wq_pool_attach_mutex); 5451 5452 /* is @cpu allowed for @pool? 
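 * (e.g. an unbound pool whose attrs->cpumask is 0-3 is left untouched
 * when, say, CPU 8 comes online - CPU numbers are illustrative)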
*/ 5453 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 5454 return; 5455 5456 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); 5457 5458 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 5459 for_each_pool_worker(worker, pool) 5460 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); 5461 } 5462 5463 int workqueue_prepare_cpu(unsigned int cpu) 5464 { 5465 struct worker_pool *pool; 5466 5467 for_each_cpu_worker_pool(pool, cpu) { 5468 if (pool->nr_workers) 5469 continue; 5470 if (!create_worker(pool)) 5471 return -ENOMEM; 5472 } 5473 return 0; 5474 } 5475 5476 int workqueue_online_cpu(unsigned int cpu) 5477 { 5478 struct worker_pool *pool; 5479 struct workqueue_struct *wq; 5480 int pi; 5481 5482 mutex_lock(&wq_pool_mutex); 5483 5484 for_each_pool(pool, pi) { 5485 mutex_lock(&wq_pool_attach_mutex); 5486 5487 if (pool->cpu == cpu) 5488 rebind_workers(pool); 5489 else if (pool->cpu < 0) 5490 restore_unbound_workers_cpumask(pool, cpu); 5491 5492 mutex_unlock(&wq_pool_attach_mutex); 5493 } 5494 5495 /* update pod affinity of unbound workqueues */ 5496 list_for_each_entry(wq, &workqueues, list) { 5497 struct workqueue_attrs *attrs = wq->unbound_attrs; 5498 5499 if (attrs) { 5500 const struct wq_pod_type *pt = wqattrs_pod_type(attrs); 5501 int tcpu; 5502 5503 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) 5504 wq_update_pod(wq, tcpu, cpu, true); 5505 } 5506 } 5507 5508 mutex_unlock(&wq_pool_mutex); 5509 return 0; 5510 } 5511 5512 int workqueue_offline_cpu(unsigned int cpu) 5513 { 5514 struct workqueue_struct *wq; 5515 5516 /* unbinding per-cpu workers should happen on the local CPU */ 5517 if (WARN_ON(cpu != smp_processor_id())) 5518 return -1; 5519 5520 unbind_workers(cpu); 5521 5522 /* update pod affinity of unbound workqueues */ 5523 mutex_lock(&wq_pool_mutex); 5524 list_for_each_entry(wq, &workqueues, list) { 5525 struct workqueue_attrs *attrs = wq->unbound_attrs; 5526 5527 if (attrs) { 5528 const struct wq_pod_type *pt = wqattrs_pod_type(attrs); 5529 int tcpu; 5530 5531 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) 5532 wq_update_pod(wq, tcpu, cpu, false); 5533 } 5534 } 5535 mutex_unlock(&wq_pool_mutex); 5536 5537 return 0; 5538 } 5539 5540 struct work_for_cpu { 5541 struct work_struct work; 5542 long (*fn)(void *); 5543 void *arg; 5544 long ret; 5545 }; 5546 5547 static void work_for_cpu_fn(struct work_struct *work) 5548 { 5549 struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); 5550 5551 wfc->ret = wfc->fn(wfc->arg); 5552 } 5553 5554 /** 5555 * work_on_cpu - run a function in thread context on a particular cpu 5556 * @cpu: the cpu to run on 5557 * @fn: the function to run 5558 * @arg: the function arg 5559 * 5560 * It is up to the caller to ensure that the cpu doesn't go offline. 5561 * The caller must not hold any locks which would prevent @fn from completing. 5562 * 5563 * Return: The value @fn returns. 5564 */ 5565 long work_on_cpu(int cpu, long (*fn)(void *), void *arg) 5566 { 5567 struct work_for_cpu wfc = { .fn = fn, .arg = arg }; 5568 5569 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 5570 schedule_work_on(cpu, &wfc.work); 5571 flush_work(&wfc.work); 5572 destroy_work_on_stack(&wfc.work); 5573 return wfc.ret; 5574 } 5575 EXPORT_SYMBOL_GPL(work_on_cpu); 5576 5577 /** 5578 * work_on_cpu_safe - run a function in thread context on a particular cpu 5579 * @cpu: the cpu to run on 5580 * @fn: the function to run 5581 * @arg: the function argument 5582 * 5583 * Disables CPU hotplug and calls work_on_cpu(). 
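 * A minimal usage sketch (the callback name and its return value are
 * illustrative):
 *
 *	static long read_local_counter(void *unused)
 *	{
 *		return raw_smp_processor_id();
 *	}
 *	...
 *	long cpu_id = work_on_cpu_safe(cpu, read_local_counter, NULL);
 *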
The caller must not hold 5584 * any locks which would prevent @fn from completing. 5585 * 5586 * Return: The value @fn returns. 5587 */ 5588 long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) 5589 { 5590 long ret = -ENODEV; 5591 5592 cpus_read_lock(); 5593 if (cpu_online(cpu)) 5594 ret = work_on_cpu(cpu, fn, arg); 5595 cpus_read_unlock(); 5596 return ret; 5597 } 5598 EXPORT_SYMBOL_GPL(work_on_cpu_safe); 5599 #endif /* CONFIG_SMP */ 5600 5601 #ifdef CONFIG_FREEZER 5602 5603 /** 5604 * freeze_workqueues_begin - begin freezing workqueues 5605 * 5606 * Start freezing workqueues. After this function returns, all freezable 5607 * workqueues will queue new works to their inactive_works list instead of 5608 * pool->worklist. 5609 * 5610 * CONTEXT: 5611 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 5612 */ 5613 void freeze_workqueues_begin(void) 5614 { 5615 struct workqueue_struct *wq; 5616 struct pool_workqueue *pwq; 5617 5618 mutex_lock(&wq_pool_mutex); 5619 5620 WARN_ON_ONCE(workqueue_freezing); 5621 workqueue_freezing = true; 5622 5623 list_for_each_entry(wq, &workqueues, list) { 5624 mutex_lock(&wq->mutex); 5625 for_each_pwq(pwq, wq) 5626 pwq_adjust_max_active(pwq); 5627 mutex_unlock(&wq->mutex); 5628 } 5629 5630 mutex_unlock(&wq_pool_mutex); 5631 } 5632 5633 /** 5634 * freeze_workqueues_busy - are freezable workqueues still busy? 5635 * 5636 * Check whether freezing is complete. This function must be called 5637 * between freeze_workqueues_begin() and thaw_workqueues(). 5638 * 5639 * CONTEXT: 5640 * Grabs and releases wq_pool_mutex. 5641 * 5642 * Return: 5643 * %true if some freezable workqueues are still busy. %false if freezing 5644 * is complete. 5645 */ 5646 bool freeze_workqueues_busy(void) 5647 { 5648 bool busy = false; 5649 struct workqueue_struct *wq; 5650 struct pool_workqueue *pwq; 5651 5652 mutex_lock(&wq_pool_mutex); 5653 5654 WARN_ON_ONCE(!workqueue_freezing); 5655 5656 list_for_each_entry(wq, &workqueues, list) { 5657 if (!(wq->flags & WQ_FREEZABLE)) 5658 continue; 5659 /* 5660 * nr_active is monotonically decreasing. It's safe 5661 * to peek without lock. 5662 */ 5663 rcu_read_lock(); 5664 for_each_pwq(pwq, wq) { 5665 WARN_ON_ONCE(pwq->nr_active < 0); 5666 if (pwq->nr_active) { 5667 busy = true; 5668 rcu_read_unlock(); 5669 goto out_unlock; 5670 } 5671 } 5672 rcu_read_unlock(); 5673 } 5674 out_unlock: 5675 mutex_unlock(&wq_pool_mutex); 5676 return busy; 5677 } 5678 5679 /** 5680 * thaw_workqueues - thaw workqueues 5681 * 5682 * Thaw workqueues. Normal queueing is restored and all collected 5683 * frozen works are transferred to their respective pool worklists. 5684 * 5685 * CONTEXT: 5686 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. 
5687 */ 5688 void thaw_workqueues(void) 5689 { 5690 struct workqueue_struct *wq; 5691 struct pool_workqueue *pwq; 5692 5693 mutex_lock(&wq_pool_mutex); 5694 5695 if (!workqueue_freezing) 5696 goto out_unlock; 5697 5698 workqueue_freezing = false; 5699 5700 /* restore max_active and repopulate worklist */ 5701 list_for_each_entry(wq, &workqueues, list) { 5702 mutex_lock(&wq->mutex); 5703 for_each_pwq(pwq, wq) 5704 pwq_adjust_max_active(pwq); 5705 mutex_unlock(&wq->mutex); 5706 } 5707 5708 out_unlock: 5709 mutex_unlock(&wq_pool_mutex); 5710 } 5711 #endif /* CONFIG_FREEZER */ 5712 5713 static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) 5714 { 5715 LIST_HEAD(ctxs); 5716 int ret = 0; 5717 struct workqueue_struct *wq; 5718 struct apply_wqattrs_ctx *ctx, *n; 5719 5720 lockdep_assert_held(&wq_pool_mutex); 5721 5722 list_for_each_entry(wq, &workqueues, list) { 5723 if (!(wq->flags & WQ_UNBOUND)) 5724 continue; 5725 /* creating multiple pwqs breaks ordering guarantee */ 5726 if (wq->flags & __WQ_ORDERED) 5727 continue; 5728 5729 ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); 5730 if (IS_ERR(ctx)) { 5731 ret = PTR_ERR(ctx); 5732 break; 5733 } 5734 5735 list_add_tail(&ctx->list, &ctxs); 5736 } 5737 5738 list_for_each_entry_safe(ctx, n, &ctxs, list) { 5739 if (!ret) 5740 apply_wqattrs_commit(ctx); 5741 apply_wqattrs_cleanup(ctx); 5742 } 5743 5744 if (!ret) { 5745 mutex_lock(&wq_pool_attach_mutex); 5746 cpumask_copy(wq_unbound_cpumask, unbound_cpumask); 5747 mutex_unlock(&wq_pool_attach_mutex); 5748 } 5749 return ret; 5750 } 5751 5752 /** 5753 * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask 5754 * @cpumask: the cpumask to set 5755 * 5756 * The low-level workqueues cpumask is a global cpumask that limits 5757 * the affinity of all unbound workqueues. This function check the @cpumask 5758 * and apply it to all unbound workqueues and updates all pwqs of them. 5759 * 5760 * Return: 0 - Success 5761 * -EINVAL - Invalid @cpumask 5762 * -ENOMEM - Failed to allocate memory for attrs or pwqs. 5763 */ 5764 int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) 5765 { 5766 int ret = -EINVAL; 5767 5768 /* 5769 * Not excluding isolated cpus on purpose. 5770 * If the user wishes to include them, we allow that. 
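 * e.g. "echo 0-3 > /sys/devices/virtual/workqueue/cpumask" ends up here
 * and may include CPUs isolated with isolcpus= (the range is illustrative).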
5771 */ 5772 cpumask_and(cpumask, cpumask, cpu_possible_mask); 5773 if (!cpumask_empty(cpumask)) { 5774 apply_wqattrs_lock(); 5775 if (cpumask_equal(cpumask, wq_unbound_cpumask)) { 5776 ret = 0; 5777 goto out_unlock; 5778 } 5779 5780 ret = workqueue_apply_unbound_cpumask(cpumask); 5781 5782 out_unlock: 5783 apply_wqattrs_unlock(); 5784 } 5785 5786 return ret; 5787 } 5788 5789 static int parse_affn_scope(const char *val) 5790 { 5791 int i; 5792 5793 for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { 5794 if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) 5795 return i; 5796 } 5797 return -EINVAL; 5798 } 5799 5800 static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) 5801 { 5802 int affn; 5803 5804 affn = parse_affn_scope(val); 5805 if (affn < 0) 5806 return affn; 5807 5808 wq_affn_dfl = affn; 5809 return 0; 5810 } 5811 5812 static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) 5813 { 5814 return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); 5815 } 5816 5817 static const struct kernel_param_ops wq_affn_dfl_ops = { 5818 .set = wq_affn_dfl_set, 5819 .get = wq_affn_dfl_get, 5820 }; 5821 5822 module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); 5823 5824 #ifdef CONFIG_SYSFS 5825 /* 5826 * Workqueues with WQ_SYSFS flag set is visible to userland via 5827 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the 5828 * following attributes. 5829 * 5830 * per_cpu RO bool : whether the workqueue is per-cpu or unbound 5831 * max_active RW int : maximum number of in-flight work items 5832 * 5833 * Unbound workqueues have the following extra attributes. 5834 * 5835 * nice RW int : nice value of the workers 5836 * cpumask RW mask : bitmask of allowed CPUs for the workers 5837 * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) 5838 */ 5839 struct wq_device { 5840 struct workqueue_struct *wq; 5841 struct device dev; 5842 }; 5843 5844 static struct workqueue_struct *dev_to_wq(struct device *dev) 5845 { 5846 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); 5847 5848 return wq_dev->wq; 5849 } 5850 5851 static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, 5852 char *buf) 5853 { 5854 struct workqueue_struct *wq = dev_to_wq(dev); 5855 5856 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 5857 } 5858 static DEVICE_ATTR_RO(per_cpu); 5859 5860 static ssize_t max_active_show(struct device *dev, 5861 struct device_attribute *attr, char *buf) 5862 { 5863 struct workqueue_struct *wq = dev_to_wq(dev); 5864 5865 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 5866 } 5867 5868 static ssize_t max_active_store(struct device *dev, 5869 struct device_attribute *attr, const char *buf, 5870 size_t count) 5871 { 5872 struct workqueue_struct *wq = dev_to_wq(dev); 5873 int val; 5874 5875 if (sscanf(buf, "%d", &val) != 1 || val <= 0) 5876 return -EINVAL; 5877 5878 workqueue_set_max_active(wq, val); 5879 return count; 5880 } 5881 static DEVICE_ATTR_RW(max_active); 5882 5883 static struct attribute *wq_sysfs_attrs[] = { 5884 &dev_attr_per_cpu.attr, 5885 &dev_attr_max_active.attr, 5886 NULL, 5887 }; 5888 ATTRIBUTE_GROUPS(wq_sysfs); 5889 5890 static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, 5891 char *buf) 5892 { 5893 struct workqueue_struct *wq = dev_to_wq(dev); 5894 int written; 5895 5896 mutex_lock(&wq->mutex); 5897 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); 5898 
mutex_unlock(&wq->mutex); 5899 5900 return written; 5901 } 5902 5903 /* prepare workqueue_attrs for sysfs store operations */ 5904 static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) 5905 { 5906 struct workqueue_attrs *attrs; 5907 5908 lockdep_assert_held(&wq_pool_mutex); 5909 5910 attrs = alloc_workqueue_attrs(); 5911 if (!attrs) 5912 return NULL; 5913 5914 copy_workqueue_attrs(attrs, wq->unbound_attrs); 5915 return attrs; 5916 } 5917 5918 static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, 5919 const char *buf, size_t count) 5920 { 5921 struct workqueue_struct *wq = dev_to_wq(dev); 5922 struct workqueue_attrs *attrs; 5923 int ret = -ENOMEM; 5924 5925 apply_wqattrs_lock(); 5926 5927 attrs = wq_sysfs_prep_attrs(wq); 5928 if (!attrs) 5929 goto out_unlock; 5930 5931 if (sscanf(buf, "%d", &attrs->nice) == 1 && 5932 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) 5933 ret = apply_workqueue_attrs_locked(wq, attrs); 5934 else 5935 ret = -EINVAL; 5936 5937 out_unlock: 5938 apply_wqattrs_unlock(); 5939 free_workqueue_attrs(attrs); 5940 return ret ?: count; 5941 } 5942 5943 static ssize_t wq_cpumask_show(struct device *dev, 5944 struct device_attribute *attr, char *buf) 5945 { 5946 struct workqueue_struct *wq = dev_to_wq(dev); 5947 int written; 5948 5949 mutex_lock(&wq->mutex); 5950 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", 5951 cpumask_pr_args(wq->unbound_attrs->cpumask)); 5952 mutex_unlock(&wq->mutex); 5953 return written; 5954 } 5955 5956 static ssize_t wq_cpumask_store(struct device *dev, 5957 struct device_attribute *attr, 5958 const char *buf, size_t count) 5959 { 5960 struct workqueue_struct *wq = dev_to_wq(dev); 5961 struct workqueue_attrs *attrs; 5962 int ret = -ENOMEM; 5963 5964 apply_wqattrs_lock(); 5965 5966 attrs = wq_sysfs_prep_attrs(wq); 5967 if (!attrs) 5968 goto out_unlock; 5969 5970 ret = cpumask_parse(buf, attrs->cpumask); 5971 if (!ret) 5972 ret = apply_workqueue_attrs_locked(wq, attrs); 5973 5974 out_unlock: 5975 apply_wqattrs_unlock(); 5976 free_workqueue_attrs(attrs); 5977 return ret ?: count; 5978 } 5979 5980 static ssize_t wq_affn_scope_show(struct device *dev, 5981 struct device_attribute *attr, char *buf) 5982 { 5983 struct workqueue_struct *wq = dev_to_wq(dev); 5984 int written; 5985 5986 mutex_lock(&wq->mutex); 5987 written = scnprintf(buf, PAGE_SIZE, "%s\n", 5988 wq_affn_names[wq->unbound_attrs->affn_scope]); 5989 mutex_unlock(&wq->mutex); 5990 5991 return written; 5992 } 5993 5994 static ssize_t wq_affn_scope_store(struct device *dev, 5995 struct device_attribute *attr, 5996 const char *buf, size_t count) 5997 { 5998 struct workqueue_struct *wq = dev_to_wq(dev); 5999 struct workqueue_attrs *attrs; 6000 int affn, ret = -ENOMEM; 6001 6002 affn = parse_affn_scope(buf); 6003 if (affn < 0) 6004 return affn; 6005 6006 apply_wqattrs_lock(); 6007 attrs = wq_sysfs_prep_attrs(wq); 6008 if (attrs) { 6009 attrs->affn_scope = affn; 6010 ret = apply_workqueue_attrs_locked(wq, attrs); 6011 } 6012 apply_wqattrs_unlock(); 6013 free_workqueue_attrs(attrs); 6014 return ret ?: count; 6015 } 6016 6017 static struct device_attribute wq_sysfs_unbound_attrs[] = { 6018 __ATTR(nice, 0644, wq_nice_show, wq_nice_store), 6019 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), 6020 __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), 6021 __ATTR_NULL, 6022 }; 6023 6024 static struct bus_type wq_subsys = { 6025 .name = "workqueue", 6026 .dev_groups = wq_sysfs_groups, 6027 }; 6028 6029 static ssize_t 
wq_unbound_cpumask_show(struct device *dev, 6030 struct device_attribute *attr, char *buf) 6031 { 6032 int written; 6033 6034 mutex_lock(&wq_pool_mutex); 6035 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", 6036 cpumask_pr_args(wq_unbound_cpumask)); 6037 mutex_unlock(&wq_pool_mutex); 6038 6039 return written; 6040 } 6041 6042 static ssize_t wq_unbound_cpumask_store(struct device *dev, 6043 struct device_attribute *attr, const char *buf, size_t count) 6044 { 6045 cpumask_var_t cpumask; 6046 int ret; 6047 6048 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 6049 return -ENOMEM; 6050 6051 ret = cpumask_parse(buf, cpumask); 6052 if (!ret) 6053 ret = workqueue_set_unbound_cpumask(cpumask); 6054 6055 free_cpumask_var(cpumask); 6056 return ret ? ret : count; 6057 } 6058 6059 static struct device_attribute wq_sysfs_cpumask_attr = 6060 __ATTR(cpumask, 0644, wq_unbound_cpumask_show, 6061 wq_unbound_cpumask_store); 6062 6063 static int __init wq_sysfs_init(void) 6064 { 6065 struct device *dev_root; 6066 int err; 6067 6068 err = subsys_virtual_register(&wq_subsys, NULL); 6069 if (err) 6070 return err; 6071 6072 dev_root = bus_get_dev_root(&wq_subsys); 6073 if (dev_root) { 6074 err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); 6075 put_device(dev_root); 6076 } 6077 return err; 6078 } 6079 core_initcall(wq_sysfs_init); 6080 6081 static void wq_device_release(struct device *dev) 6082 { 6083 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); 6084 6085 kfree(wq_dev); 6086 } 6087 6088 /** 6089 * workqueue_sysfs_register - make a workqueue visible in sysfs 6090 * @wq: the workqueue to register 6091 * 6092 * Expose @wq in sysfs under /sys/bus/workqueue/devices. 6093 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set 6094 * which is the preferred method. 6095 * 6096 * Workqueue user should use this function directly iff it wants to apply 6097 * workqueue_attrs before making the workqueue visible in sysfs; otherwise, 6098 * apply_workqueue_attrs() may race against userland updating the 6099 * attributes. 6100 * 6101 * Return: 0 on success, -errno on failure. 6102 */ 6103 int workqueue_sysfs_register(struct workqueue_struct *wq) 6104 { 6105 struct wq_device *wq_dev; 6106 int ret; 6107 6108 /* 6109 * Adjusting max_active or creating new pwqs by applying 6110 * attributes breaks ordering guarantee. Disallow exposing ordered 6111 * workqueues. 6112 */ 6113 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) 6114 return -EINVAL; 6115 6116 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); 6117 if (!wq_dev) 6118 return -ENOMEM; 6119 6120 wq_dev->wq = wq; 6121 wq_dev->dev.bus = &wq_subsys; 6122 wq_dev->dev.release = wq_device_release; 6123 dev_set_name(&wq_dev->dev, "%s", wq->name); 6124 6125 /* 6126 * unbound_attrs are created separately. Suppress uevent until 6127 * everything is ready. 
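 * For an unbound workqueue named "foo" (illustrative), the result is
 * /sys/bus/workqueue/devices/foo/ with the extra nice, cpumask and
 * affinity_scope attributes created below.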
6128 */ 6129 dev_set_uevent_suppress(&wq_dev->dev, true); 6130 6131 ret = device_register(&wq_dev->dev); 6132 if (ret) { 6133 put_device(&wq_dev->dev); 6134 wq->wq_dev = NULL; 6135 return ret; 6136 } 6137 6138 if (wq->flags & WQ_UNBOUND) { 6139 struct device_attribute *attr; 6140 6141 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { 6142 ret = device_create_file(&wq_dev->dev, attr); 6143 if (ret) { 6144 device_unregister(&wq_dev->dev); 6145 wq->wq_dev = NULL; 6146 return ret; 6147 } 6148 } 6149 } 6150 6151 dev_set_uevent_suppress(&wq_dev->dev, false); 6152 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); 6153 return 0; 6154 } 6155 6156 /** 6157 * workqueue_sysfs_unregister - undo workqueue_sysfs_register() 6158 * @wq: the workqueue to unregister 6159 * 6160 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 6161 */ 6162 static void workqueue_sysfs_unregister(struct workqueue_struct *wq) 6163 { 6164 struct wq_device *wq_dev = wq->wq_dev; 6165 6166 if (!wq->wq_dev) 6167 return; 6168 6169 wq->wq_dev = NULL; 6170 device_unregister(&wq_dev->dev); 6171 } 6172 #else /* CONFIG_SYSFS */ 6173 static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } 6174 #endif /* CONFIG_SYSFS */ 6175 6176 /* 6177 * Workqueue watchdog. 6178 * 6179 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal 6180 * flush dependency, a concurrency managed work item which stays RUNNING 6181 * indefinitely. Workqueue stalls can be very difficult to debug as the 6182 * usual warning mechanisms don't trigger and internal workqueue state is 6183 * largely opaque. 6184 * 6185 * Workqueue watchdog monitors all worker pools periodically and dumps 6186 * state if some pools failed to make forward progress for a while where 6187 * forward progress is defined as the first item on ->worklist changing. 6188 * 6189 * This mechanism is controlled through the kernel parameter 6190 * "workqueue.watchdog_thresh" which can be updated at runtime through the 6191 * corresponding sysfs parameter file. 6192 */ 6193 #ifdef CONFIG_WQ_WATCHDOG 6194 6195 static unsigned long wq_watchdog_thresh = 30; 6196 static struct timer_list wq_watchdog_timer; 6197 6198 static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; 6199 static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; 6200 6201 /* 6202 * Show workers that might prevent the processing of pending work items. 6203 * The only candidates are CPU-bound workers in the running state. 6204 * Pending work items should be handled by another idle worker 6205 * in all other situations. 6206 */ 6207 static void show_cpu_pool_hog(struct worker_pool *pool) 6208 { 6209 struct worker *worker; 6210 unsigned long flags; 6211 int bkt; 6212 6213 raw_spin_lock_irqsave(&pool->lock, flags); 6214 6215 hash_for_each(pool->busy_hash, bkt, worker, hentry) { 6216 if (task_is_running(worker->task)) { 6217 /* 6218 * Defer printing to avoid deadlocks in console 6219 * drivers that queue work while holding locks 6220 * also taken in their write paths. 
6221 */ 6222 printk_deferred_enter(); 6223 6224 pr_info("pool %d:\n", pool->id); 6225 sched_show_task(worker->task); 6226 6227 printk_deferred_exit(); 6228 } 6229 } 6230 6231 raw_spin_unlock_irqrestore(&pool->lock, flags); 6232 } 6233 6234 static void show_cpu_pools_hogs(void) 6235 { 6236 struct worker_pool *pool; 6237 int pi; 6238 6239 pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); 6240 6241 rcu_read_lock(); 6242 6243 for_each_pool(pool, pi) { 6244 if (pool->cpu_stall) 6245 show_cpu_pool_hog(pool); 6246 6247 } 6248 6249 rcu_read_unlock(); 6250 } 6251 6252 static void wq_watchdog_reset_touched(void) 6253 { 6254 int cpu; 6255 6256 wq_watchdog_touched = jiffies; 6257 for_each_possible_cpu(cpu) 6258 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; 6259 } 6260 6261 static void wq_watchdog_timer_fn(struct timer_list *unused) 6262 { 6263 unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; 6264 bool lockup_detected = false; 6265 bool cpu_pool_stall = false; 6266 unsigned long now = jiffies; 6267 struct worker_pool *pool; 6268 int pi; 6269 6270 if (!thresh) 6271 return; 6272 6273 rcu_read_lock(); 6274 6275 for_each_pool(pool, pi) { 6276 unsigned long pool_ts, touched, ts; 6277 6278 pool->cpu_stall = false; 6279 if (list_empty(&pool->worklist)) 6280 continue; 6281 6282 /* 6283 * If a virtual machine is stopped by the host it can look to 6284 * the watchdog like a stall. 6285 */ 6286 kvm_check_and_clear_guest_paused(); 6287 6288 /* get the latest of pool and touched timestamps */ 6289 if (pool->cpu >= 0) 6290 touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); 6291 else 6292 touched = READ_ONCE(wq_watchdog_touched); 6293 pool_ts = READ_ONCE(pool->watchdog_ts); 6294 6295 if (time_after(pool_ts, touched)) 6296 ts = pool_ts; 6297 else 6298 ts = touched; 6299 6300 /* did we stall? 
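 * i.e. stalled iff now > max(pool->watchdog_ts, last touch) + thresh,
 * where thresh = wq_watchdog_thresh * HZ (30s by default)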
*/ 6301 if (time_after(now, ts + thresh)) { 6302 lockup_detected = true; 6303 if (pool->cpu >= 0) { 6304 pool->cpu_stall = true; 6305 cpu_pool_stall = true; 6306 } 6307 pr_emerg("BUG: workqueue lockup - pool"); 6308 pr_cont_pool_info(pool); 6309 pr_cont(" stuck for %us!\n", 6310 jiffies_to_msecs(now - pool_ts) / 1000); 6311 } 6312 6313 6314 } 6315 6316 rcu_read_unlock(); 6317 6318 if (lockup_detected) 6319 show_all_workqueues(); 6320 6321 if (cpu_pool_stall) 6322 show_cpu_pools_hogs(); 6323 6324 wq_watchdog_reset_touched(); 6325 mod_timer(&wq_watchdog_timer, jiffies + thresh); 6326 } 6327 6328 notrace void wq_watchdog_touch(int cpu) 6329 { 6330 if (cpu >= 0) 6331 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; 6332 6333 wq_watchdog_touched = jiffies; 6334 } 6335 6336 static void wq_watchdog_set_thresh(unsigned long thresh) 6337 { 6338 wq_watchdog_thresh = 0; 6339 del_timer_sync(&wq_watchdog_timer); 6340 6341 if (thresh) { 6342 wq_watchdog_thresh = thresh; 6343 wq_watchdog_reset_touched(); 6344 mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); 6345 } 6346 } 6347 6348 static int wq_watchdog_param_set_thresh(const char *val, 6349 const struct kernel_param *kp) 6350 { 6351 unsigned long thresh; 6352 int ret; 6353 6354 ret = kstrtoul(val, 0, &thresh); 6355 if (ret) 6356 return ret; 6357 6358 if (system_wq) 6359 wq_watchdog_set_thresh(thresh); 6360 else 6361 wq_watchdog_thresh = thresh; 6362 6363 return 0; 6364 } 6365 6366 static const struct kernel_param_ops wq_watchdog_thresh_ops = { 6367 .set = wq_watchdog_param_set_thresh, 6368 .get = param_get_ulong, 6369 }; 6370 6371 module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, 6372 0644); 6373 6374 static void wq_watchdog_init(void) 6375 { 6376 timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); 6377 wq_watchdog_set_thresh(wq_watchdog_thresh); 6378 } 6379 6380 #else /* CONFIG_WQ_WATCHDOG */ 6381 6382 static inline void wq_watchdog_init(void) { } 6383 6384 #endif /* CONFIG_WQ_WATCHDOG */ 6385 6386 /** 6387 * workqueue_init_early - early init for workqueue subsystem 6388 * 6389 * This is the first step of three-staged workqueue subsystem initialization and 6390 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are 6391 * up. It sets up all the data structures and system workqueues and allows early 6392 * boot code to create workqueues and queue/cancel work items. Actual work item 6393 * execution starts only after kthreads can be created and scheduled right 6394 * before early initcalls. 
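 * The three stages, in order, are workqueue_init_early(),
 * workqueue_init() and workqueue_init_topology().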
6395 */
6396 void __init workqueue_init_early(void)
6397 {
6398 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
6399 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
6400 int i, cpu;
6401
6402 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
6403
6404 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
6405 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
6406 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
6407
6408 if (!cpumask_empty(&wq_cmdline_cpumask))
6409 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
6410
6411 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
6412
6413 wq_update_pod_attrs_buf = alloc_workqueue_attrs();
6414 BUG_ON(!wq_update_pod_attrs_buf);
6415
6416 BUG_ON(!alloc_cpumask_var(&wq_update_pod_cpumask_buf, GFP_KERNEL));
6417
6418 /* initialize WQ_AFFN_SYSTEM pods */
6419 pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
6420 pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
6421 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
6422 BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
6423
6424 BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
6425
6429 pt->nr_pods = 1;
6430 cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
6431 pt->pod_node[0] = NUMA_NO_NODE;
6432 pt->cpu_pod[0] = 0;
6433
6434 /* initialize CPU pools */
6435 for_each_possible_cpu(cpu) {
6436 struct worker_pool *pool;
6437
6438 i = 0;
6439 for_each_cpu_worker_pool(pool, cpu) {
6440 BUG_ON(init_worker_pool(pool));
6441 pool->cpu = cpu;
6442 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
6443 pool->attrs->nice = std_nice[i++];
6444 pool->node = cpu_to_node(cpu);
6445
6446 /* alloc pool ID */
6447 mutex_lock(&wq_pool_mutex);
6448 BUG_ON(worker_pool_assign_id(pool));
6449 mutex_unlock(&wq_pool_mutex);
6450 }
6451 }
6452
6453 /* create default unbound and ordered wq attrs */
6454 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
6455 struct workqueue_attrs *attrs;
6456
6457 BUG_ON(!(attrs = alloc_workqueue_attrs()));
6458 attrs->nice = std_nice[i];
6459 unbound_std_wq_attrs[i] = attrs;
6460
6461 /*
6462 * An ordered wq should have only one pwq as ordering is
6463 * guaranteed by max_active which is enforced by pwqs.
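 * e.g. a queue created with alloc_ordered_workqueue("foo", 0) (the name
 * is illustrative) picks up these attrs and executes at most one work
 * item at a time.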
6464 */
6465 BUG_ON(!(attrs = alloc_workqueue_attrs()));
6466 attrs->nice = std_nice[i];
6467 attrs->ordered = true;
6468 ordered_wq_attrs[i] = attrs;
6469 }
6470
6471 system_wq = alloc_workqueue("events", 0, 0);
6472 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
6473 system_long_wq = alloc_workqueue("events_long", 0, 0);
6474 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
6475 WQ_MAX_ACTIVE);
6476 system_freezable_wq = alloc_workqueue("events_freezable",
6477 WQ_FREEZABLE, 0);
6478 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
6479 WQ_POWER_EFFICIENT, 0);
6480 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
6481 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
6482 0);
6483 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
6484 !system_unbound_wq || !system_freezable_wq ||
6485 !system_power_efficient_wq ||
6486 !system_freezable_power_efficient_wq);
6487 }
6488
6489 static void __init wq_cpu_intensive_thresh_init(void)
6490 {
6491 unsigned long thresh;
6492 unsigned long bogo;
6493
6494 /* the release worker is needed even when the threshold is user-set */
6495 pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
6496 BUG_ON(IS_ERR(pwq_release_worker));
6497
6498 /* if the user set it to a specific value, keep it */
6499 if (wq_cpu_intensive_thresh_us != ULONG_MAX)
6500 return;
6501 /*
6502 * The default of 10ms is derived from the fact that most modern (as of
6503 * 2023) processors can do a lot in 10ms and that it's just below what
6504 * most consider human-perceivable. However, the kernel also runs on a
6505 * lot slower CPUs including microcontrollers where the threshold is way
6506 * too low.
6507 *
6508 * Let's scale the threshold up to 1 second if BogoMips is below 4000.
6509 * This is by no means accurate but it doesn't have to be. The mechanism
6510 * is still useful even when the threshold is fully scaled up. Also, as
6511 * the reports would usually be applicable to everyone, some machines
6512 * operating on longer thresholds won't significantly diminish their
6513 * usefulness.
6514 */
6515 thresh = 10 * USEC_PER_MSEC;
6516
6517 /* see init/calibrate.c for lpj -> BogoMIPS calculation */
6518 bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
6519 if (bogo < 4000)
6520 thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
6521
6522 pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
6523 loops_per_jiffy, bogo, thresh);
6524
6525 wq_cpu_intensive_thresh_us = thresh;
6526 }
6527
6528 /**
6529 * workqueue_init - bring workqueue subsystem fully online
6530 *
6531 * This is the second step of three-staged workqueue subsystem initialization
6532 * and invoked as soon as kthreads can be created and scheduled. Workqueues have
6533 * been created and work items queued on them, but there are no kworkers
6534 * executing the work items yet. Populate the worker pools with the initial
6535 * workers and enable future kworker creations.
6536 */
6537 void __init workqueue_init(void)
6538 {
6539 struct workqueue_struct *wq;
6540 struct worker_pool *pool;
6541 int cpu, bkt;
6542
6543 wq_cpu_intensive_thresh_init();
6544
6545 mutex_lock(&wq_pool_mutex);
6546
6547 /*
6548 * Per-cpu pools created earlier could be missing node hint. Fix them
6549 * up. Also, create a rescuer for workqueues that requested it.
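 * (A rescuer is requested by passing WQ_MEM_RECLAIM to alloc_workqueue(),
 * e.g. alloc_workqueue("foo_reclaim", WQ_MEM_RECLAIM, 0) - the name is
 * illustrative.)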
6550 */
6551 for_each_possible_cpu(cpu) {
6552 for_each_cpu_worker_pool(pool, cpu) {
6553 pool->node = cpu_to_node(cpu);
6554 }
6555 }
6556
6557 list_for_each_entry(wq, &workqueues, list) {
6558 WARN(init_rescuer(wq),
6559 "workqueue: failed to create early rescuer for %s",
6560 wq->name);
6561 }
6562
6563 mutex_unlock(&wq_pool_mutex);
6564
6565 /* create the initial workers */
6566 for_each_online_cpu(cpu) {
6567 for_each_cpu_worker_pool(pool, cpu) {
6568 pool->flags &= ~POOL_DISASSOCIATED;
6569 BUG_ON(!create_worker(pool));
6570 }
6571 }
6572
6573 hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
6574 BUG_ON(!create_worker(pool));
6575
6576 wq_online = true;
6577 wq_watchdog_init();
6578 }
6579
6580 /*
6581 * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
6582 * @cpus_share_pod(). Each subset of CPUs that share a pod is assigned a unique
6583 * and consecutive pod ID. The rest of @pt is initialized accordingly.
6584 */
6585 static void __init init_pod_type(struct wq_pod_type *pt,
6586 bool (*cpus_share_pod)(int, int))
6587 {
6588 int cur, pre, cpu, pod;
6589
6590 pt->nr_pods = 0;
6591
6592 /* init @pt->cpu_pod[] according to @cpus_share_pod() */
6593 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
6594 BUG_ON(!pt->cpu_pod);
6595
6596 for_each_possible_cpu(cur) {
6597 for_each_possible_cpu(pre) {
6598 if (pre >= cur) {
6599 pt->cpu_pod[cur] = pt->nr_pods++;
6600 break;
6601 }
6602 if (cpus_share_pod(cur, pre)) {
6603 pt->cpu_pod[cur] = pt->cpu_pod[pre];
6604 break;
6605 }
6606 }
6607 }
6608
6609 /* init the rest to match @pt->cpu_pod[] */
6610 pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
6611 pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
6612 BUG_ON(!pt->pod_cpus || !pt->pod_node);
6613
6614 for (pod = 0; pod < pt->nr_pods; pod++)
6615 BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
6616
6617 for_each_possible_cpu(cpu) {
6618 cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);
6619 pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
6620 }
6621 }
6622
6623 static bool __init cpus_dont_share(int cpu0, int cpu1)
6624 {
6625 return false;
6626 }
6627
6628 static bool __init cpus_share_smt(int cpu0, int cpu1)
6629 {
6630 #ifdef CONFIG_SCHED_SMT
6631 return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
6632 #else
6633 return false;
6634 #endif
6635 }
6636
6637 static bool __init cpus_share_numa(int cpu0, int cpu1)
6638 {
6639 return cpu_to_node(cpu0) == cpu_to_node(cpu1);
6640 }
6641
6642 /**
6643 * workqueue_init_topology - initialize CPU pods for unbound workqueues
6644 *
6645 * This is the third step of three-staged workqueue subsystem initialization and
6646 * invoked after SMP and topology information are fully initialized. It
6647 * initializes the unbound CPU pods accordingly.
6648 */
6649 void __init workqueue_init_topology(void)
6650 {
6651 struct workqueue_struct *wq;
6652 int cpu;
6653
6654 init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
6655 init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
6656 init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
6657 init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
6658
6659 mutex_lock(&wq_pool_mutex);
6660
6661 /*
6662 * Workqueues allocated earlier would have all CPUs sharing the default
6663 * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
6664 * combinations to apply per-pod sharing.
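 * e.g. under the WQ_AFFN_NUMA scope, CPUs on the same NUMA node fall into
 * one pod and share unbound pwqs (illustrative; the scope in effect
 * depends on wq_affn_dfl and per-workqueue attrs).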
6665 */ 6666 list_for_each_entry(wq, &workqueues, list) { 6667 for_each_online_cpu(cpu) { 6668 wq_update_pod(wq, cpu, cpu, true); 6669 } 6670 } 6671 6672 mutex_unlock(&wq_pool_mutex); 6673 } 6674 6675 void __warn_flushing_systemwide_wq(void) 6676 { 6677 pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); 6678 dump_stack(); 6679 } 6680 EXPORT_SYMBOL(__warn_flushing_systemwide_wq); 6681 6682 static int __init workqueue_unbound_cpus_setup(char *str) 6683 { 6684 if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { 6685 cpumask_clear(&wq_cmdline_cpumask); 6686 pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); 6687 } 6688 6689 return 1; 6690 } 6691 __setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); 6692
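/*
 * Example (illustrative): booting with "workqueue.unbound_cpus=0-3" limits
 * unbound workqueue workers to CPUs 0-3; the mask can be changed later at
 * runtime through /sys/devices/virtual/workqueue/cpumask.
 */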