xref: /linux-6.15/kernel/workqueue.c (revision 6038f373)
1 /*
2  * linux/kernel/workqueue.c
3  *
4  * Generic mechanism for defining kernel helper threads for running
5  * arbitrary tasks in process context.
6  *
7  * Started by Ingo Molnar, Copyright (C) 2002
8  *
9  * Derived from the taskqueue/keventd code by:
10  *
11  *   David Woodhouse <[email protected]>
12  *   Andrew Morton
13  *   Kai Petzke <[email protected]>
14  *   Theodore Ts'o <[email protected]>
15  *
16  * Made to use alloc_percpu by Christoph Lameter.
17  */
18 
19 #include <linux/module.h>
20 #include <linux/kernel.h>
21 #include <linux/sched.h>
22 #include <linux/init.h>
23 #include <linux/signal.h>
24 #include <linux/completion.h>
25 #include <linux/workqueue.h>
26 #include <linux/slab.h>
27 #include <linux/cpu.h>
28 #include <linux/notifier.h>
29 #include <linux/kthread.h>
30 #include <linux/hardirq.h>
31 #include <linux/mempolicy.h>
32 #include <linux/freezer.h>
33 #include <linux/kallsyms.h>
34 #include <linux/debug_locks.h>
35 #include <linux/lockdep.h>
36 #include <linux/idr.h>
37 
38 #define CREATE_TRACE_POINTS
39 #include <trace/events/workqueue.h>
40 
41 #include "workqueue_sched.h"
42 
43 enum {
44 	/* global_cwq flags */
45 	GCWQ_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
46 	GCWQ_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
47 	GCWQ_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
48 	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */
49 	GCWQ_HIGHPRI_PENDING	= 1 << 4,	/* highpri works on queue */
50 
51 	/* worker flags */
52 	WORKER_STARTED		= 1 << 0,	/* started */
53 	WORKER_DIE		= 1 << 1,	/* die die die */
54 	WORKER_IDLE		= 1 << 2,	/* is idle */
55 	WORKER_PREP		= 1 << 3,	/* preparing to run works */
56 	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
57 	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
58 	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
59 	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
60 
61 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
62 				  WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
63 
64 	/* gcwq->trustee_state */
65 	TRUSTEE_START		= 0,		/* start */
66 	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
67 	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
68 	TRUSTEE_RELEASE		= 3,		/* release workers */
69 	TRUSTEE_DONE		= 4,		/* trustee is done */
70 
71 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
72 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
73 	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,
74 
75 	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
76 	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
77 
78 	MAYDAY_INITIAL_TIMEOUT	= HZ / 100,	/* call for help after 10ms */
79 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
80 	CREATE_COOLDOWN		= HZ,		/* time to breathe after a failed create */
81 	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
82 
83 	/*
84 	 * Rescue workers are used only in emergencies and are shared by
85 	 * all cpus.  Give them a nice level of -20.
86 	 */
87 	RESCUER_NICE_LEVEL	= -20,
88 };
89 
90 /*
91  * Structure fields follow one of the following exclusion rules.
92  *
93  * I: Modifiable by initialization/destruction paths and read-only for
94  *    everyone else.
95  *
96  * P: Preemption protected.  Disabling preemption is enough and should
97  *    only be modified and accessed from the local cpu.
98  *
99  * L: gcwq->lock protected.  Access with gcwq->lock held.
100  *
101  * X: During normal operation, modification requires gcwq->lock and
102  *    should be done only from local cpu.  Either disabling preemption
103  *    on local cpu or grabbing gcwq->lock is enough for read access.
104  *    If GCWQ_DISASSOCIATED is set, it's identical to L.
105  *
106  * F: wq->flush_mutex protected.
107  *
108  * W: workqueue_lock protected.
109  */
110 
111 struct global_cwq;
112 
113 /*
114  * The poor guys doing the actual heavy lifting.  All on-duty workers
115  * are either serving the manager role, on idle list or on busy hash.
116  */
117 struct worker {
118 	/* on idle list while idle, on busy hash table while busy */
119 	union {
120 		struct list_head	entry;	/* L: while idle */
121 		struct hlist_node	hentry;	/* L: while busy */
122 	};
123 
124 	struct work_struct	*current_work;	/* L: work being processed */
125 	struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
126 	struct list_head	scheduled;	/* L: scheduled works */
127 	struct task_struct	*task;		/* I: worker task */
128 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
129 	/* 64 bytes boundary on 64bit, 32 on 32bit */
130 	unsigned long		last_active;	/* L: last active timestamp */
131 	unsigned int		flags;		/* X: flags */
132 	int			id;		/* I: worker id */
133 	struct work_struct	rebind_work;	/* L: rebind worker to cpu */
134 };
135 
136 /*
137  * Global per-cpu workqueue.  There's one and only one for each cpu
138  * and all works are queued and processed here regardless of their
139  * target workqueues.
140  */
141 struct global_cwq {
142 	spinlock_t		lock;		/* the gcwq lock */
143 	struct list_head	worklist;	/* L: list of pending works */
144 	unsigned int		cpu;		/* I: the associated cpu */
145 	unsigned int		flags;		/* L: GCWQ_* flags */
146 
147 	int			nr_workers;	/* L: total number of workers */
148 	int			nr_idle;	/* L: currently idle ones */
149 
150 	/* workers are chained either in the idle_list or busy_hash */
151 	struct list_head	idle_list;	/* X: list of idle workers */
152 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
153 						/* L: hash of busy workers */
154 
155 	struct timer_list	idle_timer;	/* L: worker idle timeout */
156 	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
157 
158 	struct ida		worker_ida;	/* L: for worker IDs */
159 
160 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
161 	unsigned int		trustee_state;	/* L: trustee state */
162 	wait_queue_head_t	trustee_wait;	/* trustee wait */
163 	struct worker		*first_idle;	/* L: first idle worker */
164 } ____cacheline_aligned_in_smp;
165 
166 /*
167  * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
168  * work_struct->data are used for flags and thus cwqs need to be
169  * aligned at two's power of the number of flag bits.
170  */
171 struct cpu_workqueue_struct {
172 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
173 	struct workqueue_struct *wq;		/* I: the owning workqueue */
174 	int			work_color;	/* L: current color */
175 	int			flush_color;	/* L: flushing color */
176 	int			nr_in_flight[WORK_NR_COLORS];
177 						/* L: nr of in_flight works */
178 	int			nr_active;	/* L: nr of active works */
179 	int			max_active;	/* L: max active works */
180 	struct list_head	delayed_works;	/* L: delayed works */
181 };
182 
183 /*
184  * Structure used to wait for workqueue flush.
185  */
186 struct wq_flusher {
187 	struct list_head	list;		/* F: list of flushers */
188 	int			flush_color;	/* F: flush color waiting for */
189 	struct completion	done;		/* flush completion */
190 };
191 
192 /*
193  * All cpumasks are assumed to be always set on UP and thus can't be
194  * used to determine whether there's something to be done.
195  */
196 #ifdef CONFIG_SMP
197 typedef cpumask_var_t mayday_mask_t;
198 #define mayday_test_and_set_cpu(cpu, mask)	\
199 	cpumask_test_and_set_cpu((cpu), (mask))
200 #define mayday_clear_cpu(cpu, mask)		cpumask_clear_cpu((cpu), (mask))
201 #define for_each_mayday_cpu(cpu, mask)		for_each_cpu((cpu), (mask))
202 #define alloc_mayday_mask(maskp, gfp)		zalloc_cpumask_var((maskp), (gfp))
203 #define free_mayday_mask(mask)			free_cpumask_var((mask))
204 #else
205 typedef unsigned long mayday_mask_t;
206 #define mayday_test_and_set_cpu(cpu, mask)	test_and_set_bit(0, &(mask))
207 #define mayday_clear_cpu(cpu, mask)		clear_bit(0, &(mask))
208 #define for_each_mayday_cpu(cpu, mask)		if ((cpu) = 0, (mask))
209 #define alloc_mayday_mask(maskp, gfp)		true
210 #define free_mayday_mask(mask)			do { } while (0)
211 #endif
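
/*
 * Editor's illustrative sketch (not part of the original file): the
 * consumer side (the rescuer) is expected to walk the mask roughly as
 * follows, so that the same source compiles on both SMP and UP:
 *
 *	unsigned int cpu;
 *
 *	for_each_mayday_cpu(cpu, wq->mayday_mask) {
 *		mayday_clear_cpu(cpu, wq->mayday_mask);
 *		... rescue the works of the gcwq for cpu ...
 *	}
 *
 * On UP the iterator degenerates into a single pass guarded by bit 0.
 */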
212 
213 /*
214  * The externally visible workqueue abstraction is an array of
215  * per-CPU workqueues:
216  */
217 struct workqueue_struct {
218 	unsigned int		flags;		/* I: WQ_* flags */
219 	union {
220 		struct cpu_workqueue_struct __percpu	*pcpu;
221 		struct cpu_workqueue_struct		*single;
222 		unsigned long				v;
223 	} cpu_wq;				/* I: cwq's */
224 	struct list_head	list;		/* W: list of all workqueues */
225 
226 	struct mutex		flush_mutex;	/* protects wq flushing */
227 	int			work_color;	/* F: current work color */
228 	int			flush_color;	/* F: current flush color */
229 	atomic_t		nr_cwqs_to_flush; /* flush in progress */
230 	struct wq_flusher	*first_flusher;	/* F: first flusher */
231 	struct list_head	flusher_queue;	/* F: flush waiters */
232 	struct list_head	flusher_overflow; /* F: flush overflow list */
233 
234 	mayday_mask_t		mayday_mask;	/* cpus requesting rescue */
235 	struct worker		*rescuer;	/* I: rescue worker */
236 
237 	int			saved_max_active; /* W: saved cwq max_active */
238 	const char		*name;		/* I: workqueue name */
239 #ifdef CONFIG_LOCKDEP
240 	struct lockdep_map	lockdep_map;
241 #endif
242 };
243 
244 struct workqueue_struct *system_wq __read_mostly;
245 struct workqueue_struct *system_long_wq __read_mostly;
246 struct workqueue_struct *system_nrt_wq __read_mostly;
247 struct workqueue_struct *system_unbound_wq __read_mostly;
248 EXPORT_SYMBOL_GPL(system_wq);
249 EXPORT_SYMBOL_GPL(system_long_wq);
250 EXPORT_SYMBOL_GPL(system_nrt_wq);
251 EXPORT_SYMBOL_GPL(system_unbound_wq);
252 
253 #define for_each_busy_worker(worker, i, pos, gcwq)			\
254 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
255 		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
256 
257 static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
258 				  unsigned int sw)
259 {
260 	if (cpu < nr_cpu_ids) {
261 		if (sw & 1) {
262 			cpu = cpumask_next(cpu, mask);
263 			if (cpu < nr_cpu_ids)
264 				return cpu;
265 		}
266 		if (sw & 2)
267 			return WORK_CPU_UNBOUND;
268 	}
269 	return WORK_CPU_NONE;
270 }
271 
272 static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
273 				struct workqueue_struct *wq)
274 {
275 	return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
276 }
277 
278 /*
279  * CPU iterators
280  *
281  * An extra gcwq is defined for an invalid cpu number
282  * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
283  * specific CPU.  The following iterators are similar to
284  * for_each_*_cpu() iterators but also considers the unbound gcwq.
285  *
286  * for_each_gcwq_cpu()		: possible CPUs + WORK_CPU_UNBOUND
287  * for_each_online_gcwq_cpu()	: online CPUs + WORK_CPU_UNBOUND
288  * for_each_cwq_cpu()		: possible CPUs for bound workqueues,
289  *				  WORK_CPU_UNBOUND for unbound workqueues
290  */
291 #define for_each_gcwq_cpu(cpu)						\
292 	for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);		\
293 	     (cpu) < WORK_CPU_NONE;					\
294 	     (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
295 
296 #define for_each_online_gcwq_cpu(cpu)					\
297 	for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);		\
298 	     (cpu) < WORK_CPU_NONE;					\
299 	     (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
300 
301 #define for_each_cwq_cpu(cpu, wq)					\
302 	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));	\
303 	     (cpu) < WORK_CPU_NONE;					\
304 	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
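
/*
 * Editor's illustrative sketch (not part of the original file): iterating
 * every gcwq, including the unbound one, typically looks like
 *
 *	unsigned int cpu;
 *
 *	for_each_gcwq_cpu(cpu) {
 *		struct global_cwq *gcwq = get_gcwq(cpu);
 *		...
 *	}
 *
 * where get_gcwq(), defined below, maps WORK_CPU_UNBOUND to the
 * unbound_global_cwq instance.
 */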
305 
306 #ifdef CONFIG_LOCKDEP
307 /**
308  * in_workqueue_context() - in context of specified workqueue?
309  * @wq: the workqueue of interest
310  *
311  * Checks lockdep state to see if the current task is executing from
312  * within a workqueue item.  This function exists only if lockdep is
313  * enabled.
314  */
315 int in_workqueue_context(struct workqueue_struct *wq)
316 {
317 	return lock_is_held(&wq->lockdep_map);
318 }
319 #endif
320 
321 #ifdef CONFIG_DEBUG_OBJECTS_WORK
322 
323 static struct debug_obj_descr work_debug_descr;
324 
325 /*
326  * fixup_init is called when:
327  * - an active object is initialized
328  */
329 static int work_fixup_init(void *addr, enum debug_obj_state state)
330 {
331 	struct work_struct *work = addr;
332 
333 	switch (state) {
334 	case ODEBUG_STATE_ACTIVE:
335 		cancel_work_sync(work);
336 		debug_object_init(work, &work_debug_descr);
337 		return 1;
338 	default:
339 		return 0;
340 	}
341 }
342 
343 /*
344  * fixup_activate is called when:
345  * - an active object is activated
346  * - an unknown object is activated (might be a statically initialized object)
347  */
348 static int work_fixup_activate(void *addr, enum debug_obj_state state)
349 {
350 	struct work_struct *work = addr;
351 
352 	switch (state) {
353 
354 	case ODEBUG_STATE_NOTAVAILABLE:
355 		/*
356 		 * This is not really a fixup. The work struct was
357 		 * statically initialized. We just make sure that it
358 		 * is tracked in the object tracker.
359 		 */
360 		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
361 			debug_object_init(work, &work_debug_descr);
362 			debug_object_activate(work, &work_debug_descr);
363 			return 0;
364 		}
365 		WARN_ON_ONCE(1);
366 		return 0;
367 
368 	case ODEBUG_STATE_ACTIVE:
369 		WARN_ON(1);
370 
371 	default:
372 		return 0;
373 	}
374 }
375 
376 /*
377  * fixup_free is called when:
378  * - an active object is freed
379  */
380 static int work_fixup_free(void *addr, enum debug_obj_state state)
381 {
382 	struct work_struct *work = addr;
383 
384 	switch (state) {
385 	case ODEBUG_STATE_ACTIVE:
386 		cancel_work_sync(work);
387 		debug_object_free(work, &work_debug_descr);
388 		return 1;
389 	default:
390 		return 0;
391 	}
392 }
393 
394 static struct debug_obj_descr work_debug_descr = {
395 	.name		= "work_struct",
396 	.fixup_init	= work_fixup_init,
397 	.fixup_activate	= work_fixup_activate,
398 	.fixup_free	= work_fixup_free,
399 };
400 
401 static inline void debug_work_activate(struct work_struct *work)
402 {
403 	debug_object_activate(work, &work_debug_descr);
404 }
405 
406 static inline void debug_work_deactivate(struct work_struct *work)
407 {
408 	debug_object_deactivate(work, &work_debug_descr);
409 }
410 
411 void __init_work(struct work_struct *work, int onstack)
412 {
413 	if (onstack)
414 		debug_object_init_on_stack(work, &work_debug_descr);
415 	else
416 		debug_object_init(work, &work_debug_descr);
417 }
418 EXPORT_SYMBOL_GPL(__init_work);
419 
420 void destroy_work_on_stack(struct work_struct *work)
421 {
422 	debug_object_free(work, &work_debug_descr);
423 }
424 EXPORT_SYMBOL_GPL(destroy_work_on_stack);
425 
426 #else
427 static inline void debug_work_activate(struct work_struct *work) { }
428 static inline void debug_work_deactivate(struct work_struct *work) { }
429 #endif
430 
431 /* Serializes the accesses to the list of workqueues. */
432 static DEFINE_SPINLOCK(workqueue_lock);
433 static LIST_HEAD(workqueues);
434 static bool workqueue_freezing;		/* W: have wqs started freezing? */
435 
436 /*
437  * The almighty global cpu workqueues.  nr_running is the only field
438  * which is expected to be used frequently by other cpus via
439  * try_to_wake_up().  Put it in a separate cacheline.
440  */
441 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
442 static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
443 
444 /*
445  * Global cpu workqueue and nr_running counter for unbound gcwq.  The
446  * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
447  * workers have WORKER_UNBOUND set.
448  */
449 static struct global_cwq unbound_global_cwq;
450 static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);	/* always 0 */
451 
452 static int worker_thread(void *__worker);
453 
454 static struct global_cwq *get_gcwq(unsigned int cpu)
455 {
456 	if (cpu != WORK_CPU_UNBOUND)
457 		return &per_cpu(global_cwq, cpu);
458 	else
459 		return &unbound_global_cwq;
460 }
461 
462 static atomic_t *get_gcwq_nr_running(unsigned int cpu)
463 {
464 	if (cpu != WORK_CPU_UNBOUND)
465 		return &per_cpu(gcwq_nr_running, cpu);
466 	else
467 		return &unbound_gcwq_nr_running;
468 }
469 
470 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
471 					    struct workqueue_struct *wq)
472 {
473 	if (!(wq->flags & WQ_UNBOUND)) {
474 		if (likely(cpu < nr_cpu_ids)) {
475 #ifdef CONFIG_SMP
476 			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
477 #else
478 			return wq->cpu_wq.single;
479 #endif
480 		}
481 	} else if (likely(cpu == WORK_CPU_UNBOUND))
482 		return wq->cpu_wq.single;
483 	return NULL;
484 }
485 
486 static unsigned int work_color_to_flags(int color)
487 {
488 	return color << WORK_STRUCT_COLOR_SHIFT;
489 }
490 
491 static int get_work_color(struct work_struct *work)
492 {
493 	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
494 		((1 << WORK_STRUCT_COLOR_BITS) - 1);
495 }
496 
497 static int work_next_color(int color)
498 {
499 	return (color + 1) % WORK_NR_COLORS;
500 }
501 
502 /*
503  * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
504  * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
505  * cleared and the work data contains the cpu number it was last on.
506  *
507  * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
508  * cwq, cpu or clear work->data.  These functions should only be
509  * called while the work is owned - ie. while the PENDING bit is set.
510  *
511  * get_work_[g]cwq() can be used to obtain the gcwq or cwq
512  * corresponding to a work.  gcwq is available once the work has been
513  * queued anywhere after initialization.  cwq is available only from
514  * queueing until execution starts.
515  */
516 static inline void set_work_data(struct work_struct *work, unsigned long data,
517 				 unsigned long flags)
518 {
519 	BUG_ON(!work_pending(work));
520 	atomic_long_set(&work->data, data | flags | work_static(work));
521 }
522 
523 static void set_work_cwq(struct work_struct *work,
524 			 struct cpu_workqueue_struct *cwq,
525 			 unsigned long extra_flags)
526 {
527 	set_work_data(work, (unsigned long)cwq,
528 		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
529 }
530 
531 static void set_work_cpu(struct work_struct *work, unsigned int cpu)
532 {
533 	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
534 }
535 
536 static void clear_work_data(struct work_struct *work)
537 {
538 	set_work_data(work, WORK_STRUCT_NO_CPU, 0);
539 }
540 
541 static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
542 {
543 	unsigned long data = atomic_long_read(&work->data);
544 
545 	if (data & WORK_STRUCT_CWQ)
546 		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
547 	else
548 		return NULL;
549 }
550 
551 static struct global_cwq *get_work_gcwq(struct work_struct *work)
552 {
553 	unsigned long data = atomic_long_read(&work->data);
554 	unsigned int cpu;
555 
556 	if (data & WORK_STRUCT_CWQ)
557 		return ((struct cpu_workqueue_struct *)
558 			(data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
559 
560 	cpu = data >> WORK_STRUCT_FLAG_BITS;
561 	if (cpu == WORK_CPU_NONE)
562 		return NULL;
563 
564 	BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
565 	return get_gcwq(cpu);
566 }
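
/*
 * Editor's note, an illustrative summary of the encoding above: while a
 * work is queued its data word is roughly
 *
 *	(unsigned long)cwq | WORK_STRUCT_CWQ | WORK_STRUCT_PENDING | ...
 *
 * and once execution starts set_work_cpu() turns it into
 *
 *	(cpu << WORK_STRUCT_FLAG_BITS) | WORK_STRUCT_PENDING
 *
 * so get_work_cwq() is meaningful only between queueing and execution,
 * while get_work_gcwq() keeps working afterwards via the recorded cpu.
 */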
567 
568 /*
569  * Policy functions.  These define the policies on how the global
570  * worker pool is managed.  Unless noted otherwise, these functions
571  * assume that they're being called with gcwq->lock held.
572  */
573 
574 static bool __need_more_worker(struct global_cwq *gcwq)
575 {
576 	return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
577 		gcwq->flags & GCWQ_HIGHPRI_PENDING;
578 }
579 
580 /*
581  * Need to wake up a worker?  Called from anything but currently
582  * running workers.
583  */
584 static bool need_more_worker(struct global_cwq *gcwq)
585 {
586 	return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
587 }
588 
589 /* Can I start working?  Called from busy but !running workers. */
590 static bool may_start_working(struct global_cwq *gcwq)
591 {
592 	return gcwq->nr_idle;
593 }
594 
595 /* Do I need to keep working?  Called from currently running workers. */
596 static bool keep_working(struct global_cwq *gcwq)
597 {
598 	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
599 
600 	return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
601 }
602 
603 /* Do we need a new worker?  Called from manager. */
604 static bool need_to_create_worker(struct global_cwq *gcwq)
605 {
606 	return need_more_worker(gcwq) && !may_start_working(gcwq);
607 }
608 
609 /* Do I need to be the manager? */
610 static bool need_to_manage_workers(struct global_cwq *gcwq)
611 {
612 	return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
613 }
614 
615 /* Do we have too many workers and should some go away? */
616 static bool too_many_workers(struct global_cwq *gcwq)
617 {
618 	bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
619 	int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
620 	int nr_busy = gcwq->nr_workers - nr_idle;
621 
622 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
623 }
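
/*
 * Editor's worked example for the check above: with MAX_IDLE_WORKERS_RATIO
 * of 4 and 16 busy workers, up to 5 idle workers are tolerated; a 6th idle
 * worker makes (6 - 2) * 4 >= 16 true, too_many_workers() returns true and
 * the idle timer / manager path starts reaping the surplus.
 */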
624 
625 /*
626  * Wake up functions.
627  */
628 
629 /* Return the first worker.  Safe with preemption disabled */
630 static struct worker *first_worker(struct global_cwq *gcwq)
631 {
632 	if (unlikely(list_empty(&gcwq->idle_list)))
633 		return NULL;
634 
635 	return list_first_entry(&gcwq->idle_list, struct worker, entry);
636 }
637 
638 /**
639  * wake_up_worker - wake up an idle worker
640  * @gcwq: gcwq to wake worker for
641  *
642  * Wake up the first idle worker of @gcwq.
643  *
644  * CONTEXT:
645  * spin_lock_irq(gcwq->lock).
646  */
647 static void wake_up_worker(struct global_cwq *gcwq)
648 {
649 	struct worker *worker = first_worker(gcwq);
650 
651 	if (likely(worker))
652 		wake_up_process(worker->task);
653 }
654 
655 /**
656  * wq_worker_waking_up - a worker is waking up
657  * @task: task waking up
658  * @cpu: CPU @task is waking up to
659  *
660  * This function is called during try_to_wake_up() when a worker is
661  * being awoken.
662  *
663  * CONTEXT:
664  * spin_lock_irq(rq->lock)
665  */
666 void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
667 {
668 	struct worker *worker = kthread_data(task);
669 
670 	if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
671 		atomic_inc(get_gcwq_nr_running(cpu));
672 }
673 
674 /**
675  * wq_worker_sleeping - a worker is going to sleep
676  * @task: task going to sleep
677  * @cpu: CPU in question, must be the current CPU number
678  *
679  * This function is called during schedule() when a busy worker is
680  * going to sleep.  A worker on the same cpu can be woken up by
681  * returning a pointer to its task.
682  *
683  * CONTEXT:
684  * spin_lock_irq(rq->lock)
685  *
686  * RETURNS:
687  * Worker task on @cpu to wake up, %NULL if none.
688  */
689 struct task_struct *wq_worker_sleeping(struct task_struct *task,
690 				       unsigned int cpu)
691 {
692 	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
693 	struct global_cwq *gcwq = get_gcwq(cpu);
694 	atomic_t *nr_running = get_gcwq_nr_running(cpu);
695 
696 	if (unlikely(worker->flags & WORKER_NOT_RUNNING))
697 		return NULL;
698 
699 	/* this can only happen on the local cpu */
700 	BUG_ON(cpu != raw_smp_processor_id());
701 
702 	/*
703 	 * The counterpart of the following dec_and_test, implied mb,
704 	 * worklist not empty test sequence is in insert_work().
705 	 * Please read comment there.
706 	 *
707 	 * NOT_RUNNING is clear.  This means that the trustee is not in
708 	 * charge and we're running on the local cpu with the rq lock held
709 	 * and preemption disabled, which in turn means that no one else
710 	 * could be manipulating idle_list, so dereferencing idle_list
711 	 * without the gcwq lock is safe.
712 	 */
713 	if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
714 		to_wakeup = first_worker(gcwq);
715 	return to_wakeup ? to_wakeup->task : NULL;
716 }
717 
718 /**
719  * worker_set_flags - set worker flags and adjust nr_running accordingly
720  * @worker: self
721  * @flags: flags to set
722  * @wakeup: wakeup an idle worker if necessary
723  *
724  * Set @flags in @worker->flags and adjust nr_running accordingly.  If
725  * nr_running becomes zero and @wakeup is %true, an idle worker is
726  * woken up.
727  *
728  * CONTEXT:
729  * spin_lock_irq(gcwq->lock)
730  */
731 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
732 				    bool wakeup)
733 {
734 	struct global_cwq *gcwq = worker->gcwq;
735 
736 	WARN_ON_ONCE(worker->task != current);
737 
738 	/*
739 	 * If transitioning into NOT_RUNNING, adjust nr_running and
740 	 * wake up an idle worker as necessary if requested by
741 	 * @wakeup.
742 	 */
743 	if ((flags & WORKER_NOT_RUNNING) &&
744 	    !(worker->flags & WORKER_NOT_RUNNING)) {
745 		atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
746 
747 		if (wakeup) {
748 			if (atomic_dec_and_test(nr_running) &&
749 			    !list_empty(&gcwq->worklist))
750 				wake_up_worker(gcwq);
751 		} else
752 			atomic_dec(nr_running);
753 	}
754 
755 	worker->flags |= flags;
756 }
757 
758 /**
759  * worker_clr_flags - clear worker flags and adjust nr_running accordingly
760  * @worker: self
761  * @flags: flags to clear
762  *
763  * Clear @flags in @worker->flags and adjust nr_running accordingly.
764  *
765  * CONTEXT:
766  * spin_lock_irq(gcwq->lock)
767  */
768 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
769 {
770 	struct global_cwq *gcwq = worker->gcwq;
771 	unsigned int oflags = worker->flags;
772 
773 	WARN_ON_ONCE(worker->task != current);
774 
775 	worker->flags &= ~flags;
776 
777 	/* if transitioning out of NOT_RUNNING, increment nr_running */
778 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
779 		if (!(worker->flags & WORKER_NOT_RUNNING))
780 			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
781 }
782 
783 /**
784  * busy_worker_head - return the busy hash head for a work
785  * @gcwq: gcwq of interest
786  * @work: work to be hashed
787  *
788  * Return hash head of @gcwq for @work.
789  *
790  * CONTEXT:
791  * spin_lock_irq(gcwq->lock).
792  *
793  * RETURNS:
794  * Pointer to the hash head.
795  */
796 static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
797 					   struct work_struct *work)
798 {
799 	const int base_shift = ilog2(sizeof(struct work_struct));
800 	unsigned long v = (unsigned long)work;
801 
802 	/* simple shift and fold hash, do we need something better? */
803 	v >>= base_shift;
804 	v += v >> BUSY_WORKER_HASH_ORDER;
805 	v &= BUSY_WORKER_HASH_MASK;
806 
807 	return &gcwq->busy_hash[v];
808 }
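
/*
 * Editor's illustrative sketch of the hash above, assuming a 64-bit build
 * where sizeof(struct work_struct) is 32 (no debug options) and thus
 * base_shift is 5:
 *
 *	v = (unsigned long)work >> 5;			drop always-zero low bits
 *	v = (v + (v >> 6)) & BUSY_WORKER_HASH_MASK;	fold and keep 6 bits
 *
 * i.e. a cheap shift-and-fold over the pointer, good enough for 64 buckets.
 */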
809 
810 /**
811  * __find_worker_executing_work - find worker which is executing a work
812  * @gcwq: gcwq of interest
813  * @bwh: hash head as returned by busy_worker_head()
814  * @work: work to find worker for
815  *
816  * Find a worker which is executing @work on @gcwq.  @bwh should be
817  * the hash head obtained by calling busy_worker_head() with the same
818  * work.
819  *
820  * CONTEXT:
821  * spin_lock_irq(gcwq->lock).
822  *
823  * RETURNS:
824  * Pointer to worker which is executing @work if found, NULL
825  * otherwise.
826  */
827 static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
828 						   struct hlist_head *bwh,
829 						   struct work_struct *work)
830 {
831 	struct worker *worker;
832 	struct hlist_node *tmp;
833 
834 	hlist_for_each_entry(worker, tmp, bwh, hentry)
835 		if (worker->current_work == work)
836 			return worker;
837 	return NULL;
838 }
839 
840 /**
841  * find_worker_executing_work - find worker which is executing a work
842  * @gcwq: gcwq of interest
843  * @work: work to find worker for
844  *
845  * Find a worker which is executing @work on @gcwq.  This function is
846  * identical to __find_worker_executing_work() except that this
847  * function calculates @bwh itself.
848  *
849  * CONTEXT:
850  * spin_lock_irq(gcwq->lock).
851  *
852  * RETURNS:
853  * Pointer to worker which is executing @work if found, NULL
854  * otherwise.
855  */
856 static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
857 						 struct work_struct *work)
858 {
859 	return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
860 					    work);
861 }
862 
863 /**
864  * gcwq_determine_ins_pos - find insertion position
865  * @gcwq: gcwq of interest
866  * @cwq: cwq a work is being queued for
867  *
868  * A work for @cwq is about to be queued on @gcwq, determine insertion
869  * position for the work.  If @cwq is for HIGHPRI wq, the work is
870  * queued at the head of the queue but in FIFO order with respect to
871  * other HIGHPRI works; otherwise, at the end of the queue.  This
872  * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
873  * there are HIGHPRI works pending.
874  *
875  * CONTEXT:
876  * spin_lock_irq(gcwq->lock).
877  *
878  * RETURNS:
879  * Pointer to insertion position.
880  */
881 static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
882 					       struct cpu_workqueue_struct *cwq)
883 {
884 	struct work_struct *twork;
885 
886 	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
887 		return &gcwq->worklist;
888 
889 	list_for_each_entry(twork, &gcwq->worklist, entry) {
890 		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
891 
892 		if (!(tcwq->wq->flags & WQ_HIGHPRI))
893 			break;
894 	}
895 
896 	gcwq->flags |= GCWQ_HIGHPRI_PENDING;
897 	return &twork->entry;
898 }
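
/*
 * Editor's illustrative sketch (the name "my_highpri_wq" is hypothetical):
 * a workqueue gets this head-of-queue treatment simply by being created
 * with WQ_HIGHPRI, e.g.
 *
 *	struct workqueue_struct *wq;
 *
 *	wq = alloc_workqueue("my_highpri_wq", WQ_HIGHPRI, 0);
 *
 * Works queued on such a wq jump ahead of normal works on the shared gcwq
 * worklist but stay FIFO among themselves.
 */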
899 
900 /**
901  * insert_work - insert a work into gcwq
902  * @cwq: cwq @work belongs to
903  * @work: work to insert
904  * @head: insertion point
905  * @extra_flags: extra WORK_STRUCT_* flags to set
906  *
907  * Insert @work which belongs to @cwq into @gcwq after @head.
908  * @extra_flags is or'd to work_struct flags.
909  *
910  * CONTEXT:
911  * spin_lock_irq(gcwq->lock).
912  */
913 static void insert_work(struct cpu_workqueue_struct *cwq,
914 			struct work_struct *work, struct list_head *head,
915 			unsigned int extra_flags)
916 {
917 	struct global_cwq *gcwq = cwq->gcwq;
918 
919 	/* we own @work, set data and link */
920 	set_work_cwq(work, cwq, extra_flags);
921 
922 	/*
923 	 * Ensure that we get the right work->data if we see the
924 	 * result of list_add() below, see try_to_grab_pending().
925 	 */
926 	smp_wmb();
927 
928 	list_add_tail(&work->entry, head);
929 
930 	/*
931 	 * Ensure either wq_worker_sleeping() sees the above
932 	 * list_add_tail() or we see zero nr_running to avoid workers
933 	 * lying around lazily while there are works to be processed.
934 	 */
935 	smp_mb();
936 
937 	if (__need_more_worker(gcwq))
938 		wake_up_worker(gcwq);
939 }
940 
941 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
942 			 struct work_struct *work)
943 {
944 	struct global_cwq *gcwq;
945 	struct cpu_workqueue_struct *cwq;
946 	struct list_head *worklist;
947 	unsigned int work_flags;
948 	unsigned long flags;
949 
950 	debug_work_activate(work);
951 
952 	if (WARN_ON_ONCE(wq->flags & WQ_DYING))
953 		return;
954 
955 	/* determine gcwq to use */
956 	if (!(wq->flags & WQ_UNBOUND)) {
957 		struct global_cwq *last_gcwq;
958 
959 		if (unlikely(cpu == WORK_CPU_UNBOUND))
960 			cpu = raw_smp_processor_id();
961 
962 		/*
963 		 * It's a multi-cpu workqueue.  If @wq is non-reentrant and @work
964 		 * was previously on a different cpu, it might still
965 		 * be running there, in which case the work needs to
966 		 * be queued on that cpu to guarantee non-reentrance.
967 		 */
968 		gcwq = get_gcwq(cpu);
969 		if (wq->flags & WQ_NON_REENTRANT &&
970 		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
971 			struct worker *worker;
972 
973 			spin_lock_irqsave(&last_gcwq->lock, flags);
974 
975 			worker = find_worker_executing_work(last_gcwq, work);
976 
977 			if (worker && worker->current_cwq->wq == wq)
978 				gcwq = last_gcwq;
979 			else {
980 				/* meh... not running there, queue here */
981 				spin_unlock_irqrestore(&last_gcwq->lock, flags);
982 				spin_lock_irqsave(&gcwq->lock, flags);
983 			}
984 		} else
985 			spin_lock_irqsave(&gcwq->lock, flags);
986 	} else {
987 		gcwq = get_gcwq(WORK_CPU_UNBOUND);
988 		spin_lock_irqsave(&gcwq->lock, flags);
989 	}
990 
991 	/* gcwq determined, get cwq and queue */
992 	cwq = get_cwq(gcwq->cpu, wq);
993 
994 	BUG_ON(!list_empty(&work->entry));
995 
996 	cwq->nr_in_flight[cwq->work_color]++;
997 	work_flags = work_color_to_flags(cwq->work_color);
998 
999 	if (likely(cwq->nr_active < cwq->max_active)) {
1000 		cwq->nr_active++;
1001 		worklist = gcwq_determine_ins_pos(gcwq, cwq);
1002 	} else {
1003 		work_flags |= WORK_STRUCT_DELAYED;
1004 		worklist = &cwq->delayed_works;
1005 	}
1006 
1007 	insert_work(cwq, work, worklist, work_flags);
1008 
1009 	spin_unlock_irqrestore(&gcwq->lock, flags);
1010 }
1011 
1012 /**
1013  * queue_work - queue work on a workqueue
1014  * @wq: workqueue to use
1015  * @work: work to queue
1016  *
1017  * Returns 0 if @work was already on a queue, non-zero otherwise.
1018  *
1019  * We queue the work to the CPU on which it was submitted, but if the CPU dies
1020  * it can be processed by another CPU.
1021  */
1022 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
1023 {
1024 	int ret;
1025 
1026 	ret = queue_work_on(get_cpu(), wq, work);
1027 	put_cpu();
1028 
1029 	return ret;
1030 }
1031 EXPORT_SYMBOL_GPL(queue_work);
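
/*
 * Editor's illustrative usage sketch, with hypothetical names my_work and
 * my_work_fn; not part of the original file:
 *
 *	static struct work_struct my_work;
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		pr_info("my_work_fn: running in process context\n");
 *	}
 *
 *	INIT_WORK(&my_work, my_work_fn);
 *	queue_work(system_wq, &my_work);
 *
 * The first queue_work() returns non-zero; another call before my_work_fn()
 * has started executing would find PENDING still set and return 0.
 */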
1032 
1033 /**
1034  * queue_work_on - queue work on specific cpu
1035  * @cpu: CPU number to execute work on
1036  * @wq: workqueue to use
1037  * @work: work to queue
1038  *
1039  * Returns 0 if @work was already on a queue, non-zero otherwise.
1040  *
1041  * We queue the work to a specific CPU; the caller must ensure that
1042  * the CPU can't go away.
1043  */
1044 int
1045 queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1046 {
1047 	int ret = 0;
1048 
1049 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1050 		__queue_work(cpu, wq, work);
1051 		ret = 1;
1052 	}
1053 	return ret;
1054 }
1055 EXPORT_SYMBOL_GPL(queue_work_on);
1056 
1057 static void delayed_work_timer_fn(unsigned long __data)
1058 {
1059 	struct delayed_work *dwork = (struct delayed_work *)__data;
1060 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1061 
1062 	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1063 }
1064 
1065 /**
1066  * queue_delayed_work - queue work on a workqueue after delay
1067  * @wq: workqueue to use
1068  * @dwork: delayable work to queue
1069  * @delay: number of jiffies to wait before queueing
1070  *
1071  * Returns 0 if @dwork was already on a queue, non-zero otherwise.
1072  */
1073 int queue_delayed_work(struct workqueue_struct *wq,
1074 			struct delayed_work *dwork, unsigned long delay)
1075 {
1076 	if (delay == 0)
1077 		return queue_work(wq, &dwork->work);
1078 
1079 	return queue_delayed_work_on(-1, wq, dwork, delay);
1080 }
1081 EXPORT_SYMBOL_GPL(queue_delayed_work);
1082 
1083 /**
1084  * queue_delayed_work_on - queue work on specific CPU after delay
1085  * @cpu: CPU number to execute work on
1086  * @wq: workqueue to use
1087  * @dwork: work to queue
1088  * @delay: number of jiffies to wait before queueing
1089  *
1090  * Returns 0 if @dwork was already on a queue, non-zero otherwise.
1091  */
1092 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1093 			struct delayed_work *dwork, unsigned long delay)
1094 {
1095 	int ret = 0;
1096 	struct timer_list *timer = &dwork->timer;
1097 	struct work_struct *work = &dwork->work;
1098 
1099 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1100 		unsigned int lcpu;
1101 
1102 		BUG_ON(timer_pending(timer));
1103 		BUG_ON(!list_empty(&work->entry));
1104 
1105 		timer_stats_timer_set_start_info(&dwork->timer);
1106 
1107 		/*
1108 		 * This stores cwq for the moment, for the timer_fn.
1109 		 * Note that the work's gcwq is preserved to allow
1110 		 * reentrance detection for delayed works.
1111 		 */
1112 		if (!(wq->flags & WQ_UNBOUND)) {
1113 			struct global_cwq *gcwq = get_work_gcwq(work);
1114 
1115 			if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1116 				lcpu = gcwq->cpu;
1117 			else
1118 				lcpu = raw_smp_processor_id();
1119 		} else
1120 			lcpu = WORK_CPU_UNBOUND;
1121 
1122 		set_work_cwq(work, get_cwq(lcpu, wq), 0);
1123 
1124 		timer->expires = jiffies + delay;
1125 		timer->data = (unsigned long)dwork;
1126 		timer->function = delayed_work_timer_fn;
1127 
1128 		if (unlikely(cpu >= 0))
1129 			add_timer_on(timer, cpu);
1130 		else
1131 			add_timer(timer);
1132 		ret = 1;
1133 	}
1134 	return ret;
1135 }
1136 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
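
/*
 * Editor's illustrative usage sketch, with hypothetical names my_dwork and
 * my_dwork_fn; not part of the original file:
 *
 *	static struct delayed_work my_dwork;
 *
 *	static void my_dwork_fn(struct work_struct *work)
 *	{
 *		pr_info("my_dwork_fn: at least HZ jiffies after queueing\n");
 *	}
 *
 *	INIT_DELAYED_WORK(&my_dwork, my_dwork_fn);
 *	queue_delayed_work(system_wq, &my_dwork, HZ);
 *
 * queue_delayed_work() passes cpu == -1, so the timer is armed with
 * add_timer() and may fire on any cpu; queue_delayed_work_on() pins it to
 * the given cpu via add_timer_on().
 */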
1137 
1138 /**
1139  * worker_enter_idle - enter idle state
1140  * @worker: worker which is entering idle state
1141  *
1142  * @worker is entering idle state.  Update stats and idle timer if
1143  * necessary.
1144  *
1145  * LOCKING:
1146  * spin_lock_irq(gcwq->lock).
1147  */
1148 static void worker_enter_idle(struct worker *worker)
1149 {
1150 	struct global_cwq *gcwq = worker->gcwq;
1151 
1152 	BUG_ON(worker->flags & WORKER_IDLE);
1153 	BUG_ON(!list_empty(&worker->entry) &&
1154 	       (worker->hentry.next || worker->hentry.pprev));
1155 
1156 	/* can't use worker_set_flags(), also called from start_worker() */
1157 	worker->flags |= WORKER_IDLE;
1158 	gcwq->nr_idle++;
1159 	worker->last_active = jiffies;
1160 
1161 	/* idle_list is LIFO */
1162 	list_add(&worker->entry, &gcwq->idle_list);
1163 
1164 	if (likely(!(worker->flags & WORKER_ROGUE))) {
1165 		if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1166 			mod_timer(&gcwq->idle_timer,
1167 				  jiffies + IDLE_WORKER_TIMEOUT);
1168 	} else
1169 		wake_up_all(&gcwq->trustee_wait);
1170 
1171 	/* sanity check nr_running */
1172 	WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1173 		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1174 }
1175 
1176 /**
1177  * worker_leave_idle - leave idle state
1178  * @worker: worker which is leaving idle state
1179  *
1180  * @worker is leaving idle state.  Update stats.
1181  *
1182  * LOCKING:
1183  * spin_lock_irq(gcwq->lock).
1184  */
1185 static void worker_leave_idle(struct worker *worker)
1186 {
1187 	struct global_cwq *gcwq = worker->gcwq;
1188 
1189 	BUG_ON(!(worker->flags & WORKER_IDLE));
1190 	worker_clr_flags(worker, WORKER_IDLE);
1191 	gcwq->nr_idle--;
1192 	list_del_init(&worker->entry);
1193 }
1194 
1195 /**
1196  * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1197  * @worker: self
1198  *
1199  * Works which are scheduled while the cpu is online must at least be
1200  * scheduled to a worker which is bound to the cpu so that if they are
1201  * flushed from cpu callbacks while cpu is going down, they are
1202  * guaranteed to execute on the cpu.
1203  *
1204  * This function is to be used by rogue workers and rescuers to bind
1205  * themselves to the target cpu and may race with cpu going down or
1206  * coming online.  kthread_bind() can't be used because it may put the
1207  * worker on an already dead cpu and set_cpus_allowed_ptr() can't be used
1208  * verbatim as it's best-effort, may block, and the gcwq may be
1209  * [dis]associated in the meantime.
1210  *
1211  * This function tries set_cpus_allowed_ptr(), locks the gcwq and verifies
1212  * the binding against GCWQ_DISASSOCIATED which is set during
1213  * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1214  * idle state or fetches works without dropping lock, it can guarantee
1215  * the scheduling requirement described in the first paragraph.
1216  *
1217  * CONTEXT:
1218  * Might sleep.  Called without any lock but returns with gcwq->lock
1219  * held.
1220  *
1221  * RETURNS:
1222  * %true if the associated gcwq is online (@worker is successfully
1223  * bound), %false if offline.
1224  */
1225 static bool worker_maybe_bind_and_lock(struct worker *worker)
1226 __acquires(&gcwq->lock)
1227 {
1228 	struct global_cwq *gcwq = worker->gcwq;
1229 	struct task_struct *task = worker->task;
1230 
1231 	while (true) {
1232 		/*
1233 		 * The following call may fail, succeed or succeed
1234 		 * without actually migrating the task to the cpu if
1235 		 * it races with cpu hotunplug operation.  Verify
1236 		 * against GCWQ_DISASSOCIATED.
1237 		 */
1238 		if (!(gcwq->flags & GCWQ_DISASSOCIATED))
1239 			set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
1240 
1241 		spin_lock_irq(&gcwq->lock);
1242 		if (gcwq->flags & GCWQ_DISASSOCIATED)
1243 			return false;
1244 		if (task_cpu(task) == gcwq->cpu &&
1245 		    cpumask_equal(&current->cpus_allowed,
1246 				  get_cpu_mask(gcwq->cpu)))
1247 			return true;
1248 		spin_unlock_irq(&gcwq->lock);
1249 
1250 		/* CPU has come up in between, retry migration */
1251 		cpu_relax();
1252 	}
1253 }
1254 
1255 /*
1256  * Function for worker->rebind_work used to rebind rogue busy workers
1257  * to the associated cpu which is coming back online.  This is
1258  * scheduled by cpu up but can race with other cpu hotplug operations
1259  * and may be executed twice without intervening cpu down.
1260  */
1261 static void worker_rebind_fn(struct work_struct *work)
1262 {
1263 	struct worker *worker = container_of(work, struct worker, rebind_work);
1264 	struct global_cwq *gcwq = worker->gcwq;
1265 
1266 	if (worker_maybe_bind_and_lock(worker))
1267 		worker_clr_flags(worker, WORKER_REBIND);
1268 
1269 	spin_unlock_irq(&gcwq->lock);
1270 }
1271 
1272 static struct worker *alloc_worker(void)
1273 {
1274 	struct worker *worker;
1275 
1276 	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1277 	if (worker) {
1278 		INIT_LIST_HEAD(&worker->entry);
1279 		INIT_LIST_HEAD(&worker->scheduled);
1280 		INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1281 		/* on creation a worker is in !idle && prep state */
1282 		worker->flags = WORKER_PREP;
1283 	}
1284 	return worker;
1285 }
1286 
1287 /**
1288  * create_worker - create a new workqueue worker
1289  * @gcwq: gcwq the new worker will belong to
1290  * @bind: whether to set affinity to @cpu or not
1291  *
1292  * Create a new worker which is bound to @gcwq.  The returned worker
1293  * can be started by calling start_worker() or destroyed using
1294  * destroy_worker().
1295  *
1296  * CONTEXT:
1297  * Might sleep.  Does GFP_KERNEL allocations.
1298  *
1299  * RETURNS:
1300  * Pointer to the newly created worker.
1301  */
1302 static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1303 {
1304 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1305 	struct worker *worker = NULL;
1306 	int id = -1;
1307 
1308 	spin_lock_irq(&gcwq->lock);
1309 	while (ida_get_new(&gcwq->worker_ida, &id)) {
1310 		spin_unlock_irq(&gcwq->lock);
1311 		if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1312 			goto fail;
1313 		spin_lock_irq(&gcwq->lock);
1314 	}
1315 	spin_unlock_irq(&gcwq->lock);
1316 
1317 	worker = alloc_worker();
1318 	if (!worker)
1319 		goto fail;
1320 
1321 	worker->gcwq = gcwq;
1322 	worker->id = id;
1323 
1324 	if (!on_unbound_cpu)
1325 		worker->task = kthread_create(worker_thread, worker,
1326 					      "kworker/%u:%d", gcwq->cpu, id);
1327 	else
1328 		worker->task = kthread_create(worker_thread, worker,
1329 					      "kworker/u:%d", id);
1330 	if (IS_ERR(worker->task))
1331 		goto fail;
1332 
1333 	/*
1334 	 * A rogue worker will become a regular one if CPU comes
1335 	 * online later on.  Make sure every worker has
1336 	 * PF_THREAD_BOUND set.
1337 	 */
1338 	if (bind && !on_unbound_cpu)
1339 		kthread_bind(worker->task, gcwq->cpu);
1340 	else {
1341 		worker->task->flags |= PF_THREAD_BOUND;
1342 		if (on_unbound_cpu)
1343 			worker->flags |= WORKER_UNBOUND;
1344 	}
1345 
1346 	return worker;
1347 fail:
1348 	if (id >= 0) {
1349 		spin_lock_irq(&gcwq->lock);
1350 		ida_remove(&gcwq->worker_ida, id);
1351 		spin_unlock_irq(&gcwq->lock);
1352 	}
1353 	kfree(worker);
1354 	return NULL;
1355 }
1356 
1357 /**
1358  * start_worker - start a newly created worker
1359  * @worker: worker to start
1360  *
1361  * Make the gcwq aware of @worker and start it.
1362  *
1363  * CONTEXT:
1364  * spin_lock_irq(gcwq->lock).
1365  */
1366 static void start_worker(struct worker *worker)
1367 {
1368 	worker->flags |= WORKER_STARTED;
1369 	worker->gcwq->nr_workers++;
1370 	worker_enter_idle(worker);
1371 	wake_up_process(worker->task);
1372 }
1373 
1374 /**
1375  * destroy_worker - destroy a workqueue worker
1376  * @worker: worker to be destroyed
1377  *
1378  * Destroy @worker and adjust @gcwq stats accordingly.
1379  *
1380  * CONTEXT:
1381  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1382  */
1383 static void destroy_worker(struct worker *worker)
1384 {
1385 	struct global_cwq *gcwq = worker->gcwq;
1386 	int id = worker->id;
1387 
1388 	/* sanity check frenzy */
1389 	BUG_ON(worker->current_work);
1390 	BUG_ON(!list_empty(&worker->scheduled));
1391 
1392 	if (worker->flags & WORKER_STARTED)
1393 		gcwq->nr_workers--;
1394 	if (worker->flags & WORKER_IDLE)
1395 		gcwq->nr_idle--;
1396 
1397 	list_del_init(&worker->entry);
1398 	worker->flags |= WORKER_DIE;
1399 
1400 	spin_unlock_irq(&gcwq->lock);
1401 
1402 	kthread_stop(worker->task);
1403 	kfree(worker);
1404 
1405 	spin_lock_irq(&gcwq->lock);
1406 	ida_remove(&gcwq->worker_ida, id);
1407 }
1408 
1409 static void idle_worker_timeout(unsigned long __gcwq)
1410 {
1411 	struct global_cwq *gcwq = (void *)__gcwq;
1412 
1413 	spin_lock_irq(&gcwq->lock);
1414 
1415 	if (too_many_workers(gcwq)) {
1416 		struct worker *worker;
1417 		unsigned long expires;
1418 
1419 		/* idle_list is kept in LIFO order, check the last one */
1420 		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1421 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1422 
1423 		if (time_before(jiffies, expires))
1424 			mod_timer(&gcwq->idle_timer, expires);
1425 		else {
1426 			/* it's been idle for too long, wake up manager */
1427 			gcwq->flags |= GCWQ_MANAGE_WORKERS;
1428 			wake_up_worker(gcwq);
1429 		}
1430 	}
1431 
1432 	spin_unlock_irq(&gcwq->lock);
1433 }
1434 
1435 static bool send_mayday(struct work_struct *work)
1436 {
1437 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1438 	struct workqueue_struct *wq = cwq->wq;
1439 	unsigned int cpu;
1440 
1441 	if (!(wq->flags & WQ_RESCUER))
1442 		return false;
1443 
1444 	/* mayday mayday mayday */
1445 	cpu = cwq->gcwq->cpu;
1446 	/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1447 	if (cpu == WORK_CPU_UNBOUND)
1448 		cpu = 0;
1449 	if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1450 		wake_up_process(wq->rescuer->task);
1451 	return true;
1452 }
1453 
1454 static void gcwq_mayday_timeout(unsigned long __gcwq)
1455 {
1456 	struct global_cwq *gcwq = (void *)__gcwq;
1457 	struct work_struct *work;
1458 
1459 	spin_lock_irq(&gcwq->lock);
1460 
1461 	if (need_to_create_worker(gcwq)) {
1462 		/*
1463 		 * We've been trying to create a new worker but
1464 		 * haven't been successful.  We might be hitting an
1465 		 * allocation deadlock.  Send distress signals to
1466 		 * rescuers.
1467 		 */
1468 		list_for_each_entry(work, &gcwq->worklist, entry)
1469 			send_mayday(work);
1470 	}
1471 
1472 	spin_unlock_irq(&gcwq->lock);
1473 
1474 	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1475 }
1476 
1477 /**
1478  * maybe_create_worker - create a new worker if necessary
1479  * @gcwq: gcwq to create a new worker for
1480  *
1481  * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
1482  * have at least one idle worker on return from this function.  If
1483  * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1484  * sent to all rescuers with works scheduled on @gcwq to resolve
1485  * possible allocation deadlock.
1486  *
1487  * On return, need_to_create_worker() is guaranteed to be false and
1488  * may_start_working() true.
1489  *
1490  * LOCKING:
1491  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1492  * multiple times.  Does GFP_KERNEL allocations.  Called only from
1493  * manager.
1494  *
1495  * RETURNS:
1496  * false if no action was taken and gcwq->lock stayed locked, true
1497  * otherwise.
1498  */
1499 static bool maybe_create_worker(struct global_cwq *gcwq)
1500 __releases(&gcwq->lock)
1501 __acquires(&gcwq->lock)
1502 {
1503 	if (!need_to_create_worker(gcwq))
1504 		return false;
1505 restart:
1506 	spin_unlock_irq(&gcwq->lock);
1507 
1508 	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1509 	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1510 
1511 	while (true) {
1512 		struct worker *worker;
1513 
1514 		worker = create_worker(gcwq, true);
1515 		if (worker) {
1516 			del_timer_sync(&gcwq->mayday_timer);
1517 			spin_lock_irq(&gcwq->lock);
1518 			start_worker(worker);
1519 			BUG_ON(need_to_create_worker(gcwq));
1520 			return true;
1521 		}
1522 
1523 		if (!need_to_create_worker(gcwq))
1524 			break;
1525 
1526 		__set_current_state(TASK_INTERRUPTIBLE);
1527 		schedule_timeout(CREATE_COOLDOWN);
1528 
1529 		if (!need_to_create_worker(gcwq))
1530 			break;
1531 	}
1532 
1533 	del_timer_sync(&gcwq->mayday_timer);
1534 	spin_lock_irq(&gcwq->lock);
1535 	if (need_to_create_worker(gcwq))
1536 		goto restart;
1537 	return true;
1538 }
1539 
1540 /**
1541  * maybe_destroy_workers - destroy workers which have been idle for a while
1542  * @gcwq: gcwq to destroy workers for
1543  *
1544  * Destroy @gcwq workers which have been idle for longer than
1545  * IDLE_WORKER_TIMEOUT.
1546  *
1547  * LOCKING:
1548  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1549  * multiple times.  Called only from manager.
1550  *
1551  * RETURNS:
1552  * false if no action was taken and gcwq->lock stayed locked, true
1553  * otherwise.
1554  */
1555 static bool maybe_destroy_workers(struct global_cwq *gcwq)
1556 {
1557 	bool ret = false;
1558 
1559 	while (too_many_workers(gcwq)) {
1560 		struct worker *worker;
1561 		unsigned long expires;
1562 
1563 		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1564 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1565 
1566 		if (time_before(jiffies, expires)) {
1567 			mod_timer(&gcwq->idle_timer, expires);
1568 			break;
1569 		}
1570 
1571 		destroy_worker(worker);
1572 		ret = true;
1573 	}
1574 
1575 	return ret;
1576 }
1577 
1578 /**
1579  * manage_workers - manage worker pool
1580  * @worker: self
1581  *
1582  * Assume the manager role and manage gcwq worker pool @worker belongs
1583  * to.  At any given time, there can be only zero or one manager per
1584  * gcwq.  The exclusion is handled automatically by this function.
1585  *
1586  * The caller can safely start processing works on false return.  On
1587  * true return, it's guaranteed that need_to_create_worker() is false
1588  * and may_start_working() is true.
1589  *
1590  * CONTEXT:
1591  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1592  * multiple times.  Does GFP_KERNEL allocations.
1593  *
1594  * RETURNS:
1595  * false if no action was taken and gcwq->lock stayed locked, true if
1596  * some action was taken.
1597  */
1598 static bool manage_workers(struct worker *worker)
1599 {
1600 	struct global_cwq *gcwq = worker->gcwq;
1601 	bool ret = false;
1602 
1603 	if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1604 		return ret;
1605 
1606 	gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1607 	gcwq->flags |= GCWQ_MANAGING_WORKERS;
1608 
1609 	/*
1610 	 * Destroy and then create so that may_start_working() is true
1611 	 * on return.
1612 	 */
1613 	ret |= maybe_destroy_workers(gcwq);
1614 	ret |= maybe_create_worker(gcwq);
1615 
1616 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1617 
1618 	/*
1619 	 * The trustee might be waiting to take over the manager
1620 	 * position, tell it we're done.
1621 	 */
1622 	if (unlikely(gcwq->trustee))
1623 		wake_up_all(&gcwq->trustee_wait);
1624 
1625 	return ret;
1626 }
1627 
1628 /**
1629  * move_linked_works - move linked works to a list
1630  * @work: start of series of works to be scheduled
1631  * @head: target list to append @work to
1632  * @nextp: out parameter for nested worklist walking
1633  *
1634  * Schedule linked works starting from @work to @head.  Work series to
1635  * be scheduled starts at @work and includes any consecutive work with
1636  * WORK_STRUCT_LINKED set in its predecessor.
1637  *
1638  * If @nextp is not NULL, it's updated to point to the next work of
1639  * the last scheduled work.  This allows move_linked_works() to be
1640  * nested inside outer list_for_each_entry_safe().
1641  *
1642  * CONTEXT:
1643  * spin_lock_irq(gcwq->lock).
1644  */
1645 static void move_linked_works(struct work_struct *work, struct list_head *head,
1646 			      struct work_struct **nextp)
1647 {
1648 	struct work_struct *n;
1649 
1650 	/*
1651 	 * Linked worklist will always end before the end of the list,
1652 	 * use NULL for list head.
1653 	 */
1654 	list_for_each_entry_safe_from(work, n, NULL, entry) {
1655 		list_move_tail(&work->entry, head);
1656 		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1657 			break;
1658 	}
1659 
1660 	/*
1661 	 * If we're already inside safe list traversal and have moved
1662 	 * multiple works to the scheduled queue, the next position
1663 	 * needs to be updated.
1664 	 */
1665 	if (nextp)
1666 		*nextp = n;
1667 }
1668 
1669 static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1670 {
1671 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
1672 						    struct work_struct, entry);
1673 	struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1674 
1675 	move_linked_works(work, pos, NULL);
1676 	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1677 	cwq->nr_active++;
1678 }
1679 
1680 /**
1681  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1682  * @cwq: cwq of interest
1683  * @color: color of work which left the queue
1684  * @delayed: for a delayed work
1685  *
1686  * A work either has completed or is removed from pending queue,
1687  * decrement nr_in_flight of its cwq and handle workqueue flushing.
1688  *
1689  * CONTEXT:
1690  * spin_lock_irq(gcwq->lock).
1691  */
1692 static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1693 				 bool delayed)
1694 {
1695 	/* ignore uncolored works */
1696 	if (color == WORK_NO_COLOR)
1697 		return;
1698 
1699 	cwq->nr_in_flight[color]--;
1700 
1701 	if (!delayed) {
1702 		cwq->nr_active--;
1703 		if (!list_empty(&cwq->delayed_works)) {
1704 			/* one down, submit a delayed one */
1705 			if (cwq->nr_active < cwq->max_active)
1706 				cwq_activate_first_delayed(cwq);
1707 		}
1708 	}
1709 
1710 	/* is flush in progress and are we at the flushing tip? */
1711 	if (likely(cwq->flush_color != color))
1712 		return;
1713 
1714 	/* are there still in-flight works? */
1715 	if (cwq->nr_in_flight[color])
1716 		return;
1717 
1718 	/* this cwq is done, clear flush_color */
1719 	cwq->flush_color = -1;
1720 
1721 	/*
1722 	 * If this was the last cwq, wake up the first flusher.  It
1723 	 * will handle the rest.
1724 	 */
1725 	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1726 		complete(&cwq->wq->first_flusher->done);
1727 }
1728 
1729 /**
1730  * process_one_work - process single work
1731  * @worker: self
1732  * @work: work to process
1733  *
1734  * Process @work.  This function contains all the logic necessary to
1735  * process a single work including synchronization against and
1736  * interaction with other workers on the same cpu, queueing and
1737  * flushing.  As long as context requirement is met, any worker can
1738  * call this function to process a work.
1739  *
1740  * CONTEXT:
1741  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1742  */
1743 static void process_one_work(struct worker *worker, struct work_struct *work)
1744 __releases(&gcwq->lock)
1745 __acquires(&gcwq->lock)
1746 {
1747 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1748 	struct global_cwq *gcwq = cwq->gcwq;
1749 	struct hlist_head *bwh = busy_worker_head(gcwq, work);
1750 	bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1751 	work_func_t f = work->func;
1752 	int work_color;
1753 	struct worker *collision;
1754 #ifdef CONFIG_LOCKDEP
1755 	/*
1756 	 * It is permissible to free the struct work_struct from
1757 	 * inside the function that is called from it, this we need to
1758 	 * take into account for lockdep too.  To avoid bogus "held
1759 	 * lock freed" warnings as well as problems when looking into
1760 	 * work->lockdep_map, make a copy and use that here.
1761 	 */
1762 	struct lockdep_map lockdep_map = work->lockdep_map;
1763 #endif
1764 	/*
1765 	 * A single work shouldn't be executed concurrently by
1766 	 * multiple workers on a single cpu.  Check whether anyone is
1767 	 * already processing the work.  If so, defer the work to the
1768 	 * currently executing one.
1769 	 */
1770 	collision = __find_worker_executing_work(gcwq, bwh, work);
1771 	if (unlikely(collision)) {
1772 		move_linked_works(work, &collision->scheduled, NULL);
1773 		return;
1774 	}
1775 
1776 	/* claim and process */
1777 	debug_work_deactivate(work);
1778 	hlist_add_head(&worker->hentry, bwh);
1779 	worker->current_work = work;
1780 	worker->current_cwq = cwq;
1781 	work_color = get_work_color(work);
1782 
1783 	/* record the current cpu number in the work data and dequeue */
1784 	set_work_cpu(work, gcwq->cpu);
1785 	list_del_init(&work->entry);
1786 
1787 	/*
1788 	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1789 	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1790 	 */
1791 	if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1792 		struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1793 						struct work_struct, entry);
1794 
1795 		if (!list_empty(&gcwq->worklist) &&
1796 		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1797 			wake_up_worker(gcwq);
1798 		else
1799 			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1800 	}
1801 
1802 	/*
1803 	 * CPU intensive works don't participate in concurrency
1804 	 * management.  They're the scheduler's responsibility.
1805 	 */
1806 	if (unlikely(cpu_intensive))
1807 		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1808 
1809 	spin_unlock_irq(&gcwq->lock);
1810 
1811 	work_clear_pending(work);
1812 	lock_map_acquire(&cwq->wq->lockdep_map);
1813 	lock_map_acquire(&lockdep_map);
1814 	trace_workqueue_execute_start(work);
1815 	f(work);
1816 	/*
1817 	 * While we must be careful to not use "work" after this, the trace
1818 	 * point will only record its address.
1819 	 */
1820 	trace_workqueue_execute_end(work);
1821 	lock_map_release(&lockdep_map);
1822 	lock_map_release(&cwq->wq->lockdep_map);
1823 
1824 	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1825 		printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1826 		       "%s/0x%08x/%d\n",
1827 		       current->comm, preempt_count(), task_pid_nr(current));
1828 		printk(KERN_ERR "    last function: ");
1829 		print_symbol("%s\n", (unsigned long)f);
1830 		debug_show_held_locks(current);
1831 		dump_stack();
1832 	}
1833 
1834 	spin_lock_irq(&gcwq->lock);
1835 
1836 	/* clear cpu intensive status */
1837 	if (unlikely(cpu_intensive))
1838 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1839 
1840 	/* we're done with it, release */
1841 	hlist_del_init(&worker->hentry);
1842 	worker->current_work = NULL;
1843 	worker->current_cwq = NULL;
1844 	cwq_dec_nr_in_flight(cwq, work_color, false);
1845 }
1846 
1847 /**
1848  * process_scheduled_works - process scheduled works
1849  * @worker: self
1850  *
1851  * Process all scheduled works.  Please note that the scheduled list
1852  * may change while processing a work, so this function repeatedly
1853  * fetches a work from the top and executes it.
1854  *
1855  * CONTEXT:
1856  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1857  * multiple times.
1858  */
1859 static void process_scheduled_works(struct worker *worker)
1860 {
1861 	while (!list_empty(&worker->scheduled)) {
1862 		struct work_struct *work = list_first_entry(&worker->scheduled,
1863 						struct work_struct, entry);
1864 		process_one_work(worker, work);
1865 	}
1866 }
1867 
1868 /**
1869  * worker_thread - the worker thread function
1870  * @__worker: self
1871  *
1872  * The gcwq worker thread function.  There's a single dynamic pool of
1873  * these per cpu.  These workers process all works regardless of
1874  * their specific target workqueue.  The only exception is works which
1875  * belong to workqueues with a rescuer, which will be explained in
1876  * rescuer_thread().
1877  */
1878 static int worker_thread(void *__worker)
1879 {
1880 	struct worker *worker = __worker;
1881 	struct global_cwq *gcwq = worker->gcwq;
1882 
1883 	/* tell the scheduler that this is a workqueue worker */
1884 	worker->task->flags |= PF_WQ_WORKER;
1885 woke_up:
1886 	spin_lock_irq(&gcwq->lock);
1887 
1888 	/* DIE can be set only while we're idle, checking here is enough */
1889 	if (worker->flags & WORKER_DIE) {
1890 		spin_unlock_irq(&gcwq->lock);
1891 		worker->task->flags &= ~PF_WQ_WORKER;
1892 		return 0;
1893 	}
1894 
1895 	worker_leave_idle(worker);
1896 recheck:
1897 	/* no more worker necessary? */
1898 	if (!need_more_worker(gcwq))
1899 		goto sleep;
1900 
1901 	/* do we need to manage? */
1902 	if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1903 		goto recheck;
1904 
1905 	/*
1906 	 * ->scheduled list can only be filled while a worker is
1907 	 * preparing to process a work or actually processing it.
1908 	 * Make sure nobody diddled with it while I was sleeping.
1909 	 */
1910 	BUG_ON(!list_empty(&worker->scheduled));
1911 
1912 	/*
1913 	 * When control reaches this point, we're guaranteed to have
1914 	 * at least one idle worker or that someone else has already
1915 	 * assumed the manager role.
1916 	 */
1917 	worker_clr_flags(worker, WORKER_PREP);
1918 
1919 	do {
1920 		struct work_struct *work =
1921 			list_first_entry(&gcwq->worklist,
1922 					 struct work_struct, entry);
1923 
1924 		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1925 			/* optimization path, not strictly necessary */
1926 			process_one_work(worker, work);
1927 			if (unlikely(!list_empty(&worker->scheduled)))
1928 				process_scheduled_works(worker);
1929 		} else {
1930 			move_linked_works(work, &worker->scheduled, NULL);
1931 			process_scheduled_works(worker);
1932 		}
1933 	} while (keep_working(gcwq));
1934 
1935 	worker_set_flags(worker, WORKER_PREP, false);
1936 sleep:
1937 	if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1938 		goto recheck;
1939 
1940 	/*
1941 	 * gcwq->lock is held and there's no work to process and no
1942 	 * need to manage, sleep.  Workers are woken up only while
1943 	 * holding gcwq->lock or from local cpu, so setting the
1944 	 * current state before releasing gcwq->lock is enough to
1945 	 * prevent losing any event.
1946 	 */
1947 	worker_enter_idle(worker);
1948 	__set_current_state(TASK_INTERRUPTIBLE);
1949 	spin_unlock_irq(&gcwq->lock);
1950 	schedule();
1951 	goto woke_up;
1952 }
1953 
1954 /**
1955  * rescuer_thread - the rescuer thread function
1956  * @__wq: the associated workqueue
1957  *
1958  * Workqueue rescuer thread function.  There's one rescuer for each
1959  * workqueue which has WQ_RESCUER set.
1960  *
1961  * Regular work processing on a gcwq may block trying to create a new
1962  * worker, which uses a GFP_KERNEL allocation that has a slight chance of
1963  * developing into a deadlock if some works currently on the same queue
1964  * need to be processed to satisfy the GFP_KERNEL allocation.  This is
1965  * the problem the rescuer solves.
1966  *
1967  * When such a condition is possible, the gcwq summons the rescuers of all
1968  * workqueues which have works queued on the gcwq and lets them process
1969  * those works so that forward progress can be guaranteed.
1970  *
1971  * This should happen rarely.
1972  */
1973 static int rescuer_thread(void *__wq)
1974 {
1975 	struct workqueue_struct *wq = __wq;
1976 	struct worker *rescuer = wq->rescuer;
1977 	struct list_head *scheduled = &rescuer->scheduled;
1978 	bool is_unbound = wq->flags & WQ_UNBOUND;
1979 	unsigned int cpu;
1980 
1981 	set_user_nice(current, RESCUER_NICE_LEVEL);
1982 repeat:
1983 	set_current_state(TASK_INTERRUPTIBLE);
1984 
1985 	if (kthread_should_stop())
1986 		return 0;
1987 
1988 	/*
1989 	 * See whether any cpu is asking for help.  Unbound
1990 	 * workqueues use cpu 0 in mayday_mask for WORK_CPU_UNBOUND.
1991 	 */
1992 	for_each_mayday_cpu(cpu, wq->mayday_mask) {
1993 		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
1994 		struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
1995 		struct global_cwq *gcwq = cwq->gcwq;
1996 		struct work_struct *work, *n;
1997 
1998 		__set_current_state(TASK_RUNNING);
1999 		mayday_clear_cpu(cpu, wq->mayday_mask);
2000 
2001 		/* migrate to the target cpu if possible */
2002 		rescuer->gcwq = gcwq;
2003 		worker_maybe_bind_and_lock(rescuer);
2004 
2005 		/*
2006 		 * Slurp in all works issued via this workqueue and
2007 		 * process'em.
2008 		 */
2009 		BUG_ON(!list_empty(&rescuer->scheduled));
2010 		list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2011 			if (get_work_cwq(work) == cwq)
2012 				move_linked_works(work, scheduled, &n);
2013 
2014 		process_scheduled_works(rescuer);
2015 		spin_unlock_irq(&gcwq->lock);
2016 	}
2017 
2018 	schedule();
2019 	goto repeat;
2020 }
2021 
2022 struct wq_barrier {
2023 	struct work_struct	work;
2024 	struct completion	done;
2025 };
2026 
2027 static void wq_barrier_func(struct work_struct *work)
2028 {
2029 	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2030 	complete(&barr->done);
2031 }
2032 
2033 /**
2034  * insert_wq_barrier - insert a barrier work
2035  * @cwq: cwq to insert barrier into
2036  * @barr: wq_barrier to insert
2037  * @target: target work to attach @barr to
2038  * @worker: worker currently executing @target, NULL if @target is not executing
2039  *
2040  * @barr is linked to @target such that @barr is completed only after
2041  * @target finishes execution.  Please note that the ordering
2042  * guarantee is observed only with respect to @target and on the local
2043  * cpu.
2044  *
2045  * Currently, a queued barrier can't be canceled.  This is because
2046  * try_to_grab_pending() can't determine whether the work to be
2047  * grabbed is at the head of the queue and thus can't clear the LINKED
2048  * flag of the previous work, while there must be a valid next work
2049  * after a work with the LINKED flag set.
2050  *
2051  * Note that when @worker is non-NULL, @target may be modified
2052  * underneath us, so we can't reliably determine cwq from @target.
2053  *
2054  * CONTEXT:
2055  * spin_lock_irq(gcwq->lock).
2056  */
2057 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2058 			      struct wq_barrier *barr,
2059 			      struct work_struct *target, struct worker *worker)
2060 {
2061 	struct list_head *head;
2062 	unsigned int linked = 0;
2063 
2064 	/*
2065 	 * debugobject calls are safe here even with gcwq->lock locked
2066 	 * as we know for sure that this will not trigger any of the
2067 	 * checks and call back into the fixup functions where we
2068 	 * might deadlock.
2069 	 */
2070 	INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
2071 	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2072 	init_completion(&barr->done);
2073 
2074 	/*
2075 	 * If @target is currently being executed, schedule the
2076 	 * barrier to the worker; otherwise, put it after @target.
2077 	 */
2078 	if (worker)
2079 		head = worker->scheduled.next;
2080 	else {
2081 		unsigned long *bits = work_data_bits(target);
2082 
2083 		head = target->entry.next;
2084 		/* there can already be other linked works, inherit and set */
2085 		linked = *bits & WORK_STRUCT_LINKED;
2086 		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
2087 	}
2088 
2089 	debug_work_activate(&barr->work);
2090 	insert_work(cwq, &barr->work, head,
2091 		    work_color_to_flags(WORK_NO_COLOR) | linked);
2092 }
2093 
2094 /**
2095  * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2096  * @wq: workqueue being flushed
2097  * @flush_color: new flush color, < 0 for no-op
2098  * @work_color: new work color, < 0 for no-op
2099  *
2100  * Prepare cwqs for workqueue flushing.
2101  *
2102  * If @flush_color is non-negative, flush_color on all cwqs should be
2103  * -1.  If no cwq has in-flight works at the specified color, all
2104  * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
2105  * has in-flight works, its cwq->flush_color is set to
2106  * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2107  * wakeup logic is armed and %true is returned.
2108  *
2109  * The caller should have initialized @wq->first_flusher prior to
2110  * calling this function with non-negative @flush_color.  If
2111  * @flush_color is negative, no flush color update is done and %false
2112  * is returned.
2113  *
2114  * If @work_color is non-negative, all cwqs should have the same
2115  * work_color which is previous to @work_color and all will be
2116  * advanced to @work_color.
2117  *
2118  * CONTEXT:
2119  * mutex_lock(wq->flush_mutex).
2120  *
2121  * RETURNS:
2122  * %true if @flush_color >= 0 and there's something to flush.  %false
2123  * otherwise.
2124  */
2125 static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2126 				      int flush_color, int work_color)
2127 {
2128 	bool wait = false;
2129 	unsigned int cpu;
2130 
2131 	if (flush_color >= 0) {
2132 		BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
2133 		atomic_set(&wq->nr_cwqs_to_flush, 1);
2134 	}
2135 
2136 	for_each_cwq_cpu(cpu, wq) {
2137 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2138 		struct global_cwq *gcwq = cwq->gcwq;
2139 
2140 		spin_lock_irq(&gcwq->lock);
2141 
2142 		if (flush_color >= 0) {
2143 			BUG_ON(cwq->flush_color != -1);
2144 
2145 			if (cwq->nr_in_flight[flush_color]) {
2146 				cwq->flush_color = flush_color;
2147 				atomic_inc(&wq->nr_cwqs_to_flush);
2148 				wait = true;
2149 			}
2150 		}
2151 
2152 		if (work_color >= 0) {
2153 			BUG_ON(work_color != work_next_color(cwq->work_color));
2154 			cwq->work_color = work_color;
2155 		}
2156 
2157 		spin_unlock_irq(&gcwq->lock);
2158 	}
2159 
2160 	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2161 		complete(&wq->first_flusher->done);
2162 
2163 	return wait;
2164 }
2165 
2166 /**
2167  * flush_workqueue - ensure that any scheduled work has run to completion.
2168  * @wq: workqueue to flush
2169  *
2170  * Forces execution of the workqueue and blocks until its completion.
2171  * This is typically used in driver shutdown handlers.
2172  *
2173  * We sleep until all works which were queued on entry have been handled,
2174  * but we are not livelocked by new incoming ones.
2175  */
2176 void flush_workqueue(struct workqueue_struct *wq)
2177 {
2178 	struct wq_flusher this_flusher = {
2179 		.list = LIST_HEAD_INIT(this_flusher.list),
2180 		.flush_color = -1,
2181 		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2182 	};
2183 	int next_color;
2184 
2185 	lock_map_acquire(&wq->lockdep_map);
2186 	lock_map_release(&wq->lockdep_map);
2187 
2188 	mutex_lock(&wq->flush_mutex);
2189 
2190 	/*
2191 	 * Start-to-wait phase
2192 	 */
2193 	next_color = work_next_color(wq->work_color);
2194 
2195 	if (next_color != wq->flush_color) {
2196 		/*
2197 		 * Color space is not full.  The current work_color
2198 		 * becomes our flush_color and work_color is advanced
2199 		 * by one.
2200 		 */
2201 		BUG_ON(!list_empty(&wq->flusher_overflow));
2202 		this_flusher.flush_color = wq->work_color;
2203 		wq->work_color = next_color;
2204 
2205 		if (!wq->first_flusher) {
2206 			/* no flush in progress, become the first flusher */
2207 			BUG_ON(wq->flush_color != this_flusher.flush_color);
2208 
2209 			wq->first_flusher = &this_flusher;
2210 
2211 			if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2212 						       wq->work_color)) {
2213 				/* nothing to flush, done */
2214 				wq->flush_color = next_color;
2215 				wq->first_flusher = NULL;
2216 				goto out_unlock;
2217 			}
2218 		} else {
2219 			/* wait in queue */
2220 			BUG_ON(wq->flush_color == this_flusher.flush_color);
2221 			list_add_tail(&this_flusher.list, &wq->flusher_queue);
2222 			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2223 		}
2224 	} else {
2225 		/*
2226 		 * Oops, color space is full, wait on overflow queue.
2227 		 * The next flush completion will assign us
2228 		 * flush_color and transfer to flusher_queue.
2229 		 */
2230 		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2231 	}
2232 
2233 	mutex_unlock(&wq->flush_mutex);
2234 
2235 	wait_for_completion(&this_flusher.done);
2236 
2237 	/*
2238 	 * Wake-up-and-cascade phase
2239 	 *
2240 	 * First flushers are responsible for cascading flushes and
2241 	 * handling overflow.  Non-first flushers can simply return.
2242 	 */
2243 	if (wq->first_flusher != &this_flusher)
2244 		return;
2245 
2246 	mutex_lock(&wq->flush_mutex);
2247 
2248 	/* we might have raced, check again with mutex held */
2249 	if (wq->first_flusher != &this_flusher)
2250 		goto out_unlock;
2251 
2252 	wq->first_flusher = NULL;
2253 
2254 	BUG_ON(!list_empty(&this_flusher.list));
2255 	BUG_ON(wq->flush_color != this_flusher.flush_color);
2256 
2257 	while (true) {
2258 		struct wq_flusher *next, *tmp;
2259 
2260 		/* complete all the flushers sharing the current flush color */
2261 		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2262 			if (next->flush_color != wq->flush_color)
2263 				break;
2264 			list_del_init(&next->list);
2265 			complete(&next->done);
2266 		}
2267 
2268 		BUG_ON(!list_empty(&wq->flusher_overflow) &&
2269 		       wq->flush_color != work_next_color(wq->work_color));
2270 
2271 		/* this flush_color is finished, advance by one */
2272 		wq->flush_color = work_next_color(wq->flush_color);
2273 
2274 		/* one color has been freed, handle overflow queue */
2275 		if (!list_empty(&wq->flusher_overflow)) {
2276 			/*
2277 			 * Assign the same color to all overflowed
2278 			 * flushers, advance work_color and append to
2279 			 * flusher_queue.  This is the start-to-wait
2280 			 * phase for these overflowed flushers.
2281 			 */
2282 			list_for_each_entry(tmp, &wq->flusher_overflow, list)
2283 				tmp->flush_color = wq->work_color;
2284 
2285 			wq->work_color = work_next_color(wq->work_color);
2286 
2287 			list_splice_tail_init(&wq->flusher_overflow,
2288 					      &wq->flusher_queue);
2289 			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2290 		}
2291 
2292 		if (list_empty(&wq->flusher_queue)) {
2293 			BUG_ON(wq->flush_color != wq->work_color);
2294 			break;
2295 		}
2296 
2297 		/*
2298 		 * Need to flush more colors.  Make the next flusher
2299 		 * the new first flusher and arm cwqs.
2300 		 */
2301 		BUG_ON(wq->flush_color == wq->work_color);
2302 		BUG_ON(wq->flush_color != next->flush_color);
2303 
2304 		list_del_init(&next->list);
2305 		wq->first_flusher = next;
2306 
2307 		if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2308 			break;
2309 
2310 		/*
2311 		 * Meh... this color is already done, clear first
2312 		 * flusher and repeat cascading.
2313 		 */
2314 		wq->first_flusher = NULL;
2315 	}
2316 
2317 out_unlock:
2318 	mutex_unlock(&wq->flush_mutex);
2319 }
2320 EXPORT_SYMBOL_GPL(flush_workqueue);
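/*
 * Illustrative sketch, not part of the original file: a driver that owns a
 * workqueue would typically flush it in its shutdown/remove path before
 * tearing down state the queued works depend on.  my_shutdown_flush() and
 * its @wq argument are hypothetical.
 */
static inline void my_shutdown_flush(struct workqueue_struct *wq)
{
	/* wait until every work queued so far on @wq has finished */
	flush_workqueue(wq);
}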
2321 
2322 /**
2323  * flush_work - block until a work_struct's callback has terminated
2324  * @work: the work which is to be flushed
2325  *
2326  * Returns false if @work has already terminated.
2327  *
2328  * It is expected that, prior to calling flush_work(), the caller has
2329  * arranged for the work to not be requeued, otherwise it doesn't make
2330  * sense to use this function.
2331  */
2332 int flush_work(struct work_struct *work)
2333 {
2334 	struct worker *worker = NULL;
2335 	struct global_cwq *gcwq;
2336 	struct cpu_workqueue_struct *cwq;
2337 	struct wq_barrier barr;
2338 
2339 	might_sleep();
2340 	gcwq = get_work_gcwq(work);
2341 	if (!gcwq)
2342 		return 0;
2343 
2344 	spin_lock_irq(&gcwq->lock);
2345 	if (!list_empty(&work->entry)) {
2346 		/*
2347 		 * See the comment near try_to_grab_pending()->smp_rmb().
2348 		 * If it was re-queued to a different gcwq under us, we
2349 		 * are not going to wait.
2350 		 */
2351 		smp_rmb();
2352 		cwq = get_work_cwq(work);
2353 		if (unlikely(!cwq || gcwq != cwq->gcwq))
2354 			goto already_gone;
2355 	} else {
2356 		worker = find_worker_executing_work(gcwq, work);
2357 		if (!worker)
2358 			goto already_gone;
2359 		cwq = worker->current_cwq;
2360 	}
2361 
2362 	insert_wq_barrier(cwq, &barr, work, worker);
2363 	spin_unlock_irq(&gcwq->lock);
2364 
2365 	lock_map_acquire(&cwq->wq->lockdep_map);
2366 	lock_map_release(&cwq->wq->lockdep_map);
2367 
2368 	wait_for_completion(&barr.done);
2369 	destroy_work_on_stack(&barr.work);
2370 	return 1;
2371 already_gone:
2372 	spin_unlock_irq(&gcwq->lock);
2373 	return 0;
2374 }
2375 EXPORT_SYMBOL_GPL(flush_work);
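/*
 * Illustrative sketch, not part of the original file: waiting for one
 * specific work item, e.g. before freeing the object embedding it.  The
 * caller must already have prevented @work from being requeued.
 * my_wait_for_work() is hypothetical.
 */
static inline void my_wait_for_work(struct work_struct *work)
{
	if (!flush_work(work))
		pr_debug("work %p had already terminated\n", work);
}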
2376 
2377 /*
2378  * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2379  * so this work can't be re-armed in any way.
2380  */
2381 static int try_to_grab_pending(struct work_struct *work)
2382 {
2383 	struct global_cwq *gcwq;
2384 	int ret = -1;
2385 
2386 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2387 		return 0;
2388 
2389 	/*
2390 	 * The queueing is in progress, or it is already queued. Try to
2391 	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2392 	 */
2393 	gcwq = get_work_gcwq(work);
2394 	if (!gcwq)
2395 		return ret;
2396 
2397 	spin_lock_irq(&gcwq->lock);
2398 	if (!list_empty(&work->entry)) {
2399 		/*
2400 		 * This work is queued, but perhaps we locked the wrong gcwq.
2401 		 * In that case we must see the new value after rmb(), see
2402 		 * insert_work()->wmb().
2403 		 */
2404 		smp_rmb();
2405 		if (gcwq == get_work_gcwq(work)) {
2406 			debug_work_deactivate(work);
2407 			list_del_init(&work->entry);
2408 			cwq_dec_nr_in_flight(get_work_cwq(work),
2409 				get_work_color(work),
2410 				*work_data_bits(work) & WORK_STRUCT_DELAYED);
2411 			ret = 1;
2412 		}
2413 	}
2414 	spin_unlock_irq(&gcwq->lock);
2415 
2416 	return ret;
2417 }
2418 
2419 static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2420 {
2421 	struct wq_barrier barr;
2422 	struct worker *worker;
2423 
2424 	spin_lock_irq(&gcwq->lock);
2425 
2426 	worker = find_worker_executing_work(gcwq, work);
2427 	if (unlikely(worker))
2428 		insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2429 
2430 	spin_unlock_irq(&gcwq->lock);
2431 
2432 	if (unlikely(worker)) {
2433 		wait_for_completion(&barr.done);
2434 		destroy_work_on_stack(&barr.work);
2435 	}
2436 }
2437 
2438 static void wait_on_work(struct work_struct *work)
2439 {
2440 	int cpu;
2441 
2442 	might_sleep();
2443 
2444 	lock_map_acquire(&work->lockdep_map);
2445 	lock_map_release(&work->lockdep_map);
2446 
2447 	for_each_gcwq_cpu(cpu)
2448 		wait_on_cpu_work(get_gcwq(cpu), work);
2449 }
2450 
2451 static int __cancel_work_timer(struct work_struct *work,
2452 				struct timer_list* timer)
2453 {
2454 	int ret;
2455 
2456 	do {
2457 		ret = (timer && likely(del_timer(timer)));
2458 		if (!ret)
2459 			ret = try_to_grab_pending(work);
2460 		wait_on_work(work);
2461 	} while (unlikely(ret < 0));
2462 
2463 	clear_work_data(work);
2464 	return ret;
2465 }
2466 
2467 /**
2468  * cancel_work_sync - block until a work_struct's callback has terminated
2469  * @work: the work which is to be flushed
2470  *
2471  * Returns true if @work was pending.
2472  *
2473  * cancel_work_sync() will cancel the work if it is queued. If the work's
2474  * callback appears to be running, cancel_work_sync() will block until it
2475  * has completed.
2476  *
2477  * It is possible to use this function if the work re-queues itself. It can
2478  * cancel the work even if it migrates to another workqueue; however, in that
2479  * case it only guarantees that work->func() has completed on the last queued
2480  * workqueue.
2481  *
2482  * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
2483  * pending, otherwise it goes into a busy-wait loop until the timer expires.
2484  *
2485  * The caller must ensure that workqueue_struct on which this work was last
2486  * queued can't be destroyed before this function returns.
2487  */
2488 int cancel_work_sync(struct work_struct *work)
2489 {
2490 	return __cancel_work_timer(work, NULL);
2491 }
2492 EXPORT_SYMBOL_GPL(cancel_work_sync);
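/*
 * Illustrative sketch, not part of the original file: typical teardown
 * ordering - stop new submissions first, then cancel_work_sync() so the
 * callback cannot run afterwards.  struct my_ctx and my_teardown() are
 * hypothetical.
 */
struct my_ctx {
	struct work_struct	work;
	bool			shutting_down;	/* checked before queueing */
};

static inline void my_teardown(struct my_ctx *ctx)
{
	ctx->shutting_down = true;
	/* a queued or running work is guaranteed finished after this */
	cancel_work_sync(&ctx->work);
}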
2493 
2494 /**
2495  * cancel_delayed_work_sync - reliably kill off a delayed work.
2496  * @dwork: the delayed work struct
2497  *
2498  * Returns true if @dwork was pending.
2499  *
2500  * It is possible to use this function if @dwork rearms itself via queue_work()
2501  * or queue_delayed_work(). See also the comment for cancel_work_sync().
2502  */
2503 int cancel_delayed_work_sync(struct delayed_work *dwork)
2504 {
2505 	return __cancel_work_timer(&dwork->work, &dwork->timer);
2506 }
2507 EXPORT_SYMBOL(cancel_delayed_work_sync);
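/*
 * Illustrative sketch, not part of the original file: stopping a
 * self-rearming delayed work, e.g. a polling routine, during driver
 * removal.  my_stop_polling() is hypothetical.
 */
static inline void my_stop_polling(struct delayed_work *poll_dwork)
{
	/* both the timer and the work are dealt with; nothing runs after this */
	cancel_delayed_work_sync(poll_dwork);
}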
2508 
2509 /**
2510  * schedule_work - put work task in global workqueue
2511  * @work: job to be done
2512  *
2513  * Returns zero if @work was already on the kernel-global workqueue and
2514  * non-zero otherwise.
2515  *
2516  * This puts a job in the kernel-global workqueue if it was not already
2517  * queued and leaves it in the same position on the kernel-global
2518  * workqueue otherwise.
2519  */
2520 int schedule_work(struct work_struct *work)
2521 {
2522 	return queue_work(system_wq, work);
2523 }
2524 EXPORT_SYMBOL(schedule_work);
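/*
 * Illustrative sketch, not part of the original file: deferring non-urgent
 * processing from atomic context to the kernel-global workqueue.
 * struct my_event, my_event_init() and my_event_raise() are hypothetical.
 */
struct my_event {
	struct work_struct	work;
	unsigned int		status;
};

static void my_event_fn(struct work_struct *work)
{
	struct my_event *ev = container_of(work, struct my_event, work);

	pr_debug("handling event, status=%u\n", ev->status);
}

static inline void my_event_init(struct my_event *ev)
{
	INIT_WORK(&ev->work, my_event_fn);
}

static inline void my_event_raise(struct my_event *ev, unsigned int status)
{
	ev->status = status;
	schedule_work(&ev->work);	/* no-op if already pending */
}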
2525 
2526 /**
2527  * schedule_work_on - put work task on a specific cpu
2528  * @cpu: cpu to put the work task on
2529  * @work: job to be done
2530  *
2531  * This puts a job on a specific cpu.
2532  */
2533 int schedule_work_on(int cpu, struct work_struct *work)
2534 {
2535 	return queue_work_on(cpu, system_wq, work);
2536 }
2537 EXPORT_SYMBOL(schedule_work_on);
2538 
2539 /**
2540  * schedule_delayed_work - put work task in global workqueue after delay
2541  * @dwork: job to be done
2542  * @delay: number of jiffies to wait or 0 for immediate execution
2543  *
2544  * After waiting for a given time this puts a job in the kernel-global
2545  * workqueue.
2546  */
2547 int schedule_delayed_work(struct delayed_work *dwork,
2548 					unsigned long delay)
2549 {
2550 	return queue_delayed_work(system_wq, dwork, delay);
2551 }
2552 EXPORT_SYMBOL(schedule_delayed_work);
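/*
 * Illustrative sketch, not part of the original file: periodic housekeeping
 * on the kernel-global workqueue, rearming itself roughly once a second.
 * my_hk_fn() and my_hk_start() are hypothetical.
 */
static void my_hk_fn(struct work_struct *work)
{
	pr_debug("periodic housekeeping\n");
	/* rearm ourselves to run again in about one second */
	schedule_delayed_work(to_delayed_work(work), HZ);
}

static inline void my_hk_start(struct delayed_work *dwork)
{
	INIT_DELAYED_WORK(dwork, my_hk_fn);
	schedule_delayed_work(dwork, HZ);
}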
2553 
2554 /**
2555  * flush_delayed_work - block until a delayed work's callback has terminated
2556  * @dwork: the delayed work which is to be flushed
2557  *
2558  * Any timeout is cancelled, and any pending work is run immediately.
2559  */
2560 void flush_delayed_work(struct delayed_work *dwork)
2561 {
2562 	if (del_timer_sync(&dwork->timer)) {
2563 		__queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
2564 			     &dwork->work);
2565 		put_cpu();
2566 	}
2567 	flush_work(&dwork->work);
2568 }
2569 EXPORT_SYMBOL(flush_delayed_work);
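/*
 * Illustrative sketch, not part of the original file: forcing a lazily
 * scheduled writeback to happen now, e.g. before suspend, instead of
 * waiting for its timer to expire.  my_sync_now() is hypothetical.
 */
static inline void my_sync_now(struct delayed_work *dwork)
{
	/* run the pending callback immediately (if any) and wait for it */
	flush_delayed_work(dwork);
}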
2570 
2571 /**
2572  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2573  * @cpu: cpu to use
2574  * @dwork: job to be done
2575  * @delay: number of jiffies to wait
2576  *
2577  * After waiting for a given time this puts a job in the kernel-global
2578  * workqueue on the specified CPU.
2579  */
2580 int schedule_delayed_work_on(int cpu,
2581 			struct delayed_work *dwork, unsigned long delay)
2582 {
2583 	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2584 }
2585 EXPORT_SYMBOL(schedule_delayed_work_on);
2586 
2587 /**
2588  * schedule_on_each_cpu - call a function on each online CPU from keventd
2589  * @func: the function to call
2590  *
2591  * Returns zero on success.
2592  * Returns a negative errno on failure.
2593  *
2594  * schedule_on_each_cpu() is very slow.
2595  */
2596 int schedule_on_each_cpu(work_func_t func)
2597 {
2598 	int cpu;
2599 	struct work_struct __percpu *works;
2600 
2601 	works = alloc_percpu(struct work_struct);
2602 	if (!works)
2603 		return -ENOMEM;
2604 
2605 	get_online_cpus();
2606 
2607 	for_each_online_cpu(cpu) {
2608 		struct work_struct *work = per_cpu_ptr(works, cpu);
2609 
2610 		INIT_WORK(work, func);
2611 		schedule_work_on(cpu, work);
2612 	}
2613 
2614 	for_each_online_cpu(cpu)
2615 		flush_work(per_cpu_ptr(works, cpu));
2616 
2617 	put_online_cpus();
2618 	free_percpu(works);
2619 	return 0;
2620 }
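/*
 * Illustrative sketch, not part of the original file: running a per-cpu
 * cache drain on every online CPU and waiting for all of them to finish.
 * my_drain_fn() and my_drain_all() are hypothetical.
 */
static void my_drain_fn(struct work_struct *work)
{
	pr_debug("draining local cpu caches\n");
}

static inline int my_drain_all(void)
{
	/* may sleep; returns 0 on success or -ENOMEM */
	return schedule_on_each_cpu(my_drain_fn);
}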
2621 
2622 /**
2623  * flush_scheduled_work - ensure that any scheduled work has run to completion.
2624  *
2625  * Forces execution of the kernel-global workqueue and blocks until its
2626  * completion.
2627  *
2628  * Think twice before calling this function!  It's very easy to get into
2629  * trouble if you don't take great care.  Either of the following situations
2630  * will lead to deadlock:
2631  *
2632  *	One of the work items currently on the workqueue needs to acquire
2633  *	a lock held by your code or its caller.
2634  *
2635  *	Your code is running in the context of a work routine.
2636  *
2637  * They will be detected by lockdep when they occur, but the first might not
2638  * occur very often.  It depends on what work items are on the workqueue and
2639  * what locks they need, which you have no control over.
2640  *
2641  * In most situations flushing the entire workqueue is overkill; you merely
2642  * need to know that a particular work item isn't queued and isn't running.
2643  * In such cases you should use cancel_delayed_work_sync() or
2644  * cancel_work_sync() instead.
2645  */
2646 void flush_scheduled_work(void)
2647 {
2648 	flush_workqueue(system_wq);
2649 }
2650 EXPORT_SYMBOL(flush_scheduled_work);
2651 
2652 /**
2653  * execute_in_process_context - reliably execute the routine with user context
2654  * @fn:		the function to execute
2655  * @ew:		guaranteed storage for the execute work structure (must
2656  *		be available when the work executes)
2657  *
2658  * Executes the function immediately if process context is available,
2659  * otherwise schedules the function for delayed execution.
2660  *
2661  * Returns:	0 - function was executed
2662  *		1 - function was scheduled for execution
2663  */
2664 int execute_in_process_context(work_func_t fn, struct execute_work *ew)
2665 {
2666 	if (!in_interrupt()) {
2667 		fn(&ew->work);
2668 		return 0;
2669 	}
2670 
2671 	INIT_WORK(&ew->work, fn);
2672 	schedule_work(&ew->work);
2673 
2674 	return 1;
2675 }
2676 EXPORT_SYMBOL_GPL(execute_in_process_context);
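/*
 * Illustrative sketch, not part of the original file: releasing an object
 * either immediately or via the global workqueue depending on the caller's
 * context.  struct my_obj, my_release_fn() and my_release() are
 * hypothetical.
 */
struct my_obj {
	struct execute_work	ew;
	/* ... payload ... */
};

static void my_release_fn(struct work_struct *work)
{
	struct my_obj *obj = container_of(work, struct my_obj, ew.work);

	kfree(obj);
}

static inline void my_release(struct my_obj *obj)
{
	/* runs my_release_fn() right away if possible, otherwise defers it */
	execute_in_process_context(my_release_fn, &obj->ew);
}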
2677 
2678 int keventd_up(void)
2679 {
2680 	return system_wq != NULL;
2681 }
2682 
2683 static int alloc_cwqs(struct workqueue_struct *wq)
2684 {
2685 	/*
2686 	 * cwqs are force-aligned according to WORK_STRUCT_FLAG_BITS.
2687 	 * Make sure that the alignment isn't lower than that of
2688 	 * unsigned long long.
2689 	 */
2690 	const size_t size = sizeof(struct cpu_workqueue_struct);
2691 	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2692 				   __alignof__(unsigned long long));
2693 #ifdef CONFIG_SMP
2694 	bool percpu = !(wq->flags & WQ_UNBOUND);
2695 #else
2696 	bool percpu = false;
2697 #endif
2698 
2699 	if (percpu)
2700 		wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2701 	else {
2702 		void *ptr;
2703 
2704 		/*
2705 		 * Allocate enough room to align cwq and put an extra
2706 		 * pointer at the end pointing back to the originally
2707 		 * allocated pointer, which will be used when freeing.
2708 		 */
2709 		ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2710 		if (ptr) {
2711 			wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2712 			*(void **)(wq->cpu_wq.single + 1) = ptr;
2713 		}
2714 	}
2715 
2716 	/* just in case, make sure it's actually aligned */
2717 	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2718 	return wq->cpu_wq.v ? 0 : -ENOMEM;
2719 }
2720 
2721 static void free_cwqs(struct workqueue_struct *wq)
2722 {
2723 #ifdef CONFIG_SMP
2724 	bool percpu = !(wq->flags & WQ_UNBOUND);
2725 #else
2726 	bool percpu = false;
2727 #endif
2728 
2729 	if (percpu)
2730 		free_percpu(wq->cpu_wq.pcpu);
2731 	else if (wq->cpu_wq.single) {
2732 		/* the pointer to free is stored right after the cwq */
2733 		kfree(*(void **)(wq->cpu_wq.single + 1));
2734 	}
2735 }
2736 
2737 static int wq_clamp_max_active(int max_active, unsigned int flags,
2738 			       const char *name)
2739 {
2740 	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
2741 
2742 	if (max_active < 1 || max_active > lim)
2743 		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
2744 		       "is out of range, clamping between %d and %d\n",
2745 		       max_active, name, 1, lim);
2746 
2747 	return clamp_val(max_active, 1, lim);
2748 }
2749 
2750 struct workqueue_struct *__alloc_workqueue_key(const char *name,
2751 					       unsigned int flags,
2752 					       int max_active,
2753 					       struct lock_class_key *key,
2754 					       const char *lock_name)
2755 {
2756 	struct workqueue_struct *wq;
2757 	unsigned int cpu;
2758 
2759 	/*
2760 	 * Unbound workqueues aren't concurrency managed and their works
2761 	 * should be dispatched to workers immediately.
2762 	 */
2763 	if (flags & WQ_UNBOUND)
2764 		flags |= WQ_HIGHPRI;
2765 
2766 	max_active = max_active ?: WQ_DFL_ACTIVE;
2767 	max_active = wq_clamp_max_active(max_active, flags, name);
2768 
2769 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2770 	if (!wq)
2771 		goto err;
2772 
2773 	wq->flags = flags;
2774 	wq->saved_max_active = max_active;
2775 	mutex_init(&wq->flush_mutex);
2776 	atomic_set(&wq->nr_cwqs_to_flush, 0);
2777 	INIT_LIST_HEAD(&wq->flusher_queue);
2778 	INIT_LIST_HEAD(&wq->flusher_overflow);
2779 
2780 	wq->name = name;
2781 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2782 	INIT_LIST_HEAD(&wq->list);
2783 
2784 	if (alloc_cwqs(wq) < 0)
2785 		goto err;
2786 
2787 	for_each_cwq_cpu(cpu, wq) {
2788 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2789 		struct global_cwq *gcwq = get_gcwq(cpu);
2790 
2791 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
2792 		cwq->gcwq = gcwq;
2793 		cwq->wq = wq;
2794 		cwq->flush_color = -1;
2795 		cwq->max_active = max_active;
2796 		INIT_LIST_HEAD(&cwq->delayed_works);
2797 	}
2798 
2799 	if (flags & WQ_RESCUER) {
2800 		struct worker *rescuer;
2801 
2802 		if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
2803 			goto err;
2804 
2805 		wq->rescuer = rescuer = alloc_worker();
2806 		if (!rescuer)
2807 			goto err;
2808 
2809 		rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
2810 		if (IS_ERR(rescuer->task))
2811 			goto err;
2812 
2813 		rescuer->task->flags |= PF_THREAD_BOUND;
2814 		wake_up_process(rescuer->task);
2815 	}
2816 
2817 	/*
2818 	 * workqueue_lock protects global freeze state and workqueues
2819 	 * list.  Grab it, set max_active accordingly and add the new
2820 	 * workqueue to the workqueues list.
2821 	 */
2822 	spin_lock(&workqueue_lock);
2823 
2824 	if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
2825 		for_each_cwq_cpu(cpu, wq)
2826 			get_cwq(cpu, wq)->max_active = 0;
2827 
2828 	list_add(&wq->list, &workqueues);
2829 
2830 	spin_unlock(&workqueue_lock);
2831 
2832 	return wq;
2833 err:
2834 	if (wq) {
2835 		free_cwqs(wq);
2836 		free_mayday_mask(wq->mayday_mask);
2837 		kfree(wq->rescuer);
2838 		kfree(wq);
2839 	}
2840 	return NULL;
2841 }
2842 EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
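/*
 * Illustrative sketch, not part of the original file: creating a dedicated
 * workqueue with a rescuer through the alloc_workqueue() wrapper, queueing
 * a work on it and tearing it down again.  my_demo_fn() and my_wq_demo()
 * are hypothetical.
 */
static void my_demo_fn(struct work_struct *work)
{
	pr_debug("running on a dedicated workqueue\n");
}

static inline int my_wq_demo(struct work_struct *work)
{
	struct workqueue_struct *wq;

	/* WQ_RESCUER guarantees forward progress under memory pressure */
	wq = alloc_workqueue("my_demo_wq", WQ_RESCUER, 0);
	if (!wq)
		return -ENOMEM;

	INIT_WORK(work, my_demo_fn);
	queue_work(wq, work);

	destroy_workqueue(wq);		/* flushes pending works first */
	return 0;
}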
2843 
2844 /**
2845  * destroy_workqueue - safely terminate a workqueue
2846  * @wq: target workqueue
2847  *
2848  * Safely destroy a workqueue. All work currently pending will be done first.
2849  */
2850 void destroy_workqueue(struct workqueue_struct *wq)
2851 {
2852 	unsigned int cpu;
2853 
2854 	wq->flags |= WQ_DYING;
2855 	flush_workqueue(wq);
2856 
2857 	/*
2858 	 * The wq list is used to freeze the wq; remove it from the list
2859 	 * after flushing is complete in case a freeze races us.
2860 	 */
2861 	spin_lock(&workqueue_lock);
2862 	list_del(&wq->list);
2863 	spin_unlock(&workqueue_lock);
2864 
2865 	/* sanity check */
2866 	for_each_cwq_cpu(cpu, wq) {
2867 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2868 		int i;
2869 
2870 		for (i = 0; i < WORK_NR_COLORS; i++)
2871 			BUG_ON(cwq->nr_in_flight[i]);
2872 		BUG_ON(cwq->nr_active);
2873 		BUG_ON(!list_empty(&cwq->delayed_works));
2874 	}
2875 
2876 	if (wq->flags & WQ_RESCUER) {
2877 		kthread_stop(wq->rescuer->task);
2878 		free_mayday_mask(wq->mayday_mask);
2879 		kfree(wq->rescuer);
2880 	}
2881 
2882 	free_cwqs(wq);
2883 	kfree(wq);
2884 }
2885 EXPORT_SYMBOL_GPL(destroy_workqueue);
2886 
2887 /**
2888  * workqueue_set_max_active - adjust max_active of a workqueue
2889  * @wq: target workqueue
2890  * @max_active: new max_active value.
2891  *
2892  * Set max_active of @wq to @max_active.
2893  *
2894  * CONTEXT:
2895  * Don't call from IRQ context.
2896  */
2897 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
2898 {
2899 	unsigned int cpu;
2900 
2901 	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
2902 
2903 	spin_lock(&workqueue_lock);
2904 
2905 	wq->saved_max_active = max_active;
2906 
2907 	for_each_cwq_cpu(cpu, wq) {
2908 		struct global_cwq *gcwq = get_gcwq(cpu);
2909 
2910 		spin_lock_irq(&gcwq->lock);
2911 
2912 		if (!(wq->flags & WQ_FREEZEABLE) ||
2913 		    !(gcwq->flags & GCWQ_FREEZING))
2914 			get_cwq(gcwq->cpu, wq)->max_active = max_active;
2915 
2916 		spin_unlock_irq(&gcwq->lock);
2917 	}
2918 
2919 	spin_unlock(&workqueue_lock);
2920 }
2921 EXPORT_SYMBOL_GPL(workqueue_set_max_active);
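/*
 * Illustrative sketch, not part of the original file: throttling a
 * workqueue to a single in-flight work per cpu while a device is degraded,
 * then restoring the default.  my_throttle() is hypothetical.
 */
static inline void my_throttle(struct workqueue_struct *wq, bool degraded)
{
	/* works beyond max_active wait on cwq->delayed_works */
	workqueue_set_max_active(wq, degraded ? 1 : WQ_DFL_ACTIVE);
}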
2922 
2923 /**
2924  * workqueue_congested - test whether a workqueue is congested
2925  * @cpu: CPU in question
2926  * @wq: target workqueue
2927  *
2928  * Test whether @wq's cpu workqueue for @cpu is congested.  There is
2929  * no synchronization around this function and the test result is
2930  * unreliable and only useful as advisory hints or for debugging.
2931  *
2932  * RETURNS:
2933  * %true if congested, %false otherwise.
2934  */
2935 bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
2936 {
2937 	struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2938 
2939 	return !list_empty(&cwq->delayed_works);
2940 }
2941 EXPORT_SYMBOL_GPL(workqueue_congested);
2942 
2943 /**
2944  * work_cpu - return the last known associated cpu for @work
2945  * @work: the work of interest
2946  *
2947  * RETURNS:
2948  * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise.
2949  */
2950 unsigned int work_cpu(struct work_struct *work)
2951 {
2952 	struct global_cwq *gcwq = get_work_gcwq(work);
2953 
2954 	return gcwq ? gcwq->cpu : WORK_CPU_NONE;
2955 }
2956 EXPORT_SYMBOL_GPL(work_cpu);
2957 
2958 /**
2959  * work_busy - test whether a work is currently pending or running
2960  * @work: the work to be tested
2961  *
2962  * Test whether @work is currently pending or running.  There is no
2963  * synchronization around this function and the test result is
2964  * unreliable and only useful as advisory hints or for debugging.
2965  * Especially for reentrant wqs, the pending state might hide the
2966  * running state.
2967  *
2968  * RETURNS:
2969  * OR'd bitmask of WORK_BUSY_* bits.
2970  */
2971 unsigned int work_busy(struct work_struct *work)
2972 {
2973 	struct global_cwq *gcwq = get_work_gcwq(work);
2974 	unsigned long flags;
2975 	unsigned int ret = 0;
2976 
2977 	if (!gcwq)
2978 		return false;
2979 
2980 	spin_lock_irqsave(&gcwq->lock, flags);
2981 
2982 	if (work_pending(work))
2983 		ret |= WORK_BUSY_PENDING;
2984 	if (find_worker_executing_work(gcwq, work))
2985 		ret |= WORK_BUSY_RUNNING;
2986 
2987 	spin_unlock_irqrestore(&gcwq->lock, flags);
2988 
2989 	return ret;
2990 }
2991 EXPORT_SYMBOL_GPL(work_busy);
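/*
 * Illustrative sketch, not part of the original file: using work_busy()
 * purely as a debugging hint, since the result is advisory only.
 * my_dump_work_state() is hypothetical.
 */
static inline void my_dump_work_state(struct work_struct *work)
{
	unsigned int busy = work_busy(work);

	pr_debug("work %p:%s%s\n", work,
		 busy & WORK_BUSY_PENDING ? " pending" : "",
		 busy & WORK_BUSY_RUNNING ? " running" : "");
}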
2992 
2993 /*
2994  * CPU hotplug.
2995  *
2996  * There are two challenges in supporting CPU hotplug.  Firstly, there
2997  * are a lot of assumptions on strong associations among work, cwq and
2998  * gcwq which make migrating pending and scheduled works very
2999  * difficult to implement without impacting hot paths.  Secondly,
3000  * gcwqs serve a mix of short, long and very long running works, making
3001  * blocked draining impractical.
3002  *
3003  * This is solved by allowing a gcwq to be detached from its CPU, running
3004  * it with unbound (rogue) workers and allowing it to be reattached
3005  * later if the cpu comes back online.  A separate thread is created
3006  * to govern a gcwq in such state and is called the trustee of the
3007  * gcwq.
3008  *
3009  * Trustee states and their descriptions.
3010  *
3011  * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
3012  *		new trustee is started with this state.
3013  *
3014  * IN_CHARGE	Once started, trustee will enter this state after
3015  *		assuming the manager role and making all existing
3016  *		workers rogue.  DOWN_PREPARE waits for trustee to
3017  *		enter this state.  After reaching IN_CHARGE, trustee
3018  *		tries to execute the pending worklist until it's empty
3019  *		and the state is set to BUTCHER, or the state is set
3020  *		to RELEASE.
3021  *
3022  * BUTCHER	Command state which is set by the cpu callback after
3023  *		the cpu has gone down.  Once this state is set, the trustee
3024  *		knows that there will be no new works on the worklist
3025  *		and once the worklist is empty it can proceed to
3026  *		killing idle workers.
3027  *
3028  * RELEASE	Command state which is set by the cpu callback if the
3029  *		cpu down has been canceled or it has come online
3030  *		again.  After recognizing this state, trustee stops
3031  *		trying to drain or butcher and clears ROGUE, rebinds
3032  *		all remaining workers back to the cpu and releases
3033  *		manager role.
3034  *
3035  * DONE		Trustee will enter this state after BUTCHER or RELEASE
3036  *		is complete.
3037  *
3038  *          trustee                 CPU                draining
3039  *         took over                down               complete
3040  * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3041  *                        |                     |                  ^
3042  *                        | CPU is back online  v   return workers |
3043  *                         ----------------> RELEASE --------------
3044  */
3045 
3046 /**
3047  * trustee_wait_event_timeout - timed event wait for trustee
3048  * @cond: condition to wait for
3049  * @timeout: timeout in jiffies
3050  *
3051  * wait_event_timeout() for trustee to use.  Handles locking and
3052  * checks for RELEASE request.
3053  *
3054  * CONTEXT:
3055  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3056  * multiple times.  To be used by trustee.
3057  *
3058  * RETURNS:
3059  * Positive indicating remaining time if @cond is satisfied, 0 if timed
3060  * out, -1 if canceled.
3061  */
3062 #define trustee_wait_event_timeout(cond, timeout) ({			\
3063 	long __ret = (timeout);						\
3064 	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
3065 	       __ret) {							\
3066 		spin_unlock_irq(&gcwq->lock);				\
3067 		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
3068 			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
3069 			__ret);						\
3070 		spin_lock_irq(&gcwq->lock);				\
3071 	}								\
3072 	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
3073 })
3074 
3075 /**
3076  * trustee_wait_event - event wait for trustee
3077  * @cond: condition to wait for
3078  *
3079  * wait_event() for trustee to use.  Automatically handles locking and
3080  * checks for RELEASE request.
3081  *
3082  * CONTEXT:
3083  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3084  * multiple times.  To be used by trustee.
3085  *
3086  * RETURNS:
3087  * 0 if @cond is satisfied, -1 if canceled.
3088  */
3089 #define trustee_wait_event(cond) ({					\
3090 	long __ret1;							\
3091 	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3092 	__ret1 < 0 ? -1 : 0;						\
3093 })
3094 
3095 static int __cpuinit trustee_thread(void *__gcwq)
3096 {
3097 	struct global_cwq *gcwq = __gcwq;
3098 	struct worker *worker;
3099 	struct work_struct *work;
3100 	struct hlist_node *pos;
3101 	long rc;
3102 	int i;
3103 
3104 	BUG_ON(gcwq->cpu != smp_processor_id());
3105 
3106 	spin_lock_irq(&gcwq->lock);
3107 	/*
3108 	 * Claim the manager position and make all workers rogue.
3109 	 * Trustee must be bound to the target cpu and can't be
3110 	 * cancelled.
3111 	 */
3112 	BUG_ON(gcwq->cpu != smp_processor_id());
3113 	rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3114 	BUG_ON(rc < 0);
3115 
3116 	gcwq->flags |= GCWQ_MANAGING_WORKERS;
3117 
3118 	list_for_each_entry(worker, &gcwq->idle_list, entry)
3119 		worker->flags |= WORKER_ROGUE;
3120 
3121 	for_each_busy_worker(worker, i, pos, gcwq)
3122 		worker->flags |= WORKER_ROGUE;
3123 
3124 	/*
3125 	 * Call schedule() so that we cross rq->lock and thus can
3126 	 * guarantee sched callbacks see the rogue flag.  This is
3127 	 * necessary as scheduler callbacks may be invoked from other
3128 	 * cpus.
3129 	 */
3130 	spin_unlock_irq(&gcwq->lock);
3131 	schedule();
3132 	spin_lock_irq(&gcwq->lock);
3133 
3134 	/*
3135 	 * Sched callbacks are disabled now.  Zap nr_running.  After
3136 	 * this, nr_running stays zero and need_more_worker() and
3137 	 * keep_working() are always true as long as the worklist is
3138 	 * not empty.
3139 	 */
3140 	atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3141 
3142 	spin_unlock_irq(&gcwq->lock);
3143 	del_timer_sync(&gcwq->idle_timer);
3144 	spin_lock_irq(&gcwq->lock);
3145 
3146 	/*
3147 	 * We're now in charge.  Notify and proceed to drain.  We need
3148 	 * to keep the gcwq running during the whole CPU down
3149 	 * procedure as other cpu hotunplug callbacks may need to
3150 	 * flush currently running tasks.
3151 	 */
3152 	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3153 	wake_up_all(&gcwq->trustee_wait);
3154 
3155 	/*
3156 	 * The original cpu is in the process of dying and may go away
3157 	 * anytime now.  When that happens, we and all workers would
3158 	 * be migrated to other cpus.  Try draining any left work.  We
3159 	 * be migrated to other cpus.  Try draining any remaining work.  We
3160 	 * many idlers as necessary and create new ones till the
3161 	 * worklist is empty.  Note that if the gcwq is frozen, there
3162 	 * may be frozen works in freezeable cwqs.  Don't declare
3163 	 * completion while frozen.
3164 	 */
3165 	while (gcwq->nr_workers != gcwq->nr_idle ||
3166 	       gcwq->flags & GCWQ_FREEZING ||
3167 	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3168 		int nr_works = 0;
3169 
3170 		list_for_each_entry(work, &gcwq->worklist, entry) {
3171 			send_mayday(work);
3172 			nr_works++;
3173 		}
3174 
3175 		list_for_each_entry(worker, &gcwq->idle_list, entry) {
3176 			if (!nr_works--)
3177 				break;
3178 			wake_up_process(worker->task);
3179 		}
3180 
3181 		if (need_to_create_worker(gcwq)) {
3182 			spin_unlock_irq(&gcwq->lock);
3183 			worker = create_worker(gcwq, false);
3184 			spin_lock_irq(&gcwq->lock);
3185 			if (worker) {
3186 				worker->flags |= WORKER_ROGUE;
3187 				start_worker(worker);
3188 			}
3189 		}
3190 
3191 		/* give a breather */
3192 		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3193 			break;
3194 	}
3195 
3196 	/*
3197 	 * Either all works have been scheduled and cpu is down, or
3198 	 * cpu down has already been canceled.  Wait for and butcher
3199 	 * all workers till we're canceled.
3200 	 */
3201 	do {
3202 		rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3203 		while (!list_empty(&gcwq->idle_list))
3204 			destroy_worker(list_first_entry(&gcwq->idle_list,
3205 							struct worker, entry));
3206 	} while (gcwq->nr_workers && rc >= 0);
3207 
3208 	/*
3209 	 * At this point, either draining has completed and no worker
3210 	 * is left, or cpu down has been canceled or the cpu is being
3211 	 * brought back up.  There shouldn't be any idle one left.
3212 	 * Tell each remaining busy one to rebind once it finishes its
3213 	 * currently scheduled works by scheduling the rebind_work.
3214 	 */
3215 	WARN_ON(!list_empty(&gcwq->idle_list));
3216 
3217 	for_each_busy_worker(worker, i, pos, gcwq) {
3218 		struct work_struct *rebind_work = &worker->rebind_work;
3219 
3220 		/*
3221 		 * Rebind_work may race with future cpu hotplug
3222 		 * operations.  Use a separate flag to mark that
3223 		 * rebinding is scheduled.
3224 		 */
3225 		worker->flags |= WORKER_REBIND;
3226 		worker->flags &= ~WORKER_ROGUE;
3227 
3228 		/* queue rebind_work, wq doesn't matter, use the default one */
3229 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3230 				     work_data_bits(rebind_work)))
3231 			continue;
3232 
3233 		debug_work_activate(rebind_work);
3234 		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3235 			    worker->scheduled.next,
3236 			    work_color_to_flags(WORK_NO_COLOR));
3237 	}
3238 
3239 	/* relinquish manager role */
3240 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3241 
3242 	/* notify completion */
3243 	gcwq->trustee = NULL;
3244 	gcwq->trustee_state = TRUSTEE_DONE;
3245 	wake_up_all(&gcwq->trustee_wait);
3246 	spin_unlock_irq(&gcwq->lock);
3247 	return 0;
3248 }
3249 
3250 /**
3251  * wait_trustee_state - wait for trustee to enter the specified state
3252  * @gcwq: gcwq the trustee of interest belongs to
3253  * @state: target state to wait for
3254  *
3255  * Wait for the trustee to reach @state.  DONE is already matched.
3256  *
3257  * CONTEXT:
3258  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3259  * multiple times.  To be used by cpu_callback.
3260  */
3261 static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3262 __releases(&gcwq->lock)
3263 __acquires(&gcwq->lock)
3264 {
3265 	if (!(gcwq->trustee_state == state ||
3266 	      gcwq->trustee_state == TRUSTEE_DONE)) {
3267 		spin_unlock_irq(&gcwq->lock);
3268 		__wait_event(gcwq->trustee_wait,
3269 			     gcwq->trustee_state == state ||
3270 			     gcwq->trustee_state == TRUSTEE_DONE);
3271 		spin_lock_irq(&gcwq->lock);
3272 	}
3273 }
3274 
3275 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3276 						unsigned long action,
3277 						void *hcpu)
3278 {
3279 	unsigned int cpu = (unsigned long)hcpu;
3280 	struct global_cwq *gcwq = get_gcwq(cpu);
3281 	struct task_struct *new_trustee = NULL;
3282 	struct worker *uninitialized_var(new_worker);
3283 	unsigned long flags;
3284 
3285 	action &= ~CPU_TASKS_FROZEN;
3286 
3287 	switch (action) {
3288 	case CPU_DOWN_PREPARE:
3289 		new_trustee = kthread_create(trustee_thread, gcwq,
3290 					     "workqueue_trustee/%d", cpu);
3291 		if (IS_ERR(new_trustee))
3292 			return notifier_from_errno(PTR_ERR(new_trustee));
3293 		kthread_bind(new_trustee, cpu);
3294 		/* fall through */
3295 	case CPU_UP_PREPARE:
3296 		BUG_ON(gcwq->first_idle);
3297 		new_worker = create_worker(gcwq, false);
3298 		if (!new_worker) {
3299 			if (new_trustee)
3300 				kthread_stop(new_trustee);
3301 			return NOTIFY_BAD;
3302 		}
3303 	}
3304 
3305 	/* some are called w/ irq disabled, don't disturb irq status */
3306 	spin_lock_irqsave(&gcwq->lock, flags);
3307 
3308 	switch (action) {
3309 	case CPU_DOWN_PREPARE:
3310 		/* initialize trustee and tell it to acquire the gcwq */
3311 		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3312 		gcwq->trustee = new_trustee;
3313 		gcwq->trustee_state = TRUSTEE_START;
3314 		wake_up_process(gcwq->trustee);
3315 		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3316 		/* fall through */
3317 	case CPU_UP_PREPARE:
3318 		BUG_ON(gcwq->first_idle);
3319 		gcwq->first_idle = new_worker;
3320 		break;
3321 
3322 	case CPU_DYING:
3323 		/*
3324 		 * Before this, the trustee and all workers except for
3325 		 * the ones which are still executing works from
3326 		 * before the last CPU down must be on the cpu.  After
3327 		 * this, they'll all be diasporas.
3328 		 */
3329 		gcwq->flags |= GCWQ_DISASSOCIATED;
3330 		break;
3331 
3332 	case CPU_POST_DEAD:
3333 		gcwq->trustee_state = TRUSTEE_BUTCHER;
3334 		/* fall through */
3335 	case CPU_UP_CANCELED:
3336 		destroy_worker(gcwq->first_idle);
3337 		gcwq->first_idle = NULL;
3338 		break;
3339 
3340 	case CPU_DOWN_FAILED:
3341 	case CPU_ONLINE:
3342 		gcwq->flags &= ~GCWQ_DISASSOCIATED;
3343 		if (gcwq->trustee_state != TRUSTEE_DONE) {
3344 			gcwq->trustee_state = TRUSTEE_RELEASE;
3345 			wake_up_process(gcwq->trustee);
3346 			wait_trustee_state(gcwq, TRUSTEE_DONE);
3347 		}
3348 
3349 		/*
3350 		 * Trustee is done and there might be no worker left.
3351 		 * Put the first_idle in and request a real manager to
3352 		 * take a look.
3353 		 */
3354 		spin_unlock_irq(&gcwq->lock);
3355 		kthread_bind(gcwq->first_idle->task, cpu);
3356 		spin_lock_irq(&gcwq->lock);
3357 		gcwq->flags |= GCWQ_MANAGE_WORKERS;
3358 		start_worker(gcwq->first_idle);
3359 		gcwq->first_idle = NULL;
3360 		break;
3361 	}
3362 
3363 	spin_unlock_irqrestore(&gcwq->lock, flags);
3364 
3365 	return notifier_from_errno(0);
3366 }
3367 
3368 #ifdef CONFIG_SMP
3369 
3370 struct work_for_cpu {
3371 	struct completion completion;
3372 	long (*fn)(void *);
3373 	void *arg;
3374 	long ret;
3375 };
3376 
3377 static int do_work_for_cpu(void *_wfc)
3378 {
3379 	struct work_for_cpu *wfc = _wfc;
3380 	wfc->ret = wfc->fn(wfc->arg);
3381 	complete(&wfc->completion);
3382 	return 0;
3383 }
3384 
3385 /**
3386  * work_on_cpu - run a function in user context on a particular cpu
3387  * @cpu: the cpu to run on
3388  * @fn: the function to run
3389  * @arg: the function arg
3390  *
3391  * This will return the value @fn returns.
3392  * It is up to the caller to ensure that the cpu doesn't go offline.
3393  * The caller must not hold any locks which would prevent @fn from completing.
3394  */
3395 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
3396 {
3397 	struct task_struct *sub_thread;
3398 	struct work_for_cpu wfc = {
3399 		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
3400 		.fn = fn,
3401 		.arg = arg,
3402 	};
3403 
3404 	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
3405 	if (IS_ERR(sub_thread))
3406 		return PTR_ERR(sub_thread);
3407 	kthread_bind(sub_thread, cpu);
3408 	wake_up_process(sub_thread);
3409 	wait_for_completion(&wfc.completion);
3410 	return wfc.ret;
3411 }
3412 EXPORT_SYMBOL_GPL(work_on_cpu);
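/*
 * Illustrative sketch, not part of the original file: performing a cpu-local
 * access by forcing it to run on the target cpu.  my_read_fn() and
 * my_read_on_cpu() are hypothetical; the caller must keep @cpu online,
 * e.g. via get_online_cpus().
 */
static long my_read_fn(void *arg)
{
	unsigned int *val = arg;

	*val = 42;		/* stand-in for a per-cpu hardware access */
	return 0;
}

static inline long my_read_on_cpu(unsigned int cpu, unsigned int *val)
{
	return work_on_cpu(cpu, my_read_fn, val);
}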
3413 #endif /* CONFIG_SMP */
3414 
3415 #ifdef CONFIG_FREEZER
3416 
3417 /**
3418  * freeze_workqueues_begin - begin freezing workqueues
3419  *
3420  * Start freezing workqueues.  After this function returns, all
3421  * freezeable workqueues will queue new works to their delayed_works
3422  * list instead of gcwq->worklist.
3423  *
3424  * CONTEXT:
3425  * Grabs and releases workqueue_lock and gcwq->lock's.
3426  */
3427 void freeze_workqueues_begin(void)
3428 {
3429 	unsigned int cpu;
3430 
3431 	spin_lock(&workqueue_lock);
3432 
3433 	BUG_ON(workqueue_freezing);
3434 	workqueue_freezing = true;
3435 
3436 	for_each_gcwq_cpu(cpu) {
3437 		struct global_cwq *gcwq = get_gcwq(cpu);
3438 		struct workqueue_struct *wq;
3439 
3440 		spin_lock_irq(&gcwq->lock);
3441 
3442 		BUG_ON(gcwq->flags & GCWQ_FREEZING);
3443 		gcwq->flags |= GCWQ_FREEZING;
3444 
3445 		list_for_each_entry(wq, &workqueues, list) {
3446 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3447 
3448 			if (cwq && wq->flags & WQ_FREEZEABLE)
3449 				cwq->max_active = 0;
3450 		}
3451 
3452 		spin_unlock_irq(&gcwq->lock);
3453 	}
3454 
3455 	spin_unlock(&workqueue_lock);
3456 }
3457 
3458 /**
3459  * freeze_workqueues_busy - are freezeable workqueues still busy?
3460  *
3461  * Check whether freezing is complete.  This function must be called
3462  * between freeze_workqueues_begin() and thaw_workqueues().
3463  *
3464  * CONTEXT:
3465  * Grabs and releases workqueue_lock.
3466  *
3467  * RETURNS:
3468  * %true if some freezeable workqueues are still busy.  %false if
3469  * freezing is complete.
3470  */
3471 bool freeze_workqueues_busy(void)
3472 {
3473 	unsigned int cpu;
3474 	bool busy = false;
3475 
3476 	spin_lock(&workqueue_lock);
3477 
3478 	BUG_ON(!workqueue_freezing);
3479 
3480 	for_each_gcwq_cpu(cpu) {
3481 		struct workqueue_struct *wq;
3482 		/*
3483 		 * nr_active is monotonically decreasing.  It's safe
3484 		 * to peek without lock.
3485 		 */
3486 		list_for_each_entry(wq, &workqueues, list) {
3487 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3488 
3489 			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3490 				continue;
3491 
3492 			BUG_ON(cwq->nr_active < 0);
3493 			if (cwq->nr_active) {
3494 				busy = true;
3495 				goto out_unlock;
3496 			}
3497 		}
3498 	}
3499 out_unlock:
3500 	spin_unlock(&workqueue_lock);
3501 	return busy;
3502 }
3503 
3504 /**
3505  * thaw_workqueues - thaw workqueues
3506  *
3507  * Thaw workqueues.  Normal queueing is restored and all collected
3508  * frozen works are transferred to their respective gcwq worklists.
3509  *
3510  * CONTEXT:
3511  * Grabs and releases workqueue_lock and gcwq->lock's.
3512  */
3513 void thaw_workqueues(void)
3514 {
3515 	unsigned int cpu;
3516 
3517 	spin_lock(&workqueue_lock);
3518 
3519 	if (!workqueue_freezing)
3520 		goto out_unlock;
3521 
3522 	for_each_gcwq_cpu(cpu) {
3523 		struct global_cwq *gcwq = get_gcwq(cpu);
3524 		struct workqueue_struct *wq;
3525 
3526 		spin_lock_irq(&gcwq->lock);
3527 
3528 		BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3529 		gcwq->flags &= ~GCWQ_FREEZING;
3530 
3531 		list_for_each_entry(wq, &workqueues, list) {
3532 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3533 
3534 			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
3535 				continue;
3536 
3537 			/* restore max_active and repopulate worklist */
3538 			cwq->max_active = wq->saved_max_active;
3539 
3540 			while (!list_empty(&cwq->delayed_works) &&
3541 			       cwq->nr_active < cwq->max_active)
3542 				cwq_activate_first_delayed(cwq);
3543 		}
3544 
3545 		wake_up_worker(gcwq);
3546 
3547 		spin_unlock_irq(&gcwq->lock);
3548 	}
3549 
3550 	workqueue_freezing = false;
3551 out_unlock:
3552 	spin_unlock(&workqueue_lock);
3553 }
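/*
 * Illustrative sketch, not part of the original file: how the freezer side
 * is expected to drive the three calls above - begin freezing, poll until
 * no freezeable workqueue is busy, and thaw on the way back.  my_freeze_wqs()
 * and my_thaw_wqs() are hypothetical and omit the timeout/abort handling a
 * real caller needs.
 */
static inline void my_freeze_wqs(void)
{
	freeze_workqueues_begin();

	/* a real caller sleeps between polls and bounds this loop */
	while (freeze_workqueues_busy())
		schedule();
}

static inline void my_thaw_wqs(void)
{
	thaw_workqueues();
}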
3554 #endif /* CONFIG_FREEZER */
3555 
3556 static int __init init_workqueues(void)
3557 {
3558 	unsigned int cpu;
3559 	int i;
3560 
3561 	cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3562 
3563 	/* initialize gcwqs */
3564 	for_each_gcwq_cpu(cpu) {
3565 		struct global_cwq *gcwq = get_gcwq(cpu);
3566 
3567 		spin_lock_init(&gcwq->lock);
3568 		INIT_LIST_HEAD(&gcwq->worklist);
3569 		gcwq->cpu = cpu;
3570 		gcwq->flags |= GCWQ_DISASSOCIATED;
3571 
3572 		INIT_LIST_HEAD(&gcwq->idle_list);
3573 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3574 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3575 
3576 		init_timer_deferrable(&gcwq->idle_timer);
3577 		gcwq->idle_timer.function = idle_worker_timeout;
3578 		gcwq->idle_timer.data = (unsigned long)gcwq;
3579 
3580 		setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3581 			    (unsigned long)gcwq);
3582 
3583 		ida_init(&gcwq->worker_ida);
3584 
3585 		gcwq->trustee_state = TRUSTEE_DONE;
3586 		init_waitqueue_head(&gcwq->trustee_wait);
3587 	}
3588 
3589 	/* create the initial worker */
3590 	for_each_online_gcwq_cpu(cpu) {
3591 		struct global_cwq *gcwq = get_gcwq(cpu);
3592 		struct worker *worker;
3593 
3594 		if (cpu != WORK_CPU_UNBOUND)
3595 			gcwq->flags &= ~GCWQ_DISASSOCIATED;
3596 		worker = create_worker(gcwq, true);
3597 		BUG_ON(!worker);
3598 		spin_lock_irq(&gcwq->lock);
3599 		start_worker(worker);
3600 		spin_unlock_irq(&gcwq->lock);
3601 	}
3602 
3603 	system_wq = alloc_workqueue("events", 0, 0);
3604 	system_long_wq = alloc_workqueue("events_long", 0, 0);
3605 	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3606 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3607 					    WQ_UNBOUND_MAX_ACTIVE);
3608 	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || !system_unbound_wq);
3609 	return 0;
3610 }
3611 early_initcall(init_workqueues);
3612