xref: /xnu-11215/bsd/pthread/pthread_workqueue.c (revision 8d741a5d)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995-2018 Apple, Inc. All Rights Reserved */
29 
30 #include <sys/cdefs.h>
31 
32 #include <kern/assert.h>
33 #include <kern/ast.h>
34 #include <kern/clock.h>
35 #include <kern/cpu_data.h>
36 #include <kern/kern_types.h>
37 #include <kern/policy_internal.h>
38 #include <kern/processor.h>
39 #include <kern/sched_prim.h>    /* for thread_exception_return */
40 #include <kern/task.h>
41 #include <kern/thread.h>
42 #include <kern/thread_group.h>
43 #include <kern/zalloc.h>
44 #include <kern/work_interval.h>
45 #include <mach/kern_return.h>
46 #include <mach/mach_param.h>
47 #include <mach/mach_port.h>
48 #include <mach/mach_types.h>
49 #include <mach/mach_vm.h>
50 #include <mach/sync_policy.h>
51 #include <mach/task.h>
52 #include <mach/thread_act.h> /* for thread_resume */
53 #include <mach/thread_policy.h>
54 #include <mach/thread_status.h>
55 #include <mach/vm_prot.h>
56 #include <mach/vm_statistics.h>
57 #include <machine/atomic.h>
58 #include <machine/machine_routines.h>
59 #include <machine/smp.h>
60 #include <vm/vm_map.h>
61 #include <vm/vm_protos.h>
62 
63 #include <sys/eventvar.h>
64 #include <sys/kdebug.h>
65 #include <sys/kernel.h>
66 #include <sys/lock.h>
67 #include <sys/param.h>
68 #include <sys/proc_info.h>      /* for fill_procworkqueue */
69 #include <sys/proc_internal.h>
70 #include <sys/pthread_shims.h>
71 #include <sys/resourcevar.h>
72 #include <sys/signalvar.h>
73 #include <sys/sysctl.h>
74 #include <sys/sysproto.h>
75 #include <sys/systm.h>
76 #include <sys/ulock.h> /* for ulock_owner_value_to_port_name */
77 
78 #include <pthread/bsdthread_private.h>
79 #include <pthread/workqueue_syscalls.h>
80 #include <pthread/workqueue_internal.h>
81 #include <pthread/workqueue_trace.h>
82 
83 #include <os/log.h>
84 
85 static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2;
86 
87 static void workq_bound_thread_unpark_continue(void *uth, wait_result_t wr) __dead2;
88 
89 static void workq_bound_thread_initialize_and_unpark_continue(void *uth, wait_result_t wr) __dead2;
90 
91 static void workq_bound_thread_setup_and_run(struct uthread *uth, int setup_flags) __dead2;
92 
93 static void workq_schedule_creator(proc_t p, struct workqueue *wq,
94     workq_kern_threadreq_flags_t flags);
95 
96 static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
97     workq_threadreq_t req);
98 
99 static uint32_t workq_constrained_allowance(struct workqueue *wq,
100     thread_qos_t at_qos, struct uthread *uth,
101     bool may_start_timer, bool record_failed_allowance);
102 
103 static bool _wq_cooperative_queue_refresh_best_req_qos(struct workqueue *wq);
104 
105 static bool workq_thread_is_busy(uint64_t cur_ts,
106     _Atomic uint64_t *lastblocked_tsp);
107 
108 static int workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS;
109 
110 static bool
111 workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags);
112 
113 static inline void
114 workq_lock_spin(struct workqueue *wq);
115 
116 static inline void
117 workq_unlock(struct workqueue *wq);
118 
119 #pragma mark globals
120 
121 struct workq_usec_var {
122 	uint32_t usecs;
123 	uint64_t abstime;
124 };
125 
126 #define WORKQ_SYSCTL_USECS(var, init) \
127 	        static struct workq_usec_var var = { .usecs = init }; \
128 	        SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \
129 	                        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &var, 0, \
130 	                        workq_sysctl_handle_usecs, "I", "")
131 
132 static LCK_GRP_DECLARE(workq_lck_grp, "workq");
133 os_refgrp_decl(static, workq_refgrp, "workq", NULL);
134 
135 static ZONE_DEFINE(workq_zone_workqueue, "workq.wq",
136     sizeof(struct workqueue), ZC_NONE);
137 static ZONE_DEFINE(workq_zone_threadreq, "workq.threadreq",
138     sizeof(struct workq_threadreq_s), ZC_CACHING);
139 
140 static struct mpsc_daemon_queue workq_deallocate_queue;
141 
142 WORKQ_SYSCTL_USECS(wq_stalled_window, WQ_STALLED_WINDOW_USECS);
143 WORKQ_SYSCTL_USECS(wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS);
144 WORKQ_SYSCTL_USECS(wq_max_timer_interval, WQ_MAX_TIMER_INTERVAL_USECS);
145 static uint32_t wq_max_threads              = WORKQUEUE_MAXTHREADS;
146 static uint32_t wq_max_constrained_threads  = WORKQUEUE_MAXTHREADS / 8;
147 static uint32_t wq_init_constrained_limit   = 1;
148 static uint16_t wq_death_max_load;
149 static uint32_t wq_max_parallelism[WORKQ_NUM_QOS_BUCKETS];
150 
151 /*
152  * This is not a hard limit but the max size we want to aim to hit across the
153  * entire cooperative pool. We can oversubscribe the pool due to non-cooperative
154  * workers and the max we will oversubscribe the pool by, is a total of
155  * wq_max_cooperative_threads * WORKQ_NUM_QOS_BUCKETS.
156  */
157 static uint32_t wq_max_cooperative_threads;
158 
159 static inline uint32_t
wq_cooperative_queue_max_size(struct workqueue * wq)160 wq_cooperative_queue_max_size(struct workqueue *wq)
161 {
162 	return wq->wq_cooperative_queue_has_limited_max_size ? 1 : wq_max_cooperative_threads;
163 }
164 
165 #pragma mark sysctls
166 
167 static int
168 workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS
169 {
170 #pragma unused(arg2)
171 	struct workq_usec_var *v = arg1;
172 	int error = sysctl_handle_int(oidp, &v->usecs, 0, req);
173 	if (error || !req->newptr) {
174 		return error;
175 	}
176 	clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC,
177 	    &v->abstime);
178 	return 0;
179 }
180 
181 SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
182     &wq_max_threads, 0, "");
183 
184 SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
185     &wq_max_constrained_threads, 0, "");
186 
187 static int
188 wq_limit_cooperative_threads_for_proc SYSCTL_HANDLER_ARGS
189 {
190 #pragma unused(arg1, arg2, oidp)
191 	int input_pool_size = 0;
192 	int changed;
193 	int error = 0;
194 
195 	error = sysctl_io_number(req, 0, sizeof(int), &input_pool_size, &changed);
196 	if (error || !changed) {
197 		return error;
198 	}
199 
200 #define WQ_COOPERATIVE_POOL_SIZE_DEFAULT 0
201 #define WQ_COOPERATIVE_POOL_SIZE_STRICT_PER_QOS -1
202 /* Not available currently, but sysctl interface is designed to allow these
203  * extra parameters:
204  *		WQ_COOPERATIVE_POOL_SIZE_STRICT : -2 (across all bucket)
205  *		WQ_COOPERATIVE_POOL_SIZE_CUSTOM : [1, 512]
206  */
207 
208 	if (input_pool_size != WQ_COOPERATIVE_POOL_SIZE_DEFAULT
209 	    && input_pool_size != WQ_COOPERATIVE_POOL_SIZE_STRICT_PER_QOS) {
210 		error = EINVAL;
211 		goto out;
212 	}
213 
214 	proc_t p = req->p;
215 	struct workqueue *wq = proc_get_wqptr(p);
216 
217 	if (wq != NULL) {
218 		workq_lock_spin(wq);
219 		if (wq->wq_reqcount > 0 || wq->wq_nthreads > 0) {
220 			// Hackily enforce that the workqueue is still new (no requests or
221 			// threads)
222 			error = ENOTSUP;
223 		} else {
224 			wq->wq_cooperative_queue_has_limited_max_size = (input_pool_size == WQ_COOPERATIVE_POOL_SIZE_STRICT_PER_QOS);
225 		}
226 		workq_unlock(wq);
227 	} else {
228 		/* This process has no workqueue, calling this syctl makes no sense */
229 		return ENOTSUP;
230 	}
231 
232 out:
233 	return error;
234 }
235 
236 SYSCTL_PROC(_kern, OID_AUTO, wq_limit_cooperative_threads,
237     CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_WR | CTLFLAG_LOCKED | CTLTYPE_INT, 0, 0,
238     wq_limit_cooperative_threads_for_proc,
239     "I", "Modify the max pool size of the cooperative pool");
240 
241 #pragma mark p_wqptr
242 
243 #define WQPTR_IS_INITING_VALUE ((struct workqueue *)~(uintptr_t)0)
244 
245 static struct workqueue *
proc_get_wqptr_fast(struct proc * p)246 proc_get_wqptr_fast(struct proc *p)
247 {
248 	return os_atomic_load(&p->p_wqptr, relaxed);
249 }
250 
251 struct workqueue *
proc_get_wqptr(struct proc * p)252 proc_get_wqptr(struct proc *p)
253 {
254 	struct workqueue *wq = proc_get_wqptr_fast(p);
255 	return wq == WQPTR_IS_INITING_VALUE ? NULL : wq;
256 }
257 
258 static void
proc_set_wqptr(struct proc * p,struct workqueue * wq)259 proc_set_wqptr(struct proc *p, struct workqueue *wq)
260 {
261 	wq = os_atomic_xchg(&p->p_wqptr, wq, release);
262 	if (wq == WQPTR_IS_INITING_VALUE) {
263 		proc_lock(p);
264 		thread_wakeup(&p->p_wqptr);
265 		proc_unlock(p);
266 	}
267 }
268 
269 static bool
proc_init_wqptr_or_wait(struct proc * p)270 proc_init_wqptr_or_wait(struct proc *p)
271 {
272 	struct workqueue *wq;
273 
274 	proc_lock(p);
275 	wq = os_atomic_load(&p->p_wqptr, relaxed);
276 
277 	if (wq == NULL) {
278 		os_atomic_store(&p->p_wqptr, WQPTR_IS_INITING_VALUE, relaxed);
279 		proc_unlock(p);
280 		return true;
281 	}
282 
283 	if (wq == WQPTR_IS_INITING_VALUE) {
284 		assert_wait(&p->p_wqptr, THREAD_UNINT);
285 		proc_unlock(p);
286 		thread_block(THREAD_CONTINUE_NULL);
287 	} else {
288 		proc_unlock(p);
289 	}
290 	return false;
291 }
292 
293 static inline event_t
workq_parked_wait_event(struct uthread * uth)294 workq_parked_wait_event(struct uthread *uth)
295 {
296 	return (event_t)&uth->uu_workq_stackaddr;
297 }
298 
299 static inline void
workq_thread_wakeup(struct uthread * uth)300 workq_thread_wakeup(struct uthread *uth)
301 {
302 	thread_wakeup_thread(workq_parked_wait_event(uth), get_machthread(uth));
303 }
304 
305 #pragma mark wq_thactive
306 
307 #if defined(__LP64__)
308 // Layout is:
309 //   127 - 115 : 13 bits of zeroes
310 //   114 - 112 : best QoS among all pending constrained requests
311 //   111 -   0 : MGR, AUI, UI, IN, DF, UT, BG+MT buckets every 16 bits
312 #define WQ_THACTIVE_BUCKET_WIDTH 16
313 #define WQ_THACTIVE_QOS_SHIFT    (7 * WQ_THACTIVE_BUCKET_WIDTH)
314 #else
315 // Layout is:
316 //   63 - 61 : best QoS among all pending constrained requests
317 //   60      : Manager bucket (0 or 1)
318 //   59 -  0 : AUI, UI, IN, DF, UT, BG+MT buckets every 10 bits
319 #define WQ_THACTIVE_BUCKET_WIDTH 10
320 #define WQ_THACTIVE_QOS_SHIFT    (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
321 #endif
322 #define WQ_THACTIVE_BUCKET_MASK  ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
323 #define WQ_THACTIVE_BUCKET_HALF  (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
324 
325 static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
326     "Make sure we have space to encode a QoS");
327 
328 static inline wq_thactive_t
_wq_thactive(struct workqueue * wq)329 _wq_thactive(struct workqueue *wq)
330 {
331 	return os_atomic_load_wide(&wq->wq_thactive, relaxed);
332 }
333 
334 static inline uint8_t
_wq_bucket(thread_qos_t qos)335 _wq_bucket(thread_qos_t qos)
336 {
337 	// Map both BG and MT to the same bucket by over-shifting down and
338 	// clamping MT and BG together.
339 	switch (qos) {
340 	case THREAD_QOS_MAINTENANCE:
341 		return 0;
342 	default:
343 		return qos - 2;
344 	}
345 }
346 
347 #define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
348 	        ((thread_qos_t)((tha) >> WQ_THACTIVE_QOS_SHIFT))
349 
350 static inline thread_qos_t
_wq_thactive_best_constrained_req_qos(struct workqueue * wq)351 _wq_thactive_best_constrained_req_qos(struct workqueue *wq)
352 {
353 	// Avoid expensive atomic operations: the three bits we're loading are in
354 	// a single byte, and always updated under the workqueue lock
355 	wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
356 	return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
357 }
358 
359 static void
_wq_thactive_refresh_best_constrained_req_qos(struct workqueue * wq)360 _wq_thactive_refresh_best_constrained_req_qos(struct workqueue *wq)
361 {
362 	thread_qos_t old_qos, new_qos;
363 	workq_threadreq_t req;
364 
365 	req = priority_queue_max(&wq->wq_constrained_queue,
366 	    struct workq_threadreq_s, tr_entry);
367 	new_qos = req ? req->tr_qos : THREAD_QOS_UNSPECIFIED;
368 	old_qos = _wq_thactive_best_constrained_req_qos(wq);
369 	if (old_qos != new_qos) {
370 		long delta = (long)new_qos - (long)old_qos;
371 		wq_thactive_t v = (wq_thactive_t)delta << WQ_THACTIVE_QOS_SHIFT;
372 		/*
373 		 * We can do an atomic add relative to the initial load because updates
374 		 * to this qos are always serialized under the workqueue lock.
375 		 */
376 		v = os_atomic_add(&wq->wq_thactive, v, relaxed);
377 #ifdef __LP64__
378 		WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, (uint64_t)v,
379 		    (uint64_t)(v >> 64), 0);
380 #else
381 		WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, v, 0, 0);
382 #endif
383 	}
384 }
385 
386 static inline wq_thactive_t
_wq_thactive_offset_for_qos(thread_qos_t qos)387 _wq_thactive_offset_for_qos(thread_qos_t qos)
388 {
389 	uint8_t bucket = _wq_bucket(qos);
390 	__builtin_assume(bucket < WORKQ_NUM_BUCKETS);
391 	return (wq_thactive_t)1 << (bucket * WQ_THACTIVE_BUCKET_WIDTH);
392 }
393 
394 static inline wq_thactive_t
_wq_thactive_inc(struct workqueue * wq,thread_qos_t qos)395 _wq_thactive_inc(struct workqueue *wq, thread_qos_t qos)
396 {
397 	wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
398 	return os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
399 }
400 
401 static inline wq_thactive_t
_wq_thactive_dec(struct workqueue * wq,thread_qos_t qos)402 _wq_thactive_dec(struct workqueue *wq, thread_qos_t qos)
403 {
404 	wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
405 	return os_atomic_sub_orig(&wq->wq_thactive, v, relaxed);
406 }
407 
408 static inline void
_wq_thactive_move(struct workqueue * wq,thread_qos_t old_qos,thread_qos_t new_qos)409 _wq_thactive_move(struct workqueue *wq,
410     thread_qos_t old_qos, thread_qos_t new_qos)
411 {
412 	wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) -
413 	    _wq_thactive_offset_for_qos(old_qos);
414 	os_atomic_add(&wq->wq_thactive, v, relaxed);
415 	wq->wq_thscheduled_count[_wq_bucket(old_qos)]--;
416 	wq->wq_thscheduled_count[_wq_bucket(new_qos)]++;
417 }
418 
419 static inline uint32_t
_wq_thactive_aggregate_downto_qos(struct workqueue * wq,wq_thactive_t v,thread_qos_t qos,uint32_t * busycount,uint32_t * max_busycount)420 _wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
421     thread_qos_t qos, uint32_t *busycount, uint32_t *max_busycount)
422 {
423 	uint32_t count = 0, active;
424 	uint64_t curtime;
425 
426 	assert(WORKQ_THREAD_QOS_MIN <= qos && qos <= WORKQ_THREAD_QOS_MAX);
427 
428 	if (busycount) {
429 		curtime = mach_absolute_time();
430 		*busycount = 0;
431 	}
432 	if (max_busycount) {
433 		*max_busycount = THREAD_QOS_LAST - qos;
434 	}
435 
436 	uint8_t i = _wq_bucket(qos);
437 	v >>= i * WQ_THACTIVE_BUCKET_WIDTH;
438 	for (; i < WORKQ_NUM_QOS_BUCKETS; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
439 		active = v & WQ_THACTIVE_BUCKET_MASK;
440 		count += active;
441 
442 		if (busycount && wq->wq_thscheduled_count[i] > active) {
443 			if (workq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
444 				/*
445 				 * We only consider the last blocked thread for a given bucket
446 				 * as busy because we don't want to take the list lock in each
447 				 * sched callback. However this is an approximation that could
448 				 * contribute to thread creation storms.
449 				 */
450 				(*busycount)++;
451 			}
452 		}
453 	}
454 
455 	return count;
456 }
457 
458 /* The input qos here should be the requested QoS of the thread, not accounting
459  * for any overrides */
460 static inline void
_wq_cooperative_queue_scheduled_count_dec(struct workqueue * wq,thread_qos_t qos)461 _wq_cooperative_queue_scheduled_count_dec(struct workqueue *wq, thread_qos_t qos)
462 {
463 	__assert_only uint8_t old_scheduled_count = wq->wq_cooperative_queue_scheduled_count[_wq_bucket(qos)]--;
464 	assert(old_scheduled_count > 0);
465 }
466 
467 /* The input qos here should be the requested QoS of the thread, not accounting
468  * for any overrides */
469 static inline void
_wq_cooperative_queue_scheduled_count_inc(struct workqueue * wq,thread_qos_t qos)470 _wq_cooperative_queue_scheduled_count_inc(struct workqueue *wq, thread_qos_t qos)
471 {
472 	__assert_only uint8_t old_scheduled_count = wq->wq_cooperative_queue_scheduled_count[_wq_bucket(qos)]++;
473 	assert(old_scheduled_count < UINT8_MAX);
474 }
475 
476 #pragma mark wq_flags
477 
478 static inline uint32_t
_wq_flags(struct workqueue * wq)479 _wq_flags(struct workqueue *wq)
480 {
481 	return os_atomic_load(&wq->wq_flags, relaxed);
482 }
483 
484 static inline bool
_wq_exiting(struct workqueue * wq)485 _wq_exiting(struct workqueue *wq)
486 {
487 	return _wq_flags(wq) & WQ_EXITING;
488 }
489 
490 bool
workq_is_exiting(struct proc * p)491 workq_is_exiting(struct proc *p)
492 {
493 	struct workqueue *wq = proc_get_wqptr(p);
494 	return !wq || _wq_exiting(wq);
495 }
496 
497 
498 #pragma mark workqueue lock
499 
500 static bool
workq_lock_is_acquired_kdp(struct workqueue * wq)501 workq_lock_is_acquired_kdp(struct workqueue *wq)
502 {
503 	return kdp_lck_ticket_is_acquired(&wq->wq_lock);
504 }
505 
506 static inline void
workq_lock_spin(struct workqueue * wq)507 workq_lock_spin(struct workqueue *wq)
508 {
509 	lck_ticket_lock(&wq->wq_lock, &workq_lck_grp);
510 }
511 
512 static inline void
workq_lock_held(struct workqueue * wq)513 workq_lock_held(struct workqueue *wq)
514 {
515 	LCK_TICKET_ASSERT_OWNED(&wq->wq_lock);
516 }
517 
518 static inline bool
workq_lock_try(struct workqueue * wq)519 workq_lock_try(struct workqueue *wq)
520 {
521 	return lck_ticket_lock_try(&wq->wq_lock, &workq_lck_grp);
522 }
523 
524 static inline void
workq_unlock(struct workqueue * wq)525 workq_unlock(struct workqueue *wq)
526 {
527 	lck_ticket_unlock(&wq->wq_lock);
528 }
529 
530 #pragma mark idle thread lists
531 
532 #define WORKQ_POLICY_INIT(qos) \
533 	        (struct uu_workq_policy){ .qos_req = qos, .qos_bucket = qos }
534 
535 static inline thread_qos_t
workq_pri_bucket(struct uu_workq_policy req)536 workq_pri_bucket(struct uu_workq_policy req)
537 {
538 	return MAX(MAX(req.qos_req, req.qos_max), req.qos_override);
539 }
540 
541 static inline thread_qos_t
workq_pri_override(struct uu_workq_policy req)542 workq_pri_override(struct uu_workq_policy req)
543 {
544 	return MAX(workq_pri_bucket(req), req.qos_bucket);
545 }
546 
547 static inline bool
workq_thread_needs_params_change(workq_threadreq_t req,struct uthread * uth)548 workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth)
549 {
550 	workq_threadreq_param_t cur_trp, req_trp = { };
551 
552 	cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params;
553 	if (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
554 		req_trp = kqueue_threadreq_workloop_param(req);
555 	}
556 
557 	/*
558 	 * CPU percent flags are handled separately to policy changes, so ignore
559 	 * them for all of these checks.
560 	 */
561 	uint16_t cur_flags = (cur_trp.trp_flags & ~TRP_CPUPERCENT);
562 	uint16_t req_flags = (req_trp.trp_flags & ~TRP_CPUPERCENT);
563 
564 	if (!req_flags && !cur_flags) {
565 		return false;
566 	}
567 
568 	if (req_flags != cur_flags) {
569 		return true;
570 	}
571 
572 	if ((req_flags & TRP_PRIORITY) && req_trp.trp_pri != cur_trp.trp_pri) {
573 		return true;
574 	}
575 
576 	if ((req_flags & TRP_POLICY) && req_trp.trp_pol != cur_trp.trp_pol) {
577 		return true;
578 	}
579 
580 	return false;
581 }
582 
583 static inline bool
workq_thread_needs_priority_change(workq_threadreq_t req,struct uthread * uth)584 workq_thread_needs_priority_change(workq_threadreq_t req, struct uthread *uth)
585 {
586 	if (workq_thread_needs_params_change(req, uth)) {
587 		return true;
588 	}
589 
590 	if (req->tr_qos != workq_pri_override(uth->uu_workq_pri)) {
591 		return true;
592 	}
593 
594 #if CONFIG_PREADOPT_TG
595 	thread_group_qos_t tg = kqr_preadopt_thread_group(req);
596 	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
597 		/*
598 		 * Ideally, we'd add check here to see if thread's preadopt TG is same
599 		 * as the thread requests's thread group and short circuit if that is
600 		 * the case. But in the interest of keeping the code clean and not
601 		 * taking the thread lock here, we're going to skip this. We will
602 		 * eventually shortcircuit once we try to set the preadoption thread
603 		 * group on the thread.
604 		 */
605 		return true;
606 	}
607 #endif
608 
609 	return false;
610 }
611 
612 /* Input thread must be self. Called during self override, resetting overrides
613  * or while processing kevents
614  *
615  * Called with workq lock held. Sometimes also the thread mutex
616  */
617 static void
workq_thread_update_bucket(proc_t p,struct workqueue * wq,struct uthread * uth,struct uu_workq_policy old_pri,struct uu_workq_policy new_pri,bool force_run)618 workq_thread_update_bucket(proc_t p, struct workqueue *wq, struct uthread *uth,
619     struct uu_workq_policy old_pri, struct uu_workq_policy new_pri,
620     bool force_run)
621 {
622 	assert(uth == current_uthread());
623 
624 	thread_qos_t old_bucket = old_pri.qos_bucket;
625 	thread_qos_t new_bucket = workq_pri_bucket(new_pri);
626 
627 	if ((old_bucket != new_bucket) &&
628 	    !workq_thread_is_permanently_bound(uth)) {
629 		_wq_thactive_move(wq, old_bucket, new_bucket);
630 	}
631 
632 	new_pri.qos_bucket = new_bucket;
633 	uth->uu_workq_pri = new_pri;
634 
635 	if (old_pri.qos_override != new_pri.qos_override) {
636 		thread_set_workq_override(get_machthread(uth), new_pri.qos_override);
637 	}
638 
639 	if (wq->wq_reqcount &&
640 	    !workq_thread_is_permanently_bound(uth) &&
641 	    (old_bucket > new_bucket || force_run)) {
642 		int flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
643 		if (old_bucket > new_bucket) {
644 			/*
645 			 * When lowering our bucket, we may unblock a thread request,
646 			 * but we can't drop our priority before we have evaluated
647 			 * whether this is the case, and if we ever drop the workqueue lock
648 			 * that would cause a priority inversion.
649 			 *
650 			 * We hence have to disallow thread creation in that case.
651 			 */
652 			flags = 0;
653 		}
654 		workq_schedule_creator(p, wq, flags);
655 	}
656 }
657 
658 /*
659  * Sets/resets the cpu percent limits on the current thread. We can't set
660  * these limits from outside of the current thread, so this function needs
661  * to be called when we're executing on the intended
662  */
663 static void
workq_thread_reset_cpupercent(workq_threadreq_t req,struct uthread * uth)664 workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
665 {
666 	assert(uth == current_uthread());
667 	workq_threadreq_param_t trp = { };
668 
669 	if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
670 		trp = kqueue_threadreq_workloop_param(req);
671 	}
672 
673 	if (uth->uu_workq_flags & UT_WORKQ_CPUPERCENT) {
674 		/*
675 		 * Going through disable when we have an existing CPU percent limit
676 		 * set will force the ledger to refill the token bucket of the current
677 		 * thread. Removing any penalty applied by previous thread use.
678 		 */
679 		thread_set_cpulimit(THREAD_CPULIMIT_DISABLE, 0, 0);
680 		uth->uu_workq_flags &= ~UT_WORKQ_CPUPERCENT;
681 	}
682 
683 	if (trp.trp_flags & TRP_CPUPERCENT) {
684 		thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, trp.trp_cpupercent,
685 		    (uint64_t)trp.trp_refillms * NSEC_PER_SEC);
686 		uth->uu_workq_flags |= UT_WORKQ_CPUPERCENT;
687 	}
688 }
689 
690 /*
691  * This function is always called with the workq lock, except for the
692  * permanently bound workqueue thread, which instead requires the kqlock.
693  * See locking model for bound thread's uu_workq_flags.
694  */
695 static void
workq_thread_reset_pri(struct workqueue * wq,struct uthread * uth,workq_threadreq_t req,bool unpark)696 workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
697     workq_threadreq_t req, bool unpark)
698 {
699 	thread_t th = get_machthread(uth);
700 	thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP;
701 	workq_threadreq_param_t trp = { };
702 	int priority = 31;
703 	int policy = POLICY_TIMESHARE;
704 
705 	if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS)) {
706 		trp = kqueue_threadreq_workloop_param(req);
707 	}
708 
709 	uth->uu_workq_pri = WORKQ_POLICY_INIT(qos);
710 	uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS;
711 
712 	if (unpark) {
713 		uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
714 		// qos sent out to userspace (may differ from uu_workq_pri on param threads)
715 		uth->uu_save.uus_workq_park_data.qos = qos;
716 	}
717 
718 	if (qos == WORKQ_THREAD_QOS_MANAGER) {
719 		uint32_t mgr_pri = wq->wq_event_manager_priority;
720 		assert(trp.trp_value == 0); // manager qos and thread policy don't mix
721 
722 		if (_pthread_priority_has_sched_pri(mgr_pri)) {
723 			mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
724 			thread_set_workq_pri(th, THREAD_QOS_UNSPECIFIED, mgr_pri,
725 			    POLICY_TIMESHARE);
726 			return;
727 		}
728 
729 		qos = _pthread_priority_thread_qos(mgr_pri);
730 	} else {
731 		if (trp.trp_flags & TRP_PRIORITY) {
732 			qos = THREAD_QOS_UNSPECIFIED;
733 			priority = trp.trp_pri;
734 			uth->uu_workq_flags |= UT_WORKQ_OUTSIDE_QOS;
735 		}
736 
737 		if (trp.trp_flags & TRP_POLICY) {
738 			policy = trp.trp_pol;
739 		}
740 	}
741 
742 #if CONFIG_PREADOPT_TG
743 	if (req && (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP)) {
744 		/*
745 		 * For kqwl permanently configured with a thread group, we can safely borrow
746 		 * +1 ref from kqwl_preadopt_tg. A thread then takes additional +1 ref
747 		 * for itself via thread_set_preadopt_thread_group.
748 		 *
749 		 * In all other cases, we cannot safely read and borrow the reference from the kqwl
750 		 * since it can disappear from under us at any time due to the max-ing logic in
751 		 * kqueue_set_preadopted_thread_group.
752 		 *
753 		 * As such, we do the following dance:
754 		 *
755 		 * 1) cmpxchng and steal the kqwl's preadopt thread group and leave
756 		 * behind with (NULL + QoS). At this point, we have the reference
757 		 * to the thread group from the kqwl.
758 		 * 2) Have the thread set the preadoption thread group on itself.
759 		 * 3) cmpxchng from (NULL + QoS) which we set earlier in (1), back to
760 		 * thread_group + QoS. ie we try to give the reference back to the kqwl.
761 		 * If we fail, that's because a higher QoS thread group was set on the
762 		 * kqwl in kqueue_set_preadopted_thread_group in which case, we need to
763 		 * go back to (1).
764 		 */
765 
766 		_Atomic(struct thread_group *) * tg_loc = kqr_preadopt_thread_group_addr(req);
767 
768 		thread_group_qos_t old_tg, new_tg;
769 		int ret = 0;
770 again:
771 		ret = os_atomic_rmw_loop(tg_loc, old_tg, new_tg, relaxed, {
772 			if ((!KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) ||
773 			KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
774 			        os_atomic_rmw_loop_give_up(break);
775 			}
776 
777 			/*
778 			 * Leave the QoS behind - kqueue_set_preadopted_thread_group will
779 			 * only modify it if there is a higher QoS thread group to attach
780 			 */
781 			new_tg = (thread_group_qos_t) ((uintptr_t) old_tg & KQWL_PREADOPT_TG_QOS_MASK);
782 		});
783 
784 		if (ret) {
785 			/*
786 			 * We successfully took the ref from the kqwl so set it on the
787 			 * thread now
788 			 */
789 			thread_set_preadopt_thread_group(th, KQWL_GET_PREADOPTED_TG(old_tg));
790 
791 			thread_group_qos_t thread_group_to_expect = new_tg;
792 			thread_group_qos_t thread_group_to_set = old_tg;
793 
794 			os_atomic_rmw_loop(tg_loc, old_tg, new_tg, relaxed, {
795 				if (old_tg != thread_group_to_expect) {
796 				        /*
797 				         * There was an intervening write to the kqwl_preadopt_tg,
798 				         * and it has a higher QoS than what we are working with
799 				         * here. Abandon our current adopted thread group and redo
800 				         * the full dance
801 				         */
802 				        thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(thread_group_to_set));
803 				        os_atomic_rmw_loop_give_up(goto again);
804 				}
805 
806 				new_tg = thread_group_to_set;
807 			});
808 		} else {
809 			if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
810 				thread_set_preadopt_thread_group(th, KQWL_GET_PREADOPTED_TG(old_tg));
811 			} else {
812 				/* Nothing valid on the kqwl, just clear what's on the thread */
813 				thread_set_preadopt_thread_group(th, NULL);
814 			}
815 		}
816 	} else {
817 		/* Not even a kqwl, clear what's on the thread */
818 		thread_set_preadopt_thread_group(th, NULL);
819 	}
820 #endif
821 	thread_set_workq_pri(th, qos, priority, policy);
822 }
823 
824 /*
825  * Called by kevent with the NOTE_WL_THREAD_REQUEST knote lock held,
826  * every time a servicer is being told about a new max QoS.
827  */
828 void
workq_thread_set_max_qos(struct proc * p,workq_threadreq_t kqr)829 workq_thread_set_max_qos(struct proc *p, workq_threadreq_t kqr)
830 {
831 	struct uu_workq_policy old_pri, new_pri;
832 	struct uthread *uth = current_uthread();
833 	struct workqueue *wq = proc_get_wqptr_fast(p);
834 	thread_qos_t qos = kqr->tr_kq_qos_index;
835 
836 	if (uth->uu_workq_pri.qos_max == qos) {
837 		return;
838 	}
839 
840 	workq_lock_spin(wq);
841 	old_pri = new_pri = uth->uu_workq_pri;
842 	new_pri.qos_max = qos;
843 	workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
844 	workq_unlock(wq);
845 }
846 
847 #pragma mark idle threads accounting and handling
848 
849 static inline struct uthread *
workq_oldest_killable_idle_thread(struct workqueue * wq)850 workq_oldest_killable_idle_thread(struct workqueue *wq)
851 {
852 	struct uthread *uth = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
853 
854 	if (uth && !uth->uu_save.uus_workq_park_data.has_stack) {
855 		uth = TAILQ_PREV(uth, workq_uthread_head, uu_workq_entry);
856 		if (uth) {
857 			assert(uth->uu_save.uus_workq_park_data.has_stack);
858 		}
859 	}
860 	return uth;
861 }
862 
863 static inline uint64_t
workq_kill_delay_for_idle_thread(struct workqueue * wq)864 workq_kill_delay_for_idle_thread(struct workqueue *wq)
865 {
866 	uint64_t delay = wq_reduce_pool_window.abstime;
867 	uint16_t idle = wq->wq_thidlecount;
868 
869 	/*
870 	 * If we have less than wq_death_max_load threads, have a 5s timer.
871 	 *
872 	 * For the next wq_max_constrained_threads ones, decay linearly from
873 	 * from 5s to 50ms.
874 	 */
875 	if (idle <= wq_death_max_load) {
876 		return delay;
877 	}
878 
879 	if (wq_max_constrained_threads > idle - wq_death_max_load) {
880 		delay *= (wq_max_constrained_threads - (idle - wq_death_max_load));
881 	}
882 	return delay / wq_max_constrained_threads;
883 }
884 
885 static inline bool
workq_should_kill_idle_thread(struct workqueue * wq,struct uthread * uth,uint64_t now)886 workq_should_kill_idle_thread(struct workqueue *wq, struct uthread *uth,
887     uint64_t now)
888 {
889 	uint64_t delay = workq_kill_delay_for_idle_thread(wq);
890 	return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay;
891 }
892 
893 static void
workq_death_call_schedule(struct workqueue * wq,uint64_t deadline)894 workq_death_call_schedule(struct workqueue *wq, uint64_t deadline)
895 {
896 	uint32_t wq_flags = os_atomic_load(&wq->wq_flags, relaxed);
897 
898 	if (wq_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) {
899 		return;
900 	}
901 	os_atomic_or(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
902 
903 	WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_NONE, wq, 1, 0, 0);
904 
905 	/*
906 	 * <rdar://problem/13139182> Due to how long term timers work, the leeway
907 	 * can't be too short, so use 500ms which is long enough that we will not
908 	 * wake up the CPU for killing threads, but short enough that it doesn't
909 	 * fall into long-term timer list shenanigans.
910 	 */
911 	thread_call_enter_delayed_with_leeway(wq->wq_death_call, NULL, deadline,
912 	    wq_reduce_pool_window.abstime / 10,
913 	    THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND);
914 }
915 
916 /*
917  * `decrement` is set to the number of threads that are no longer dying:
918  * - because they have been resuscitated just in time (workq_pop_idle_thread)
919  * - or have been killed (workq_thread_terminate).
920  */
921 static void
workq_death_policy_evaluate(struct workqueue * wq,uint16_t decrement)922 workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement)
923 {
924 	struct uthread *uth;
925 
926 	assert(wq->wq_thdying_count >= decrement);
927 	if ((wq->wq_thdying_count -= decrement) > 0) {
928 		return;
929 	}
930 
931 	if (wq->wq_thidlecount <= 1) {
932 		return;
933 	}
934 
935 	if ((uth = workq_oldest_killable_idle_thread(wq)) == NULL) {
936 		return;
937 	}
938 
939 	uint64_t now = mach_absolute_time();
940 	uint64_t delay = workq_kill_delay_for_idle_thread(wq);
941 
942 	if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) {
943 		WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
944 		    wq, wq->wq_thidlecount, 0, 0);
945 		wq->wq_thdying_count++;
946 		uth->uu_workq_flags |= UT_WORKQ_DYING;
947 		if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
948 			workq_thread_wakeup(uth);
949 		}
950 		return;
951 	}
952 
953 	workq_death_call_schedule(wq,
954 	    uth->uu_save.uus_workq_park_data.idle_stamp + delay);
955 }
956 
957 void
workq_thread_terminate(struct proc * p,struct uthread * uth)958 workq_thread_terminate(struct proc *p, struct uthread *uth)
959 {
960 	struct workqueue *wq = proc_get_wqptr_fast(p);
961 
962 	workq_lock_spin(wq);
963 	if (!workq_thread_is_permanently_bound(uth)) {
964 		TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
965 		if (uth->uu_workq_flags & UT_WORKQ_DYING) {
966 			WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_END,
967 			    wq, wq->wq_thidlecount, 0, 0);
968 			workq_death_policy_evaluate(wq, 1);
969 		}
970 	}
971 	if (wq->wq_nthreads-- == wq_max_threads) {
972 		/*
973 		 * We got under the thread limit again, which may have prevented
974 		 * thread creation from happening, redrive if there are pending requests
975 		 */
976 		if (wq->wq_reqcount) {
977 			workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
978 		}
979 	}
980 	workq_unlock(wq);
981 
982 	thread_deallocate(get_machthread(uth));
983 }
984 
985 static void
workq_kill_old_threads_call(void * param0,void * param1 __unused)986 workq_kill_old_threads_call(void *param0, void *param1 __unused)
987 {
988 	struct workqueue *wq = param0;
989 
990 	workq_lock_spin(wq);
991 	WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0);
992 	os_atomic_andnot(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
993 	workq_death_policy_evaluate(wq, 0);
994 	WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0);
995 	workq_unlock(wq);
996 }
997 
998 static struct uthread *
workq_pop_idle_thread(struct workqueue * wq,uint16_t uu_flags,bool * needs_wakeup)999 workq_pop_idle_thread(struct workqueue *wq, uint16_t uu_flags,
1000     bool *needs_wakeup)
1001 {
1002 	struct uthread *uth;
1003 
1004 	if ((uth = TAILQ_FIRST(&wq->wq_thidlelist))) {
1005 		TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
1006 	} else {
1007 		uth = TAILQ_FIRST(&wq->wq_thnewlist);
1008 		TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
1009 	}
1010 	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
1011 
1012 	assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0);
1013 	uth->uu_workq_flags |= UT_WORKQ_RUNNING | uu_flags;
1014 
1015 	/* A thread is never woken up as part of the cooperative pool */
1016 	assert((uu_flags & UT_WORKQ_COOPERATIVE) == 0);
1017 
1018 	if ((uu_flags & UT_WORKQ_OVERCOMMIT) == 0) {
1019 		wq->wq_constrained_threads_scheduled++;
1020 	}
1021 	wq->wq_threads_scheduled++;
1022 	wq->wq_thidlecount--;
1023 
1024 	if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) {
1025 		uth->uu_workq_flags ^= UT_WORKQ_DYING;
1026 		workq_death_policy_evaluate(wq, 1);
1027 		*needs_wakeup = false;
1028 	} else if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
1029 		*needs_wakeup = false;
1030 	} else {
1031 		*needs_wakeup = true;
1032 	}
1033 	return uth;
1034 }
1035 
1036 /*
1037  * Called by thread_create_workq_waiting() during thread initialization, before
1038  * assert_wait, before the thread has been started.
1039  */
1040 event_t
workq_thread_init_and_wq_lock(task_t task,thread_t th)1041 workq_thread_init_and_wq_lock(task_t task, thread_t th)
1042 {
1043 	struct uthread *uth = get_bsdthread_info(th);
1044 
1045 	uth->uu_workq_flags = UT_WORKQ_NEW;
1046 	uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY);
1047 	uth->uu_workq_thport = MACH_PORT_NULL;
1048 	uth->uu_workq_stackaddr = 0;
1049 	uth->uu_workq_pthread_kill_allowed = 0;
1050 
1051 	thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
1052 	thread_reset_workq_qos(th, THREAD_QOS_LEGACY);
1053 
1054 	workq_lock_spin(proc_get_wqptr_fast(get_bsdtask_info(task)));
1055 	return workq_parked_wait_event(uth);
1056 }
1057 
1058 /**
1059  * Try to add a new workqueue thread.
1060  *
1061  * - called with workq lock held
1062  * - dropped and retaken around thread creation
1063  * - return with workq lock held
1064  */
1065 static kern_return_t
workq_add_new_idle_thread(proc_t p,struct workqueue * wq,thread_continue_t continuation,bool is_permanently_bound,thread_t * new_thread)1066 workq_add_new_idle_thread(
1067 	proc_t             p,
1068 	struct workqueue  *wq,
1069 	thread_continue_t continuation,
1070 	bool              is_permanently_bound,
1071 	thread_t          *new_thread)
1072 {
1073 	mach_vm_offset_t th_stackaddr;
1074 	kern_return_t kret;
1075 	thread_t th;
1076 
1077 	wq->wq_nthreads++;
1078 
1079 	workq_unlock(wq);
1080 
1081 	vm_map_t vmap = get_task_map(proc_task(p));
1082 
1083 	kret = pthread_functions->workq_create_threadstack(p, vmap, &th_stackaddr);
1084 	if (kret != KERN_SUCCESS) {
1085 		WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
1086 		    kret, 1, 0);
1087 		goto out;
1088 	}
1089 
1090 	kret = thread_create_workq_waiting(proc_task(p),
1091 	    continuation,
1092 	    &th,
1093 	    is_permanently_bound);
1094 	if (kret != KERN_SUCCESS) {
1095 		WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
1096 		    kret, 0, 0);
1097 		pthread_functions->workq_destroy_threadstack(p, vmap, th_stackaddr);
1098 		goto out;
1099 	}
1100 
1101 	// thread_create_workq_waiting() will return with the wq lock held
1102 	// on success, because it calls workq_thread_init_and_wq_lock() above
1103 
1104 	struct uthread *uth = get_bsdthread_info(th);
1105 	uth->uu_workq_stackaddr = (user_addr_t)th_stackaddr;
1106 
1107 	wq->wq_creations++;
1108 	if (!is_permanently_bound) {
1109 		wq->wq_thidlecount++;
1110 		TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
1111 	}
1112 
1113 	if (new_thread) {
1114 		*new_thread = th;
1115 	}
1116 
1117 	WQ_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0);
1118 	return kret;
1119 
1120 out:
1121 	workq_lock_spin(wq);
1122 	/*
1123 	 * Do not redrive here if we went under wq_max_threads again,
1124 	 * it is the responsibility of the callers of this function
1125 	 * to do so when it fails.
1126 	 */
1127 	wq->wq_nthreads--;
1128 	return kret;
1129 }
1130 
1131 static inline bool
workq_thread_is_overcommit(struct uthread * uth)1132 workq_thread_is_overcommit(struct uthread *uth)
1133 {
1134 	return (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0;
1135 }
1136 
1137 static inline bool
workq_thread_is_nonovercommit(struct uthread * uth)1138 workq_thread_is_nonovercommit(struct uthread *uth)
1139 {
1140 	return (uth->uu_workq_flags & (UT_WORKQ_OVERCOMMIT |
1141 	       UT_WORKQ_COOPERATIVE)) == 0;
1142 }
1143 
1144 static inline bool
workq_thread_is_cooperative(struct uthread * uth)1145 workq_thread_is_cooperative(struct uthread *uth)
1146 {
1147 	return (uth->uu_workq_flags & UT_WORKQ_COOPERATIVE) != 0;
1148 }
1149 
1150 bool
workq_thread_is_permanently_bound(struct uthread * uth)1151 workq_thread_is_permanently_bound(struct uthread *uth)
1152 {
1153 	return (uth->uu_workq_flags & UT_WORKQ_PERMANENT_BIND) != 0;
1154 }
1155 
1156 static inline void
workq_thread_set_type(struct uthread * uth,uint16_t flags)1157 workq_thread_set_type(struct uthread *uth, uint16_t flags)
1158 {
1159 	uth->uu_workq_flags &= ~(UT_WORKQ_OVERCOMMIT | UT_WORKQ_COOPERATIVE);
1160 	uth->uu_workq_flags |= flags;
1161 }
1162 
1163 
1164 #define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1
1165 
1166 __attribute__((noreturn, noinline))
1167 static void
workq_unpark_for_death_and_unlock(proc_t p,struct workqueue * wq,struct uthread * uth,uint32_t death_flags,uint32_t setup_flags)1168 workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
1169     struct uthread *uth, uint32_t death_flags, uint32_t setup_flags)
1170 {
1171 	thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
1172 	bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW;
1173 
1174 	if (qos > WORKQ_THREAD_QOS_CLEANUP) {
1175 		workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true);
1176 		qos = WORKQ_THREAD_QOS_CLEANUP;
1177 	}
1178 
1179 	workq_thread_reset_cpupercent(NULL, uth);
1180 
1181 	if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) {
1182 		wq->wq_thidlecount--;
1183 		if (first_use) {
1184 			TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
1185 		} else {
1186 			TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
1187 		}
1188 	}
1189 	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
1190 
1191 	workq_unlock(wq);
1192 
1193 	if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
1194 		__assert_only kern_return_t kr;
1195 		kr = thread_set_voucher_name(MACH_PORT_NULL);
1196 		assert(kr == KERN_SUCCESS);
1197 	}
1198 
1199 	uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS;
1200 	thread_t th = get_machthread(uth);
1201 	vm_map_t vmap = get_task_map(proc_task(p));
1202 
1203 	if (!first_use) {
1204 		flags |= WQ_FLAG_THREAD_REUSE;
1205 	}
1206 
1207 	pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
1208 	    uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, flags);
1209 	__builtin_unreachable();
1210 }
1211 
1212 bool
workq_is_current_thread_updating_turnstile(struct workqueue * wq)1213 workq_is_current_thread_updating_turnstile(struct workqueue *wq)
1214 {
1215 	return wq->wq_turnstile_updater == current_thread();
1216 }
1217 
1218 __attribute__((always_inline))
1219 static inline void
1220 workq_perform_turnstile_operation_locked(struct workqueue *wq,
1221     void (^operation)(void))
1222 {
1223 	workq_lock_held(wq);
1224 	wq->wq_turnstile_updater = current_thread();
1225 	operation();
1226 	wq->wq_turnstile_updater = THREAD_NULL;
1227 }
1228 
1229 static void
workq_turnstile_update_inheritor(struct workqueue * wq,turnstile_inheritor_t inheritor,turnstile_update_flags_t flags)1230 workq_turnstile_update_inheritor(struct workqueue *wq,
1231     turnstile_inheritor_t inheritor,
1232     turnstile_update_flags_t flags)
1233 {
1234 	if (wq->wq_inheritor == inheritor) {
1235 		return;
1236 	}
1237 	wq->wq_inheritor = inheritor;
1238 	workq_perform_turnstile_operation_locked(wq, ^{
1239 		turnstile_update_inheritor(wq->wq_turnstile, inheritor,
1240 		flags | TURNSTILE_IMMEDIATE_UPDATE);
1241 		turnstile_update_inheritor_complete(wq->wq_turnstile,
1242 		TURNSTILE_INTERLOCK_HELD);
1243 	});
1244 }
1245 
1246 static void
workq_push_idle_thread(proc_t p,struct workqueue * wq,struct uthread * uth,uint32_t setup_flags)1247 workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth,
1248     uint32_t setup_flags)
1249 {
1250 	uint64_t now = mach_absolute_time();
1251 	bool is_creator = (uth == wq->wq_creator);
1252 
1253 	if (workq_thread_is_cooperative(uth)) {
1254 		assert(!is_creator);
1255 
1256 		thread_qos_t thread_qos = uth->uu_workq_pri.qos_req;
1257 		_wq_cooperative_queue_scheduled_count_dec(wq, thread_qos);
1258 
1259 		/* Before we get here, we always go through
1260 		 * workq_select_threadreq_or_park_and_unlock. If we got here, it means
1261 		 * that we went through the logic in workq_threadreq_select which
1262 		 * did the refresh for the next best cooperative qos while
1263 		 * excluding the current thread - we shouldn't need to do it again.
1264 		 */
1265 		assert(_wq_cooperative_queue_refresh_best_req_qos(wq) == false);
1266 	} else if (workq_thread_is_nonovercommit(uth)) {
1267 		assert(!is_creator);
1268 
1269 		wq->wq_constrained_threads_scheduled--;
1270 	}
1271 
1272 	uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT | UT_WORKQ_COOPERATIVE);
1273 	TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
1274 	wq->wq_threads_scheduled--;
1275 
1276 	if (is_creator) {
1277 		wq->wq_creator = NULL;
1278 		WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0,
1279 		    uth->uu_save.uus_workq_park_data.yields);
1280 	}
1281 
1282 	if (wq->wq_inheritor == get_machthread(uth)) {
1283 		assert(wq->wq_creator == NULL);
1284 		if (wq->wq_reqcount) {
1285 			workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
1286 		} else {
1287 			workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
1288 		}
1289 	}
1290 
1291 	if (uth->uu_workq_flags & UT_WORKQ_NEW) {
1292 		assert(is_creator || (_wq_flags(wq) & WQ_EXITING));
1293 		TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
1294 		wq->wq_thidlecount++;
1295 		return;
1296 	}
1297 
1298 	if (!is_creator) {
1299 		_wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket);
1300 		wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--;
1301 		uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP;
1302 	}
1303 
1304 	uth->uu_save.uus_workq_park_data.idle_stamp = now;
1305 
1306 	struct uthread *oldest = workq_oldest_killable_idle_thread(wq);
1307 	uint16_t cur_idle = wq->wq_thidlecount;
1308 
1309 	if (cur_idle >= wq_max_constrained_threads ||
1310 	    (wq->wq_thdying_count == 0 && oldest &&
1311 	    workq_should_kill_idle_thread(wq, oldest, now))) {
1312 		/*
1313 		 * Immediately kill threads if we have too may of them.
1314 		 *
1315 		 * And swap "place" with the oldest one we'd have woken up.
1316 		 * This is a relatively desperate situation where we really
1317 		 * need to kill threads quickly and it's best to kill
1318 		 * the one that's currently on core than context switching.
1319 		 */
1320 		if (oldest) {
1321 			oldest->uu_save.uus_workq_park_data.idle_stamp = now;
1322 			TAILQ_REMOVE(&wq->wq_thidlelist, oldest, uu_workq_entry);
1323 			TAILQ_INSERT_HEAD(&wq->wq_thidlelist, oldest, uu_workq_entry);
1324 		}
1325 
1326 		WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
1327 		    wq, cur_idle, 0, 0);
1328 		wq->wq_thdying_count++;
1329 		uth->uu_workq_flags |= UT_WORKQ_DYING;
1330 		uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
1331 		workq_unpark_for_death_and_unlock(p, wq, uth, 0, setup_flags);
1332 		__builtin_unreachable();
1333 	}
1334 
1335 	struct uthread *tail = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
1336 
1337 	cur_idle += 1;
1338 	wq->wq_thidlecount = cur_idle;
1339 
1340 	if (cur_idle >= wq_death_max_load && tail &&
1341 	    tail->uu_save.uus_workq_park_data.has_stack) {
1342 		uth->uu_save.uus_workq_park_data.has_stack = false;
1343 		TAILQ_INSERT_TAIL(&wq->wq_thidlelist, uth, uu_workq_entry);
1344 	} else {
1345 		uth->uu_save.uus_workq_park_data.has_stack = true;
1346 		TAILQ_INSERT_HEAD(&wq->wq_thidlelist, uth, uu_workq_entry);
1347 	}
1348 
1349 	if (!tail) {
1350 		uint64_t delay = workq_kill_delay_for_idle_thread(wq);
1351 		workq_death_call_schedule(wq, now + delay);
1352 	}
1353 }
1354 
1355 #pragma mark thread requests
1356 
1357 static inline bool
workq_tr_is_overcommit(workq_tr_flags_t tr_flags)1358 workq_tr_is_overcommit(workq_tr_flags_t tr_flags)
1359 {
1360 	return (tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) != 0;
1361 }
1362 
1363 static inline bool
workq_tr_is_nonovercommit(workq_tr_flags_t tr_flags)1364 workq_tr_is_nonovercommit(workq_tr_flags_t tr_flags)
1365 {
1366 	return (tr_flags & (WORKQ_TR_FLAG_OVERCOMMIT |
1367 	       WORKQ_TR_FLAG_COOPERATIVE |
1368 	       WORKQ_TR_FLAG_PERMANENT_BIND)) == 0;
1369 }
1370 
1371 static inline bool
workq_tr_is_cooperative(workq_tr_flags_t tr_flags)1372 workq_tr_is_cooperative(workq_tr_flags_t tr_flags)
1373 {
1374 	return (tr_flags & WORKQ_TR_FLAG_COOPERATIVE) != 0;
1375 }
1376 
1377 #define workq_threadreq_is_overcommit(req) workq_tr_is_overcommit((req)->tr_flags)
1378 #define workq_threadreq_is_nonovercommit(req) workq_tr_is_nonovercommit((req)->tr_flags)
1379 #define workq_threadreq_is_cooperative(req) workq_tr_is_cooperative((req)->tr_flags)
1380 
1381 static inline int
workq_priority_for_req(workq_threadreq_t req)1382 workq_priority_for_req(workq_threadreq_t req)
1383 {
1384 	thread_qos_t qos = req->tr_qos;
1385 
1386 	if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1387 		workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
1388 		assert(trp.trp_flags & TRP_PRIORITY);
1389 		return trp.trp_pri;
1390 	}
1391 	return thread_workq_pri_for_qos(qos);
1392 }
1393 
1394 static inline struct priority_queue_sched_max *
workq_priority_queue_for_req(struct workqueue * wq,workq_threadreq_t req)1395 workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
1396 {
1397 	assert(!workq_tr_is_cooperative(req->tr_flags));
1398 
1399 	if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
1400 		return &wq->wq_special_queue;
1401 	} else if (workq_tr_is_overcommit(req->tr_flags)) {
1402 		return &wq->wq_overcommit_queue;
1403 	} else {
1404 		return &wq->wq_constrained_queue;
1405 	}
1406 }
1407 
1408 /* Calculates the number of threads scheduled >= the input QoS */
1409 static uint64_t
workq_num_cooperative_threads_scheduled_to_qos_internal(struct workqueue * wq,thread_qos_t qos)1410 workq_num_cooperative_threads_scheduled_to_qos_internal(struct workqueue *wq, thread_qos_t qos)
1411 {
1412 	uint64_t num_cooperative_threads = 0;
1413 
1414 	for (thread_qos_t cur_qos = WORKQ_THREAD_QOS_MAX; cur_qos >= qos; cur_qos--) {
1415 		uint8_t bucket = _wq_bucket(cur_qos);
1416 		num_cooperative_threads += wq->wq_cooperative_queue_scheduled_count[bucket];
1417 	}
1418 
1419 	return num_cooperative_threads;
1420 }
1421 
1422 /* Calculates the number of threads scheduled >= the input QoS */
1423 static uint64_t
workq_num_cooperative_threads_scheduled_to_qos_locked(struct workqueue * wq,thread_qos_t qos)1424 workq_num_cooperative_threads_scheduled_to_qos_locked(struct workqueue *wq, thread_qos_t qos)
1425 {
1426 	workq_lock_held(wq);
1427 	return workq_num_cooperative_threads_scheduled_to_qos_internal(wq, qos);
1428 }
1429 
1430 static uint64_t
workq_num_cooperative_threads_scheduled_total(struct workqueue * wq)1431 workq_num_cooperative_threads_scheduled_total(struct workqueue *wq)
1432 {
1433 	return workq_num_cooperative_threads_scheduled_to_qos_locked(wq, WORKQ_THREAD_QOS_MIN);
1434 }
1435 
1436 static bool
workq_has_cooperative_thread_requests(struct workqueue * wq)1437 workq_has_cooperative_thread_requests(struct workqueue *wq)
1438 {
1439 	for (thread_qos_t qos = WORKQ_THREAD_QOS_MAX; qos >= WORKQ_THREAD_QOS_MIN; qos--) {
1440 		uint8_t bucket = _wq_bucket(qos);
1441 		if (!STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket])) {
1442 			return true;
1443 		}
1444 	}
1445 
1446 	return false;
1447 }
1448 
1449 /*
1450  * Determines the next QoS bucket we should service next in the cooperative
1451  * pool. This function will always return a QoS for cooperative pool as long as
1452  * there are requests to be serviced.
1453  *
1454  * Unlike the other thread pools, for the cooperative thread pool the schedule
1455  * counts for the various buckets in the pool affect the next best request for
1456  * it.
1457  *
1458  * This function is called in the following contexts:
1459  *
1460  * a) When determining the best thread QoS for cooperative bucket for the
1461  * creator/thread reuse
1462  *
1463  * b) Once (a) has happened and thread has bound to a thread request, figuring
1464  * out whether the next best request for this pool has changed so that creator
1465  * can be scheduled.
1466  *
1467  * Returns true if the cooperative queue's best qos changed from previous
1468  * value.
1469  */
1470 static bool
_wq_cooperative_queue_refresh_best_req_qos(struct workqueue * wq)1471 _wq_cooperative_queue_refresh_best_req_qos(struct workqueue *wq)
1472 {
1473 	workq_lock_held(wq);
1474 
1475 	thread_qos_t old_best_req_qos = wq->wq_cooperative_queue_best_req_qos;
1476 
1477 	/* We determine the next best cooperative thread request based on the
1478 	 * following:
1479 	 *
1480 	 * 1. Take the MAX of the following:
1481 	 *		a) Highest qos with pending TRs such that number of scheduled
1482 	 *		threads so far with >= qos is < wq_max_cooperative_threads
1483 	 *		b) Highest qos bucket with pending TRs but no scheduled threads for that bucket
1484 	 *
1485 	 * 2. If the result of (1) is UN, then we pick the highest priority amongst
1486 	 * pending thread requests in the pool.
1487 	 *
1488 	 */
1489 	thread_qos_t highest_qos_with_no_scheduled = THREAD_QOS_UNSPECIFIED;
1490 	thread_qos_t highest_qos_req_with_width = THREAD_QOS_UNSPECIFIED;
1491 
1492 	thread_qos_t highest_qos_req = THREAD_QOS_UNSPECIFIED;
1493 
1494 	int scheduled_count_till_qos = 0;
1495 
1496 	for (thread_qos_t qos = WORKQ_THREAD_QOS_MAX; qos >= WORKQ_THREAD_QOS_MIN; qos--) {
1497 		uint8_t bucket = _wq_bucket(qos);
1498 		uint8_t scheduled_count_for_bucket = wq->wq_cooperative_queue_scheduled_count[bucket];
1499 		scheduled_count_till_qos += scheduled_count_for_bucket;
1500 
1501 		if (!STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket])) {
1502 			if (qos > highest_qos_req) {
1503 				highest_qos_req = qos;
1504 			}
1505 			/*
1506 			 * The pool isn't saturated for threads at and above this QoS, and
1507 			 * this qos bucket has pending requests
1508 			 */
1509 			if (scheduled_count_till_qos < wq_cooperative_queue_max_size(wq)) {
1510 				if (qos > highest_qos_req_with_width) {
1511 					highest_qos_req_with_width = qos;
1512 				}
1513 			}
1514 
1515 			/*
1516 			 * There are no threads scheduled for this bucket but there
1517 			 * is work pending, give it at least 1 thread
1518 			 */
1519 			if (scheduled_count_for_bucket == 0) {
1520 				if (qos > highest_qos_with_no_scheduled) {
1521 					highest_qos_with_no_scheduled = qos;
1522 				}
1523 			}
1524 		}
1525 	}
1526 
1527 	wq->wq_cooperative_queue_best_req_qos = MAX(highest_qos_with_no_scheduled, highest_qos_req_with_width);
1528 	if (wq->wq_cooperative_queue_best_req_qos == THREAD_QOS_UNSPECIFIED) {
1529 		wq->wq_cooperative_queue_best_req_qos = highest_qos_req;
1530 	}
1531 
1532 #if MACH_ASSERT
1533 	/* Assert that if we are showing up the next best req as UN, then there
1534 	 * actually is no thread request in the cooperative pool buckets */
1535 	if (wq->wq_cooperative_queue_best_req_qos == THREAD_QOS_UNSPECIFIED) {
1536 		assert(!workq_has_cooperative_thread_requests(wq));
1537 	}
1538 #endif
1539 
1540 	return old_best_req_qos != wq->wq_cooperative_queue_best_req_qos;
1541 }
1542 
1543 /*
1544  * Returns whether or not the input thread (or creator thread if uth is NULL)
1545  * should be allowed to work as part of the cooperative pool for the <input qos>
1546  * bucket.
1547  *
1548  * This function is called in a bunch of places:
1549  *		a) Quantum expires for a thread and it is part of the cooperative pool
1550  *		b) When trying to pick a thread request for the creator thread to
1551  *		represent.
1552  *		c) When a thread is trying to pick a thread request to actually bind to
1553  *		and service.
1554  *
1555  * Called with workq lock held.
1556  */
1557 
1558 #define WQ_COOPERATIVE_POOL_UNSATURATED 1
1559 #define WQ_COOPERATIVE_BUCKET_UNSERVICED 2
1560 #define WQ_COOPERATIVE_POOL_SATURATED_UP_TO_QOS 3
1561 
1562 static bool
workq_cooperative_allowance(struct workqueue * wq,thread_qos_t qos,struct uthread * uth,bool may_start_timer)1563 workq_cooperative_allowance(struct workqueue *wq, thread_qos_t qos, struct uthread *uth,
1564     bool may_start_timer)
1565 {
1566 	workq_lock_held(wq);
1567 
1568 	bool exclude_thread_as_scheduled = false;
1569 	bool passed_admissions = false;
1570 	uint8_t bucket = _wq_bucket(qos);
1571 
1572 	if (uth && workq_thread_is_cooperative(uth)) {
1573 		exclude_thread_as_scheduled = true;
1574 		_wq_cooperative_queue_scheduled_count_dec(wq, uth->uu_workq_pri.qos_req);
1575 	}
1576 
1577 	/*
1578 	 * We have not saturated the pool yet, let this thread continue
1579 	 */
1580 	uint64_t total_cooperative_threads;
1581 	total_cooperative_threads = workq_num_cooperative_threads_scheduled_total(wq);
1582 	if (total_cooperative_threads < wq_cooperative_queue_max_size(wq)) {
1583 		passed_admissions = true;
1584 		WQ_TRACE(TRACE_wq_cooperative_admission | DBG_FUNC_NONE,
1585 		    total_cooperative_threads, qos, passed_admissions,
1586 		    WQ_COOPERATIVE_POOL_UNSATURATED);
1587 		goto out;
1588 	}
1589 
1590 	/*
1591 	 * Without this thread, nothing is servicing the bucket which has pending
1592 	 * work
1593 	 */
1594 	uint64_t bucket_scheduled = wq->wq_cooperative_queue_scheduled_count[bucket];
1595 	if (bucket_scheduled == 0 &&
1596 	    !STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket])) {
1597 		passed_admissions = true;
1598 		WQ_TRACE(TRACE_wq_cooperative_admission | DBG_FUNC_NONE,
1599 		    total_cooperative_threads, qos, passed_admissions,
1600 		    WQ_COOPERATIVE_BUCKET_UNSERVICED);
1601 		goto out;
1602 	}
1603 
1604 	/*
1605 	 * If number of threads at the QoS bucket >= input QoS exceeds the max we want
1606 	 * for the pool, deny this thread
1607 	 */
1608 	uint64_t aggregate_down_to_qos = workq_num_cooperative_threads_scheduled_to_qos_locked(wq, qos);
1609 	passed_admissions = (aggregate_down_to_qos < wq_cooperative_queue_max_size(wq));
1610 	WQ_TRACE(TRACE_wq_cooperative_admission | DBG_FUNC_NONE, aggregate_down_to_qos,
1611 	    qos, passed_admissions, WQ_COOPERATIVE_POOL_SATURATED_UP_TO_QOS);
1612 
1613 	if (!passed_admissions && may_start_timer) {
1614 		workq_schedule_delayed_thread_creation(wq, 0);
1615 	}
1616 
1617 out:
1618 	if (exclude_thread_as_scheduled) {
1619 		_wq_cooperative_queue_scheduled_count_inc(wq, uth->uu_workq_pri.qos_req);
1620 	}
1621 	return passed_admissions;
1622 }
1623 
1624 /*
1625  * returns true if the best request for the pool changed as a result of
1626  * enqueuing this thread request.
1627  */
1628 static bool
workq_threadreq_enqueue(struct workqueue * wq,workq_threadreq_t req)1629 workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
1630 {
1631 	assert(req->tr_state == WORKQ_TR_STATE_NEW);
1632 
1633 	req->tr_state = WORKQ_TR_STATE_QUEUED;
1634 	wq->wq_reqcount += req->tr_count;
1635 
1636 	if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1637 		assert(wq->wq_event_manager_threadreq == NULL);
1638 		assert(req->tr_flags & WORKQ_TR_FLAG_KEVENT);
1639 		assert(req->tr_count == 1);
1640 		wq->wq_event_manager_threadreq = req;
1641 		return true;
1642 	}
1643 
1644 	if (workq_threadreq_is_cooperative(req)) {
1645 		assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
1646 		assert(req->tr_qos != WORKQ_THREAD_QOS_ABOVEUI);
1647 
1648 		struct workq_threadreq_tailq *bucket = &wq->wq_cooperative_queue[_wq_bucket(req->tr_qos)];
1649 		STAILQ_INSERT_TAIL(bucket, req, tr_link);
1650 
1651 		return _wq_cooperative_queue_refresh_best_req_qos(wq);
1652 	}
1653 
1654 	struct priority_queue_sched_max *q = workq_priority_queue_for_req(wq, req);
1655 
1656 	priority_queue_entry_set_sched_pri(q, &req->tr_entry,
1657 	    workq_priority_for_req(req), false);
1658 
1659 	if (priority_queue_insert(q, &req->tr_entry)) {
1660 		if (workq_threadreq_is_nonovercommit(req)) {
1661 			_wq_thactive_refresh_best_constrained_req_qos(wq);
1662 		}
1663 		return true;
1664 	}
1665 	return false;
1666 }
1667 
1668 /*
1669  * returns true if one of the following is true (so as to update creator if
1670  * needed):
1671  *
1672  * (a) the next highest request of the pool we dequeued the request from changed
1673  * (b) the next highest requests of the pool the current thread used to be a
1674  * part of, changed
1675  *
1676  * For overcommit, special and constrained pools, the next highest QoS for each
1677  * pool just a MAX of pending requests so tracking (a) is sufficient.
1678  *
1679  * But for cooperative thread pool, the next highest QoS for the pool depends on
1680  * schedule counts in the pool as well. So if the current thread used to be
1681  * cooperative in it's previous logical run ie (b), then that can also affect
1682  * cooperative pool's next best QoS requests.
1683  */
1684 static bool
workq_threadreq_dequeue(struct workqueue * wq,workq_threadreq_t req,bool cooperative_sched_count_changed)1685 workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req,
1686     bool cooperative_sched_count_changed)
1687 {
1688 	wq->wq_reqcount--;
1689 
1690 	bool next_highest_request_changed = false;
1691 
1692 	if (--req->tr_count == 0) {
1693 		if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
1694 			assert(wq->wq_event_manager_threadreq == req);
1695 			assert(req->tr_count == 0);
1696 			wq->wq_event_manager_threadreq = NULL;
1697 
1698 			/* If a cooperative thread was the one which picked up the manager
1699 			 * thread request, we need to reevaluate the cooperative pool
1700 			 * anyways.
1701 			 */
1702 			if (cooperative_sched_count_changed) {
1703 				_wq_cooperative_queue_refresh_best_req_qos(wq);
1704 			}
1705 			return true;
1706 		}
1707 
1708 		if (workq_threadreq_is_cooperative(req)) {
1709 			assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
1710 			assert(req->tr_qos != WORKQ_THREAD_QOS_ABOVEUI);
1711 			/* Account for the fact that BG and MT are coalesced when
1712 			 * calculating best request for cooperative pool
1713 			 */
1714 			assert(_wq_bucket(req->tr_qos) == _wq_bucket(wq->wq_cooperative_queue_best_req_qos));
1715 
1716 			struct workq_threadreq_tailq *bucket = &wq->wq_cooperative_queue[_wq_bucket(req->tr_qos)];
1717 			__assert_only workq_threadreq_t head = STAILQ_FIRST(bucket);
1718 
1719 			assert(head == req);
1720 			STAILQ_REMOVE_HEAD(bucket, tr_link);
1721 
1722 			/*
1723 			 * If the request we're dequeueing is cooperative, then the sched
1724 			 * counts definitely changed.
1725 			 */
1726 			assert(cooperative_sched_count_changed);
1727 		}
1728 
1729 		/*
1730 		 * We want to do the cooperative pool refresh after dequeueing a
1731 		 * cooperative thread request if any (to combine both effects into 1
1732 		 * refresh operation)
1733 		 */
1734 		if (cooperative_sched_count_changed) {
1735 			next_highest_request_changed = _wq_cooperative_queue_refresh_best_req_qos(wq);
1736 		}
1737 
1738 		if (!workq_threadreq_is_cooperative(req)) {
1739 			/*
1740 			 * All other types of requests are enqueued in priority queues
1741 			 */
1742 
1743 			if (priority_queue_remove(workq_priority_queue_for_req(wq, req),
1744 			    &req->tr_entry)) {
1745 				next_highest_request_changed |= true;
1746 				if (workq_threadreq_is_nonovercommit(req)) {
1747 					_wq_thactive_refresh_best_constrained_req_qos(wq);
1748 				}
1749 			}
1750 		}
1751 	}
1752 
1753 	return next_highest_request_changed;
1754 }
1755 
1756 static void
workq_threadreq_destroy(proc_t p,workq_threadreq_t req)1757 workq_threadreq_destroy(proc_t p, workq_threadreq_t req)
1758 {
1759 	req->tr_state = WORKQ_TR_STATE_CANCELED;
1760 	if (req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT)) {
1761 		kqueue_threadreq_cancel(p, req);
1762 	} else {
1763 		zfree(workq_zone_threadreq, req);
1764 	}
1765 }
1766 
1767 #pragma mark workqueue thread creation thread calls
1768 
1769 static inline bool
workq_thread_call_prepost(struct workqueue * wq,uint32_t sched,uint32_t pend,uint32_t fail_mask)1770 workq_thread_call_prepost(struct workqueue *wq, uint32_t sched, uint32_t pend,
1771     uint32_t fail_mask)
1772 {
1773 	uint32_t old_flags, new_flags;
1774 
1775 	os_atomic_rmw_loop(&wq->wq_flags, old_flags, new_flags, acquire, {
1776 		if (__improbable(old_flags & (WQ_EXITING | sched | pend | fail_mask))) {
1777 		        os_atomic_rmw_loop_give_up(return false);
1778 		}
1779 		if (__improbable(old_flags & WQ_PROC_SUSPENDED)) {
1780 		        new_flags = old_flags | pend;
1781 		} else {
1782 		        new_flags = old_flags | sched;
1783 		}
1784 	});
1785 
1786 	return (old_flags & WQ_PROC_SUSPENDED) == 0;
1787 }
1788 
1789 #define WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART 0x1
1790 
1791 static bool
workq_schedule_delayed_thread_creation(struct workqueue * wq,int flags)1792 workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags)
1793 {
1794 	assert(!preemption_enabled());
1795 
1796 	if (!workq_thread_call_prepost(wq, WQ_DELAYED_CALL_SCHEDULED,
1797 	    WQ_DELAYED_CALL_PENDED, WQ_IMMEDIATE_CALL_PENDED |
1798 	    WQ_IMMEDIATE_CALL_SCHEDULED)) {
1799 		return false;
1800 	}
1801 
1802 	uint64_t now = mach_absolute_time();
1803 
1804 	if (flags & WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART) {
1805 		/* do not change the window */
1806 	} else if (now - wq->wq_thread_call_last_run <= wq->wq_timer_interval) {
1807 		wq->wq_timer_interval *= 2;
1808 		if (wq->wq_timer_interval > wq_max_timer_interval.abstime) {
1809 			wq->wq_timer_interval = (uint32_t)wq_max_timer_interval.abstime;
1810 		}
1811 	} else if (now - wq->wq_thread_call_last_run > 2 * wq->wq_timer_interval) {
1812 		wq->wq_timer_interval /= 2;
1813 		if (wq->wq_timer_interval < wq_stalled_window.abstime) {
1814 			wq->wq_timer_interval = (uint32_t)wq_stalled_window.abstime;
1815 		}
1816 	}
1817 
1818 	WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1819 	    _wq_flags(wq), wq->wq_timer_interval);
1820 
1821 	thread_call_t call = wq->wq_delayed_call;
1822 	uintptr_t arg = WQ_DELAYED_CALL_SCHEDULED;
1823 	uint64_t deadline = now + wq->wq_timer_interval;
1824 	if (thread_call_enter1_delayed(call, (void *)arg, deadline)) {
1825 		panic("delayed_call was already enqueued");
1826 	}
1827 	return true;
1828 }
1829 
1830 static void
workq_schedule_immediate_thread_creation(struct workqueue * wq)1831 workq_schedule_immediate_thread_creation(struct workqueue *wq)
1832 {
1833 	assert(!preemption_enabled());
1834 
1835 	if (workq_thread_call_prepost(wq, WQ_IMMEDIATE_CALL_SCHEDULED,
1836 	    WQ_IMMEDIATE_CALL_PENDED, 0)) {
1837 		WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
1838 		    _wq_flags(wq), 0);
1839 
1840 		uintptr_t arg = WQ_IMMEDIATE_CALL_SCHEDULED;
1841 		if (thread_call_enter1(wq->wq_immediate_call, (void *)arg)) {
1842 			panic("immediate_call was already enqueued");
1843 		}
1844 	}
1845 }
1846 
1847 void
workq_proc_suspended(struct proc * p)1848 workq_proc_suspended(struct proc *p)
1849 {
1850 	struct workqueue *wq = proc_get_wqptr(p);
1851 
1852 	if (wq) {
1853 		os_atomic_or(&wq->wq_flags, WQ_PROC_SUSPENDED, relaxed);
1854 	}
1855 }
1856 
1857 void
workq_proc_resumed(struct proc * p)1858 workq_proc_resumed(struct proc *p)
1859 {
1860 	struct workqueue *wq = proc_get_wqptr(p);
1861 	uint32_t wq_flags;
1862 
1863 	if (!wq) {
1864 		return;
1865 	}
1866 
1867 	wq_flags = os_atomic_andnot_orig(&wq->wq_flags, WQ_PROC_SUSPENDED |
1868 	    WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED, relaxed);
1869 	if ((wq_flags & WQ_EXITING) == 0) {
1870 		disable_preemption();
1871 		if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) {
1872 			workq_schedule_immediate_thread_creation(wq);
1873 		} else if (wq_flags & WQ_DELAYED_CALL_PENDED) {
1874 			workq_schedule_delayed_thread_creation(wq,
1875 			    WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART);
1876 		}
1877 		enable_preemption();
1878 	}
1879 }
1880 
1881 /**
1882  * returns whether lastblocked_tsp is within wq_stalled_window usecs of now
1883  */
1884 static bool
workq_thread_is_busy(uint64_t now,_Atomic uint64_t * lastblocked_tsp)1885 workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp)
1886 {
1887 	uint64_t lastblocked_ts = os_atomic_load_wide(lastblocked_tsp, relaxed);
1888 	if (now <= lastblocked_ts) {
1889 		/*
1890 		 * Because the update of the timestamp when a thread blocks
1891 		 * isn't serialized against us looking at it (i.e. we don't hold
1892 		 * the workq lock), it's possible to have a timestamp that matches
1893 		 * the current time or that even looks to be in the future relative
1894 		 * to when we grabbed the current time...
1895 		 *
1896 		 * Just treat this as a busy thread since it must have just blocked.
1897 		 */
1898 		return true;
1899 	}
1900 	return (now - lastblocked_ts) < wq_stalled_window.abstime;
1901 }
1902 
1903 static void
workq_add_new_threads_call(void * _p,void * flags)1904 workq_add_new_threads_call(void *_p, void *flags)
1905 {
1906 	proc_t p = _p;
1907 	struct workqueue *wq = proc_get_wqptr(p);
1908 	uint32_t my_flag = (uint32_t)(uintptr_t)flags;
1909 
1910 	/*
1911 	 * workq_exit() will set the workqueue to NULL before
1912 	 * it cancels thread calls.
1913 	 */
1914 	if (!wq) {
1915 		return;
1916 	}
1917 
1918 	assert((my_flag == WQ_DELAYED_CALL_SCHEDULED) ||
1919 	    (my_flag == WQ_IMMEDIATE_CALL_SCHEDULED));
1920 
1921 	WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, _wq_flags(wq),
1922 	    wq->wq_nthreads, wq->wq_thidlecount);
1923 
1924 	workq_lock_spin(wq);
1925 
1926 	wq->wq_thread_call_last_run = mach_absolute_time();
1927 	os_atomic_andnot(&wq->wq_flags, my_flag, release);
1928 
1929 	/* This can drop the workqueue lock, and take it again */
1930 	workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
1931 
1932 	workq_unlock(wq);
1933 
1934 	WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0,
1935 	    wq->wq_nthreads, wq->wq_thidlecount);
1936 }
1937 
1938 #pragma mark thread state tracking
1939 
1940 static void
workq_sched_callback(int type,thread_t thread)1941 workq_sched_callback(int type, thread_t thread)
1942 {
1943 	thread_ro_t tro = get_thread_ro(thread);
1944 	struct uthread *uth = get_bsdthread_info(thread);
1945 	struct workqueue *wq = proc_get_wqptr(tro->tro_proc);
1946 	thread_qos_t req_qos, qos = uth->uu_workq_pri.qos_bucket;
1947 	wq_thactive_t old_thactive;
1948 	bool start_timer = false;
1949 
1950 	if (qos == WORKQ_THREAD_QOS_MANAGER) {
1951 		return;
1952 	}
1953 
1954 	switch (type) {
1955 	case SCHED_CALL_BLOCK:
1956 		old_thactive = _wq_thactive_dec(wq, qos);
1957 		req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
1958 
1959 		/*
1960 		 * Remember the timestamp of the last thread that blocked in this
1961 		 * bucket, it used used by admission checks to ignore one thread
1962 		 * being inactive if this timestamp is recent enough.
1963 		 *
1964 		 * If we collide with another thread trying to update the
1965 		 * last_blocked (really unlikely since another thread would have to
1966 		 * get scheduled and then block after we start down this path), it's
1967 		 * not a problem.  Either timestamp is adequate, so no need to retry
1968 		 */
1969 		os_atomic_store_wide(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
1970 		    thread_last_run_time(thread), relaxed);
1971 
1972 		if (req_qos == THREAD_QOS_UNSPECIFIED) {
1973 			/*
1974 			 * No pending request at the moment we could unblock, move on.
1975 			 */
1976 		} else if (qos < req_qos) {
1977 			/*
1978 			 * The blocking thread is at a lower QoS than the highest currently
1979 			 * pending constrained request, nothing has to be redriven
1980 			 */
1981 		} else {
1982 			uint32_t max_busycount, old_req_count;
1983 			old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
1984 			    req_qos, NULL, &max_busycount);
1985 			/*
1986 			 * If it is possible that may_start_constrained_thread had refused
1987 			 * admission due to being over the max concurrency, we may need to
1988 			 * spin up a new thread.
1989 			 *
1990 			 * We take into account the maximum number of busy threads
1991 			 * that can affect may_start_constrained_thread as looking at the
1992 			 * actual number may_start_constrained_thread will see is racy.
1993 			 *
1994 			 * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
1995 			 * between NCPU (4) and NCPU - 2 (2) we need to redrive.
1996 			 */
1997 			uint32_t conc = wq_max_parallelism[_wq_bucket(qos)];
1998 			if (old_req_count <= conc && conc <= old_req_count + max_busycount) {
1999 				start_timer = workq_schedule_delayed_thread_creation(wq, 0);
2000 			}
2001 		}
2002 		if (__improbable(kdebug_enable)) {
2003 			__unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
2004 			    old_thactive, qos, NULL, NULL);
2005 			WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
2006 			    old - 1, qos | (req_qos << 8),
2007 			    wq->wq_reqcount << 1 | start_timer);
2008 		}
2009 		break;
2010 
2011 	case SCHED_CALL_UNBLOCK:
2012 		/*
2013 		 * we cannot take the workqueue_lock here...
2014 		 * an UNBLOCK can occur from a timer event which
2015 		 * is run from an interrupt context... if the workqueue_lock
2016 		 * is already held by this processor, we'll deadlock...
2017 		 * the thread lock for the thread being UNBLOCKED
2018 		 * is also held
2019 		 */
2020 		old_thactive = _wq_thactive_inc(wq, qos);
2021 		if (__improbable(kdebug_enable)) {
2022 			__unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
2023 			    old_thactive, qos, NULL, NULL);
2024 			req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
2025 			WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
2026 			    old + 1, qos | (req_qos << 8),
2027 			    wq->wq_threads_scheduled);
2028 		}
2029 		break;
2030 	}
2031 }
2032 
2033 #pragma mark workq lifecycle
2034 
2035 void
workq_reference(struct workqueue * wq)2036 workq_reference(struct workqueue *wq)
2037 {
2038 	os_ref_retain(&wq->wq_refcnt);
2039 }
2040 
2041 static void
workq_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)2042 workq_deallocate_queue_invoke(mpsc_queue_chain_t e,
2043     __assert_only mpsc_daemon_queue_t dq)
2044 {
2045 	struct workqueue *wq;
2046 	struct turnstile *ts;
2047 
2048 	wq = mpsc_queue_element(e, struct workqueue, wq_destroy_link);
2049 	assert(dq == &workq_deallocate_queue);
2050 
2051 	turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts, TURNSTILE_WORKQS);
2052 	assert(ts);
2053 	turnstile_cleanup();
2054 	turnstile_deallocate(ts);
2055 
2056 	lck_ticket_destroy(&wq->wq_lock, &workq_lck_grp);
2057 	zfree(workq_zone_workqueue, wq);
2058 }
2059 
2060 static void
workq_deallocate(struct workqueue * wq)2061 workq_deallocate(struct workqueue *wq)
2062 {
2063 	if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) {
2064 		workq_deallocate_queue_invoke(&wq->wq_destroy_link,
2065 		    &workq_deallocate_queue);
2066 	}
2067 }
2068 
2069 void
workq_deallocate_safe(struct workqueue * wq)2070 workq_deallocate_safe(struct workqueue *wq)
2071 {
2072 	if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) {
2073 		mpsc_daemon_enqueue(&workq_deallocate_queue, &wq->wq_destroy_link,
2074 		    MPSC_QUEUE_DISABLE_PREEMPTION);
2075 	}
2076 }
2077 
2078 /**
2079  * Setup per-process state for the workqueue.
2080  */
2081 int
workq_open(struct proc * p,__unused struct workq_open_args * uap,__unused int32_t * retval)2082 workq_open(struct proc *p, __unused struct workq_open_args *uap,
2083     __unused int32_t *retval)
2084 {
2085 	struct workqueue *wq;
2086 	int error = 0;
2087 
2088 	if ((p->p_lflag & P_LREGISTER) == 0) {
2089 		return EINVAL;
2090 	}
2091 
2092 	if (wq_init_constrained_limit) {
2093 		uint32_t limit, num_cpus = ml_wait_max_cpus();
2094 
2095 		/*
2096 		 * set up the limit for the constrained pool
2097 		 * this is a virtual pool in that we don't
2098 		 * maintain it on a separate idle and run list
2099 		 */
2100 		limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
2101 
2102 		if (limit > wq_max_constrained_threads) {
2103 			wq_max_constrained_threads = limit;
2104 		}
2105 
2106 		if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
2107 			wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
2108 		}
2109 		if (wq_max_threads > CONFIG_THREAD_MAX - 20) {
2110 			wq_max_threads = CONFIG_THREAD_MAX - 20;
2111 		}
2112 
2113 		wq_death_max_load = (uint16_t)fls(num_cpus) + 1;
2114 
2115 		for (thread_qos_t qos = WORKQ_THREAD_QOS_MIN; qos <= WORKQ_THREAD_QOS_MAX; qos++) {
2116 			wq_max_parallelism[_wq_bucket(qos)] =
2117 			    qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL);
2118 		}
2119 
2120 		wq_max_cooperative_threads = num_cpus;
2121 
2122 		wq_init_constrained_limit = 0;
2123 	}
2124 
2125 	if (proc_get_wqptr(p) == NULL) {
2126 		if (proc_init_wqptr_or_wait(p) == FALSE) {
2127 			assert(proc_get_wqptr(p) != NULL);
2128 			goto out;
2129 		}
2130 
2131 		wq = zalloc_flags(workq_zone_workqueue, Z_WAITOK | Z_ZERO);
2132 
2133 		os_ref_init_count(&wq->wq_refcnt, &workq_refgrp, 1);
2134 
2135 		// Start the event manager at the priority hinted at by the policy engine
2136 		thread_qos_t mgr_priority_hint = task_get_default_manager_qos(current_task());
2137 		pthread_priority_t pp = _pthread_priority_make_from_thread_qos(mgr_priority_hint, 0, 0);
2138 		wq->wq_event_manager_priority = (uint32_t)pp;
2139 		wq->wq_timer_interval = (uint32_t)wq_stalled_window.abstime;
2140 		wq->wq_proc = p;
2141 		turnstile_prepare((uintptr_t)wq, &wq->wq_turnstile, turnstile_alloc(),
2142 		    TURNSTILE_WORKQS);
2143 
2144 		TAILQ_INIT(&wq->wq_thrunlist);
2145 		TAILQ_INIT(&wq->wq_thnewlist);
2146 		TAILQ_INIT(&wq->wq_thidlelist);
2147 		priority_queue_init(&wq->wq_overcommit_queue);
2148 		priority_queue_init(&wq->wq_constrained_queue);
2149 		priority_queue_init(&wq->wq_special_queue);
2150 		for (int bucket = 0; bucket < WORKQ_NUM_QOS_BUCKETS; bucket++) {
2151 			STAILQ_INIT(&wq->wq_cooperative_queue[bucket]);
2152 		}
2153 
2154 		/* We are only using the delayed thread call for the constrained pool
2155 		 * which can't have work at >= UI QoS and so we can be fine with a
2156 		 * UI QoS thread call.
2157 		 */
2158 		wq->wq_delayed_call = thread_call_allocate_with_qos(
2159 			workq_add_new_threads_call, p, THREAD_QOS_USER_INTERACTIVE,
2160 			THREAD_CALL_OPTIONS_ONCE);
2161 		wq->wq_immediate_call = thread_call_allocate_with_options(
2162 			workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
2163 			THREAD_CALL_OPTIONS_ONCE);
2164 		wq->wq_death_call = thread_call_allocate_with_options(
2165 			workq_kill_old_threads_call, wq,
2166 			THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
2167 
2168 		lck_ticket_init(&wq->wq_lock, &workq_lck_grp);
2169 
2170 		WQ_TRACE_WQ(TRACE_wq_create | DBG_FUNC_NONE, wq,
2171 		    VM_KERNEL_ADDRHIDE(wq), 0, 0);
2172 		proc_set_wqptr(p, wq);
2173 	}
2174 out:
2175 
2176 	return error;
2177 }
2178 
2179 /*
2180  * Routine:	workq_mark_exiting
2181  *
2182  * Function:	Mark the work queue such that new threads will not be added to the
2183  *		work queue after we return.
2184  *
2185  * Conditions:	Called against the current process.
2186  */
2187 void
workq_mark_exiting(struct proc * p)2188 workq_mark_exiting(struct proc *p)
2189 {
2190 	struct workqueue *wq = proc_get_wqptr(p);
2191 	uint32_t wq_flags;
2192 	workq_threadreq_t mgr_req;
2193 
2194 	if (!wq) {
2195 		return;
2196 	}
2197 
2198 	WQ_TRACE_WQ(TRACE_wq_pthread_exit | DBG_FUNC_START, wq, 0, 0, 0);
2199 
2200 	workq_lock_spin(wq);
2201 
2202 	wq_flags = os_atomic_or_orig(&wq->wq_flags, WQ_EXITING, relaxed);
2203 	if (__improbable(wq_flags & WQ_EXITING)) {
2204 		panic("workq_mark_exiting called twice");
2205 	}
2206 
2207 	/*
2208 	 * Opportunistically try to cancel thread calls that are likely in flight.
2209 	 * workq_exit() will do the proper cleanup.
2210 	 */
2211 	if (wq_flags & WQ_IMMEDIATE_CALL_SCHEDULED) {
2212 		thread_call_cancel(wq->wq_immediate_call);
2213 	}
2214 	if (wq_flags & WQ_DELAYED_CALL_SCHEDULED) {
2215 		thread_call_cancel(wq->wq_delayed_call);
2216 	}
2217 	if (wq_flags & WQ_DEATH_CALL_SCHEDULED) {
2218 		thread_call_cancel(wq->wq_death_call);
2219 	}
2220 
2221 	mgr_req = wq->wq_event_manager_threadreq;
2222 	wq->wq_event_manager_threadreq = NULL;
2223 	wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */
2224 	wq->wq_creator = NULL;
2225 	workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
2226 
2227 	workq_unlock(wq);
2228 
2229 	if (mgr_req) {
2230 		kqueue_threadreq_cancel(p, mgr_req);
2231 	}
2232 	/*
2233 	 * No one touches the priority queues once WQ_EXITING is set.
2234 	 * It is hence safe to do the tear down without holding any lock.
2235 	 */
2236 	priority_queue_destroy(&wq->wq_overcommit_queue,
2237 	    struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
2238 		workq_threadreq_destroy(p, e);
2239 	});
2240 	priority_queue_destroy(&wq->wq_constrained_queue,
2241 	    struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
2242 		workq_threadreq_destroy(p, e);
2243 	});
2244 	priority_queue_destroy(&wq->wq_special_queue,
2245 	    struct workq_threadreq_s, tr_entry, ^(workq_threadreq_t e){
2246 		workq_threadreq_destroy(p, e);
2247 	});
2248 
2249 	WQ_TRACE(TRACE_wq_pthread_exit | DBG_FUNC_END, 0, 0, 0, 0);
2250 }
2251 
2252 /*
2253  * Routine:	workq_exit
2254  *
2255  * Function:	clean up the work queue structure(s) now that there are no threads
2256  *		left running inside the work queue (except possibly current_thread).
2257  *
2258  * Conditions:	Called by the last thread in the process.
2259  *		Called against current process.
2260  */
2261 void
workq_exit(struct proc * p)2262 workq_exit(struct proc *p)
2263 {
2264 	struct workqueue *wq;
2265 	struct uthread *uth, *tmp;
2266 
2267 	wq = os_atomic_xchg(&p->p_wqptr, NULL, relaxed);
2268 	if (wq != NULL) {
2269 		thread_t th = current_thread();
2270 
2271 		WQ_TRACE_WQ(TRACE_wq_workqueue_exit | DBG_FUNC_START, wq, 0, 0, 0);
2272 
2273 		if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) {
2274 			/*
2275 			 * <rdar://problem/40111515> Make sure we will no longer call the
2276 			 * sched call, if we ever block this thread, which the cancel_wait
2277 			 * below can do.
2278 			 */
2279 			thread_sched_call(th, NULL);
2280 		}
2281 
2282 		/*
2283 		 * Thread calls are always scheduled by the proc itself or under the
2284 		 * workqueue spinlock if WQ_EXITING is not yet set.
2285 		 *
2286 		 * Either way, when this runs, the proc has no threads left beside
2287 		 * the one running this very code, so we know no thread call can be
2288 		 * dispatched anymore.
2289 		 */
2290 		thread_call_cancel_wait(wq->wq_delayed_call);
2291 		thread_call_cancel_wait(wq->wq_immediate_call);
2292 		thread_call_cancel_wait(wq->wq_death_call);
2293 		thread_call_free(wq->wq_delayed_call);
2294 		thread_call_free(wq->wq_immediate_call);
2295 		thread_call_free(wq->wq_death_call);
2296 
2297 		/*
2298 		 * Clean up workqueue data structures for threads that exited and
2299 		 * didn't get a chance to clean up after themselves.
2300 		 *
2301 		 * idle/new threads should have been interrupted and died on their own
2302 		 */
2303 		TAILQ_FOREACH_SAFE(uth, &wq->wq_thrunlist, uu_workq_entry, tmp) {
2304 			thread_t mth = get_machthread(uth);
2305 			thread_sched_call(mth, NULL);
2306 			thread_deallocate(mth);
2307 		}
2308 		assert(TAILQ_EMPTY(&wq->wq_thnewlist));
2309 		assert(TAILQ_EMPTY(&wq->wq_thidlelist));
2310 
2311 		WQ_TRACE_WQ(TRACE_wq_destroy | DBG_FUNC_END, wq,
2312 		    VM_KERNEL_ADDRHIDE(wq), 0, 0);
2313 
2314 		workq_deallocate(wq);
2315 
2316 		WQ_TRACE(TRACE_wq_workqueue_exit | DBG_FUNC_END, 0, 0, 0, 0);
2317 	}
2318 }
2319 
2320 
2321 #pragma mark bsd thread control
2322 
2323 bool
bsdthread_part_of_cooperative_workqueue(struct uthread * uth)2324 bsdthread_part_of_cooperative_workqueue(struct uthread *uth)
2325 {
2326 	return (workq_thread_is_cooperative(uth) || workq_thread_is_nonovercommit(uth)) &&
2327 	       (uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER) &&
2328 	       (!workq_thread_is_permanently_bound(uth));
2329 }
2330 
2331 static bool
_pthread_priority_to_policy(pthread_priority_t priority,thread_qos_policy_data_t * data)2332 _pthread_priority_to_policy(pthread_priority_t priority,
2333     thread_qos_policy_data_t *data)
2334 {
2335 	data->qos_tier = _pthread_priority_thread_qos(priority);
2336 	data->tier_importance = _pthread_priority_relpri(priority);
2337 	if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
2338 	    data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
2339 		return false;
2340 	}
2341 	return true;
2342 }
2343 
2344 static int
bsdthread_set_self(proc_t p,thread_t th,pthread_priority_t priority,mach_port_name_t voucher,enum workq_set_self_flags flags)2345 bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority,
2346     mach_port_name_t voucher, enum workq_set_self_flags flags)
2347 {
2348 	struct uthread *uth = get_bsdthread_info(th);
2349 	struct workqueue *wq = proc_get_wqptr(p);
2350 
2351 	kern_return_t kr;
2352 	int unbind_rv = 0, qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
2353 	bool is_wq_thread = (thread_get_tag(th) & THREAD_TAG_WORKQUEUE);
2354 
2355 	assert(th == current_thread());
2356 	if (flags & WORKQ_SET_SELF_WQ_KEVENT_UNBIND) {
2357 		if (!is_wq_thread) {
2358 			unbind_rv = EINVAL;
2359 			goto qos;
2360 		}
2361 
2362 		if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
2363 			unbind_rv = EINVAL;
2364 			goto qos;
2365 		}
2366 
2367 		workq_threadreq_t kqr = uth->uu_kqr_bound;
2368 		if (kqr == NULL) {
2369 			unbind_rv = EALREADY;
2370 			goto qos;
2371 		}
2372 
2373 		if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
2374 			unbind_rv = EINVAL;
2375 			goto qos;
2376 		}
2377 
2378 		kqueue_threadreq_unbind(p, kqr);
2379 	}
2380 
2381 qos:
2382 	if (flags & (WORKQ_SET_SELF_QOS_FLAG | WORKQ_SET_SELF_QOS_OVERRIDE_FLAG)) {
2383 		assert(flags & WORKQ_SET_SELF_QOS_FLAG);
2384 
2385 		thread_qos_policy_data_t new_policy;
2386 		thread_qos_t qos_override = THREAD_QOS_UNSPECIFIED;
2387 
2388 		if (!_pthread_priority_to_policy(priority, &new_policy)) {
2389 			qos_rv = EINVAL;
2390 			goto voucher;
2391 		}
2392 
2393 		if (flags & WORKQ_SET_SELF_QOS_OVERRIDE_FLAG) {
2394 			/*
2395 			 * If the WORKQ_SET_SELF_QOS_OVERRIDE_FLAG is set, we definitely
2396 			 * should have an override QoS in the pthread_priority_t and we should
2397 			 * only come into this path for cooperative thread requests
2398 			 */
2399 			if (!_pthread_priority_has_override_qos(priority) ||
2400 			    !_pthread_priority_is_cooperative(priority)) {
2401 				qos_rv = EINVAL;
2402 				goto voucher;
2403 			}
2404 			qos_override = _pthread_priority_thread_override_qos(priority);
2405 		} else {
2406 			/*
2407 			 * If the WORKQ_SET_SELF_QOS_OVERRIDE_FLAG is not set, we definitely
2408 			 * should not have an override QoS in the pthread_priority_t
2409 			 */
2410 			if (_pthread_priority_has_override_qos(priority)) {
2411 				qos_rv = EINVAL;
2412 				goto voucher;
2413 			}
2414 		}
2415 
2416 		if (!is_wq_thread) {
2417 			/*
2418 			 * Threads opted out of QoS can't change QoS
2419 			 */
2420 			if (!thread_has_qos_policy(th)) {
2421 				qos_rv = EPERM;
2422 				goto voucher;
2423 			}
2424 		} else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER ||
2425 		    uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_ABOVEUI) {
2426 			/*
2427 			 * Workqueue manager threads or threads above UI can't change QoS
2428 			 */
2429 			qos_rv = EINVAL;
2430 			goto voucher;
2431 		} else {
2432 			/*
2433 			 * For workqueue threads, possibly adjust buckets and redrive thread
2434 			 * requests.
2435 			 *
2436 			 * Transitions allowed:
2437 			 *
2438 			 * overcommit --> non-overcommit
2439 			 * overcommit --> overcommit
2440 			 * non-overcommit --> non-overcommit
2441 			 * non-overcommit --> overcommit (to be deprecated later)
2442 			 * cooperative --> cooperative
2443 			 *
2444 			 * All other transitions aren't allowed so reject them.
2445 			 */
2446 			if (workq_thread_is_overcommit(uth) && _pthread_priority_is_cooperative(priority)) {
2447 				qos_rv = EINVAL;
2448 				goto voucher;
2449 			} else if (workq_thread_is_cooperative(uth) && !_pthread_priority_is_cooperative(priority)) {
2450 				qos_rv = EINVAL;
2451 				goto voucher;
2452 			} else if (workq_thread_is_nonovercommit(uth) && _pthread_priority_is_cooperative(priority)) {
2453 				qos_rv = EINVAL;
2454 				goto voucher;
2455 			}
2456 
2457 			struct uu_workq_policy old_pri, new_pri;
2458 			bool force_run = false;
2459 
2460 			if (qos_override) {
2461 				/*
2462 				 * We're in the case of a thread clarifying that it is for eg. not IN
2463 				 * req QoS but rather, UT req QoS with IN override. However, this can
2464 				 * race with a concurrent override happening to the thread via
2465 				 * workq_thread_add_dispatch_override so this needs to be
2466 				 * synchronized with the thread mutex.
2467 				 */
2468 				thread_mtx_lock(th);
2469 			}
2470 
2471 			workq_lock_spin(wq);
2472 
2473 			old_pri = new_pri = uth->uu_workq_pri;
2474 			new_pri.qos_req = (thread_qos_t)new_policy.qos_tier;
2475 
2476 			if (old_pri.qos_override < qos_override) {
2477 				/*
2478 				 * Since this can race with a concurrent override via
2479 				 * workq_thread_add_dispatch_override, only adjust override value if we
2480 				 * are higher - this is a saturating function.
2481 				 *
2482 				 * We should not be changing the final override values, we should simply
2483 				 * be redistributing the current value with a different breakdown of req
2484 				 * vs override QoS - assert to that effect. Therefore, buckets should
2485 				 * not change.
2486 				 */
2487 				new_pri.qos_override = qos_override;
2488 				assert(workq_pri_override(new_pri) == workq_pri_override(old_pri));
2489 				assert(workq_pri_bucket(new_pri) == workq_pri_bucket(old_pri));
2490 			}
2491 
2492 			/* Adjust schedule counts for various types of transitions */
2493 
2494 			/* overcommit -> non-overcommit */
2495 			if (workq_thread_is_overcommit(uth) && _pthread_priority_is_nonovercommit(priority)) {
2496 				workq_thread_set_type(uth, 0);
2497 				wq->wq_constrained_threads_scheduled++;
2498 
2499 				/* non-overcommit -> overcommit */
2500 			} else if (workq_thread_is_nonovercommit(uth) && _pthread_priority_is_overcommit(priority)) {
2501 				workq_thread_set_type(uth, UT_WORKQ_OVERCOMMIT);
2502 				force_run = (wq->wq_constrained_threads_scheduled-- == wq_max_constrained_threads);
2503 
2504 				/* cooperative -> cooperative */
2505 			} else if (workq_thread_is_cooperative(uth)) {
2506 				_wq_cooperative_queue_scheduled_count_dec(wq, old_pri.qos_req);
2507 				_wq_cooperative_queue_scheduled_count_inc(wq, new_pri.qos_req);
2508 
2509 				/* We're changing schedule counts within cooperative pool, we
2510 				 * need to refresh best cooperative QoS logic again */
2511 				force_run = _wq_cooperative_queue_refresh_best_req_qos(wq);
2512 			}
2513 
2514 			/*
2515 			 * This will set up an override on the thread if any and will also call
2516 			 * schedule_creator if needed
2517 			 */
2518 			workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, force_run);
2519 			workq_unlock(wq);
2520 
2521 			if (qos_override) {
2522 				thread_mtx_unlock(th);
2523 			}
2524 
2525 			if (workq_thread_is_overcommit(uth)) {
2526 				thread_disarm_workqueue_quantum(th);
2527 			} else {
2528 				/* If the thread changed QoS buckets, the quantum duration
2529 				 * may have changed too */
2530 				thread_arm_workqueue_quantum(th);
2531 			}
2532 		}
2533 
2534 		kr = thread_policy_set_internal(th, THREAD_QOS_POLICY,
2535 		    (thread_policy_t)&new_policy, THREAD_QOS_POLICY_COUNT);
2536 		if (kr != KERN_SUCCESS) {
2537 			qos_rv = EINVAL;
2538 		}
2539 	}
2540 
2541 voucher:
2542 	if (flags & WORKQ_SET_SELF_VOUCHER_FLAG) {
2543 		kr = thread_set_voucher_name(voucher);
2544 		if (kr != KERN_SUCCESS) {
2545 			voucher_rv = ENOENT;
2546 			goto fixedpri;
2547 		}
2548 	}
2549 
2550 fixedpri:
2551 	if (qos_rv) {
2552 		goto done;
2553 	}
2554 	if (flags & WORKQ_SET_SELF_FIXEDPRIORITY_FLAG) {
2555 		thread_extended_policy_data_t extpol = {.timeshare = 0};
2556 
2557 		if (is_wq_thread) {
2558 			/* Not allowed on workqueue threads */
2559 			fixedpri_rv = ENOTSUP;
2560 			goto done;
2561 		}
2562 
2563 		kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
2564 		    (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
2565 		if (kr != KERN_SUCCESS) {
2566 			fixedpri_rv = EINVAL;
2567 			goto done;
2568 		}
2569 	} else if (flags & WORKQ_SET_SELF_TIMESHARE_FLAG) {
2570 		thread_extended_policy_data_t extpol = {.timeshare = 1};
2571 
2572 		if (is_wq_thread) {
2573 			/* Not allowed on workqueue threads */
2574 			fixedpri_rv = ENOTSUP;
2575 			goto done;
2576 		}
2577 
2578 		kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
2579 		    (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
2580 		if (kr != KERN_SUCCESS) {
2581 			fixedpri_rv = EINVAL;
2582 			goto done;
2583 		}
2584 	}
2585 
2586 done:
2587 	if (qos_rv && voucher_rv) {
2588 		/* Both failed, give that a unique error. */
2589 		return EBADMSG;
2590 	}
2591 
2592 	if (unbind_rv) {
2593 		return unbind_rv;
2594 	}
2595 
2596 	if (qos_rv) {
2597 		return qos_rv;
2598 	}
2599 
2600 	if (voucher_rv) {
2601 		return voucher_rv;
2602 	}
2603 
2604 	if (fixedpri_rv) {
2605 		return fixedpri_rv;
2606 	}
2607 
2608 
2609 	return 0;
2610 }
2611 
2612 static int
bsdthread_add_explicit_override(proc_t p,mach_port_name_t kport,pthread_priority_t pp,user_addr_t resource)2613 bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport,
2614     pthread_priority_t pp, user_addr_t resource)
2615 {
2616 	thread_qos_t qos = _pthread_priority_thread_qos(pp);
2617 	if (qos == THREAD_QOS_UNSPECIFIED) {
2618 		return EINVAL;
2619 	}
2620 
2621 	thread_t th = port_name_to_thread(kport,
2622 	    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2623 	if (th == THREAD_NULL) {
2624 		return ESRCH;
2625 	}
2626 
2627 	int rv = proc_thread_qos_add_override(proc_task(p), th, 0, qos, TRUE,
2628 	    resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
2629 
2630 	thread_deallocate(th);
2631 	return rv;
2632 }
2633 
2634 static int
bsdthread_remove_explicit_override(proc_t p,mach_port_name_t kport,user_addr_t resource)2635 bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport,
2636     user_addr_t resource)
2637 {
2638 	thread_t th = port_name_to_thread(kport,
2639 	    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2640 	if (th == THREAD_NULL) {
2641 		return ESRCH;
2642 	}
2643 
2644 	int rv = proc_thread_qos_remove_override(proc_task(p), th, 0, resource,
2645 	    THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
2646 
2647 	thread_deallocate(th);
2648 	return rv;
2649 }
2650 
2651 static int
workq_thread_add_dispatch_override(proc_t p,mach_port_name_t kport,pthread_priority_t pp,user_addr_t ulock_addr)2652 workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
2653     pthread_priority_t pp, user_addr_t ulock_addr)
2654 {
2655 	struct uu_workq_policy old_pri, new_pri;
2656 	struct workqueue *wq = proc_get_wqptr(p);
2657 
2658 	thread_qos_t qos_override = _pthread_priority_thread_qos(pp);
2659 	if (qos_override == THREAD_QOS_UNSPECIFIED) {
2660 		return EINVAL;
2661 	}
2662 
2663 	thread_t thread = port_name_to_thread(kport,
2664 	    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2665 	if (thread == THREAD_NULL) {
2666 		return ESRCH;
2667 	}
2668 
2669 	struct uthread *uth = get_bsdthread_info(thread);
2670 	if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
2671 		thread_deallocate(thread);
2672 		return EPERM;
2673 	}
2674 
2675 	WQ_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE,
2676 	    wq, thread_tid(thread), 1, pp);
2677 
2678 	thread_mtx_lock(thread);
2679 
2680 	if (ulock_addr) {
2681 		uint32_t val;
2682 		int rc;
2683 		/*
2684 		 * Workaround lack of explicit support for 'no-fault copyin'
2685 		 * <rdar://problem/24999882>, as disabling preemption prevents paging in
2686 		 */
2687 		disable_preemption();
2688 		rc = copyin_atomic32(ulock_addr, &val);
2689 		enable_preemption();
2690 		if (rc == 0 && ulock_owner_value_to_port_name(val) != kport) {
2691 			goto out;
2692 		}
2693 	}
2694 
2695 	workq_lock_spin(wq);
2696 
2697 	old_pri = uth->uu_workq_pri;
2698 	if (old_pri.qos_override >= qos_override) {
2699 		/* Nothing to do */
2700 	} else if (thread == current_thread()) {
2701 		new_pri = old_pri;
2702 		new_pri.qos_override = qos_override;
2703 		workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
2704 	} else {
2705 		uth->uu_workq_pri.qos_override = qos_override;
2706 		if (qos_override > workq_pri_override(old_pri)) {
2707 			thread_set_workq_override(thread, qos_override);
2708 		}
2709 	}
2710 
2711 	workq_unlock(wq);
2712 
2713 out:
2714 	thread_mtx_unlock(thread);
2715 	thread_deallocate(thread);
2716 	return 0;
2717 }
2718 
2719 static int
workq_thread_reset_dispatch_override(proc_t p,thread_t thread)2720 workq_thread_reset_dispatch_override(proc_t p, thread_t thread)
2721 {
2722 	struct uu_workq_policy old_pri, new_pri;
2723 	struct workqueue *wq = proc_get_wqptr(p);
2724 	struct uthread *uth = get_bsdthread_info(thread);
2725 
2726 	if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
2727 		return EPERM;
2728 	}
2729 
2730 	WQ_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, wq, 0, 0, 0);
2731 
2732 	/*
2733 	 * workq_thread_add_dispatch_override takes the thread mutex before doing the
2734 	 * copyin to validate the drainer and apply the override. We need to do the
2735 	 * same here. See rdar://84472518
2736 	 */
2737 	thread_mtx_lock(thread);
2738 
2739 	workq_lock_spin(wq);
2740 	old_pri = new_pri = uth->uu_workq_pri;
2741 	new_pri.qos_override = THREAD_QOS_UNSPECIFIED;
2742 	workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
2743 	workq_unlock(wq);
2744 
2745 	thread_mtx_unlock(thread);
2746 	return 0;
2747 }
2748 
2749 static int
workq_thread_allow_kill(__unused proc_t p,thread_t thread,bool enable)2750 workq_thread_allow_kill(__unused proc_t p, thread_t thread, bool enable)
2751 {
2752 	if (!(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE)) {
2753 		// If the thread isn't a workqueue thread, don't set the
2754 		// kill_allowed bit; however, we still need to return 0
2755 		// instead of an error code since this code is executed
2756 		// on the abort path which needs to not depend on the
2757 		// pthread_t (returning an error depends on pthread_t via
2758 		// cerror_nocancel)
2759 		return 0;
2760 	}
2761 	struct uthread *uth = get_bsdthread_info(thread);
2762 	uth->uu_workq_pthread_kill_allowed = enable;
2763 	return 0;
2764 }
2765 
2766 static int
workq_allow_sigmask(proc_t p,sigset_t mask)2767 workq_allow_sigmask(proc_t p, sigset_t mask)
2768 {
2769 	if (mask & workq_threadmask) {
2770 		return EINVAL;
2771 	}
2772 
2773 	proc_lock(p);
2774 	p->p_workq_allow_sigmask |= mask;
2775 	proc_unlock(p);
2776 
2777 	return 0;
2778 }
2779 
2780 static int
bsdthread_get_max_parallelism(thread_qos_t qos,unsigned long flags,int * retval)2781 bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags,
2782     int *retval)
2783 {
2784 	static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
2785 	    _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
2786 	static_assert(QOS_PARALLELISM_REALTIME ==
2787 	    _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
2788 	static_assert(QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE ==
2789 	    _PTHREAD_QOS_PARALLELISM_CLUSTER_SHARED_RSRC, "cluster shared resource");
2790 
2791 	if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL | QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE)) {
2792 		return EINVAL;
2793 	}
2794 
2795 	/* No units are present */
2796 	if (flags & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) {
2797 		return ENOTSUP;
2798 	}
2799 
2800 	if (flags & QOS_PARALLELISM_REALTIME) {
2801 		if (qos) {
2802 			return EINVAL;
2803 		}
2804 	} else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
2805 		return EINVAL;
2806 	}
2807 
2808 	*retval = qos_max_parallelism(qos, flags);
2809 	return 0;
2810 }
2811 
2812 static int
bsdthread_dispatch_apply_attr(__unused struct proc * p,thread_t thread,unsigned long flags,uint64_t value1,__unused uint64_t value2)2813 bsdthread_dispatch_apply_attr(__unused struct proc *p, thread_t thread,
2814     unsigned long flags, uint64_t value1, __unused uint64_t value2)
2815 {
2816 	uint32_t apply_worker_index;
2817 	kern_return_t kr;
2818 
2819 	switch (flags) {
2820 	case _PTHREAD_DISPATCH_APPLY_ATTR_CLUSTER_SHARED_RSRC_SET:
2821 		apply_worker_index = (uint32_t)value1;
2822 		kr = thread_shared_rsrc_policy_set(thread, apply_worker_index, CLUSTER_SHARED_RSRC_TYPE_RR, SHARED_RSRC_POLICY_AGENT_DISPATCH);
2823 		/*
2824 		 * KERN_INVALID_POLICY indicates that the thread was trying to bind to a
2825 		 * cluster which it was not eligible to execute on.
2826 		 */
2827 		return (kr == KERN_SUCCESS) ? 0 : ((kr == KERN_INVALID_POLICY) ? ENOTSUP : EINVAL);
2828 	case _PTHREAD_DISPATCH_APPLY_ATTR_CLUSTER_SHARED_RSRC_CLEAR:
2829 		kr = thread_shared_rsrc_policy_clear(thread, CLUSTER_SHARED_RSRC_TYPE_RR, SHARED_RSRC_POLICY_AGENT_DISPATCH);
2830 		return (kr == KERN_SUCCESS) ? 0 : EINVAL;
2831 	default:
2832 		return EINVAL;
2833 	}
2834 }
2835 
2836 #define ENSURE_UNUSED(arg) \
2837 	        ({ if ((arg) != 0) { return EINVAL; } })
2838 
2839 int
bsdthread_ctl(struct proc * p,struct bsdthread_ctl_args * uap,int * retval)2840 bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
2841 {
2842 	switch (uap->cmd) {
2843 	case BSDTHREAD_CTL_QOS_OVERRIDE_START:
2844 		return bsdthread_add_explicit_override(p, (mach_port_name_t)uap->arg1,
2845 		           (pthread_priority_t)uap->arg2, uap->arg3);
2846 	case BSDTHREAD_CTL_QOS_OVERRIDE_END:
2847 		ENSURE_UNUSED(uap->arg3);
2848 		return bsdthread_remove_explicit_override(p, (mach_port_name_t)uap->arg1,
2849 		           (user_addr_t)uap->arg2);
2850 
2851 	case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
2852 		return workq_thread_add_dispatch_override(p, (mach_port_name_t)uap->arg1,
2853 		           (pthread_priority_t)uap->arg2, uap->arg3);
2854 	case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
2855 		return workq_thread_reset_dispatch_override(p, current_thread());
2856 
2857 	case BSDTHREAD_CTL_SET_SELF:
2858 		return bsdthread_set_self(p, current_thread(),
2859 		           (pthread_priority_t)uap->arg1, (mach_port_name_t)uap->arg2,
2860 		           (enum workq_set_self_flags)uap->arg3);
2861 
2862 	case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
2863 		ENSURE_UNUSED(uap->arg3);
2864 		return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1,
2865 		           (unsigned long)uap->arg2, retval);
2866 	case BSDTHREAD_CTL_WORKQ_ALLOW_KILL:
2867 		ENSURE_UNUSED(uap->arg2);
2868 		ENSURE_UNUSED(uap->arg3);
2869 		return workq_thread_allow_kill(p, current_thread(), (bool)uap->arg1);
2870 	case BSDTHREAD_CTL_DISPATCH_APPLY_ATTR:
2871 		return bsdthread_dispatch_apply_attr(p, current_thread(),
2872 		           (unsigned long)uap->arg1, (uint64_t)uap->arg2,
2873 		           (uint64_t)uap->arg3);
2874 	case BSDTHREAD_CTL_WORKQ_ALLOW_SIGMASK:
2875 		return workq_allow_sigmask(p, (int)uap->arg1);
2876 	case BSDTHREAD_CTL_SET_QOS:
2877 	case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
2878 	case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
2879 		/* no longer supported */
2880 		return ENOTSUP;
2881 
2882 	default:
2883 		return EINVAL;
2884 	}
2885 }
2886 
2887 #pragma mark workqueue thread manipulation
2888 
2889 static void __dead2
2890 workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
2891     struct uthread *uth, uint32_t setup_flags);
2892 
2893 static void __dead2
2894 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
2895     struct uthread *uth, uint32_t setup_flags);
2896 
2897 static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2;
2898 
2899 #if KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD
2900 static inline uint64_t
workq_trace_req_id(workq_threadreq_t req)2901 workq_trace_req_id(workq_threadreq_t req)
2902 {
2903 	struct kqworkloop *kqwl;
2904 	if (req->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
2905 		kqwl = __container_of(req, struct kqworkloop, kqwl_request);
2906 		return kqwl->kqwl_dynamicid;
2907 	}
2908 
2909 	return VM_KERNEL_ADDRHIDE(req);
2910 }
2911 #endif
2912 
2913 /**
2914  * Entry point for libdispatch to ask for threads
2915  */
2916 static int
workq_reqthreads(struct proc * p,uint32_t reqcount,pthread_priority_t pp,bool cooperative)2917 workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp, bool cooperative)
2918 {
2919 	thread_qos_t qos = _pthread_priority_thread_qos(pp);
2920 	struct workqueue *wq = proc_get_wqptr(p);
2921 	uint32_t unpaced, upcall_flags = WQ_FLAG_THREAD_NEWSPI;
2922 	int ret = 0;
2923 
2924 	if (wq == NULL || reqcount <= 0 || reqcount > UINT16_MAX ||
2925 	    qos == THREAD_QOS_UNSPECIFIED) {
2926 		ret = EINVAL;
2927 		goto exit;
2928 	}
2929 
2930 	WQ_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE,
2931 	    wq, reqcount, pp, cooperative);
2932 
2933 	workq_threadreq_t req = zalloc(workq_zone_threadreq);
2934 	priority_queue_entry_init(&req->tr_entry);
2935 	req->tr_state = WORKQ_TR_STATE_NEW;
2936 	req->tr_qos   = qos;
2937 	workq_tr_flags_t tr_flags = 0;
2938 
2939 	if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
2940 		tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
2941 		upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
2942 	}
2943 
2944 	if (cooperative) {
2945 		tr_flags |= WORKQ_TR_FLAG_COOPERATIVE;
2946 		upcall_flags |= WQ_FLAG_THREAD_COOPERATIVE;
2947 
2948 		if (reqcount > 1) {
2949 			ret = ENOTSUP;
2950 			goto free_and_exit;
2951 		}
2952 	}
2953 
2954 	/* A thread request cannot be both overcommit and cooperative */
2955 	if (workq_tr_is_cooperative(tr_flags) &&
2956 	    workq_tr_is_overcommit(tr_flags)) {
2957 		ret = EINVAL;
2958 		goto free_and_exit;
2959 	}
2960 	req->tr_flags = tr_flags;
2961 
2962 	WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE,
2963 	    wq, workq_trace_req_id(req), req->tr_qos, reqcount);
2964 
2965 	workq_lock_spin(wq);
2966 	do {
2967 		if (_wq_exiting(wq)) {
2968 			goto unlock_and_exit;
2969 		}
2970 
2971 		/*
2972 		 * When userspace is asking for parallelism, wakeup up to (reqcount - 1)
2973 		 * threads without pacing, to inform the scheduler of that workload.
2974 		 *
2975 		 * The last requests, or the ones that failed the admission checks are
2976 		 * enqueued and go through the regular creator codepath.
2977 		 *
2978 		 * If there aren't enough threads, add one, but re-evaluate everything
2979 		 * as conditions may now have changed.
2980 		 */
2981 		unpaced = reqcount - 1;
2982 
2983 		if (reqcount > 1) {
2984 			/* We don't handle asking for parallelism on the cooperative
2985 			 * workqueue just yet */
2986 			assert(!workq_threadreq_is_cooperative(req));
2987 
2988 			if (workq_threadreq_is_nonovercommit(req)) {
2989 				unpaced = workq_constrained_allowance(wq, qos, NULL, false, true);
2990 				if (unpaced >= reqcount - 1) {
2991 					unpaced = reqcount - 1;
2992 				}
2993 			}
2994 		}
2995 
2996 		/*
2997 		 * This path does not currently handle custom workloop parameters
2998 		 * when creating threads for parallelism.
2999 		 */
3000 		assert(!(req->tr_flags & WORKQ_TR_FLAG_WL_PARAMS));
3001 
3002 		/*
3003 		 * This is a trimmed down version of workq_threadreq_bind_and_unlock()
3004 		 */
3005 		while (unpaced > 0 && wq->wq_thidlecount) {
3006 			struct uthread *uth;
3007 			bool needs_wakeup;
3008 			uint8_t uu_flags = UT_WORKQ_EARLY_BOUND;
3009 
3010 			if (workq_tr_is_overcommit(req->tr_flags)) {
3011 				uu_flags |= UT_WORKQ_OVERCOMMIT;
3012 			}
3013 
3014 			uth = workq_pop_idle_thread(wq, uu_flags, &needs_wakeup);
3015 
3016 			_wq_thactive_inc(wq, qos);
3017 			wq->wq_thscheduled_count[_wq_bucket(qos)]++;
3018 			workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
3019 			wq->wq_fulfilled++;
3020 
3021 			uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
3022 			uth->uu_save.uus_workq_park_data.thread_request = req;
3023 			if (needs_wakeup) {
3024 				workq_thread_wakeup(uth);
3025 			}
3026 			unpaced--;
3027 			reqcount--;
3028 		}
3029 	} while (unpaced && wq->wq_nthreads < wq_max_threads &&
3030 	    (workq_add_new_idle_thread(p, wq, workq_unpark_continue,
3031 	    false, NULL) == KERN_SUCCESS));
3032 
3033 	if (_wq_exiting(wq)) {
3034 		goto unlock_and_exit;
3035 	}
3036 
3037 	req->tr_count = (uint16_t)reqcount;
3038 	if (workq_threadreq_enqueue(wq, req)) {
3039 		/* This can drop the workqueue lock, and take it again */
3040 		workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
3041 	}
3042 	workq_unlock(wq);
3043 	return 0;
3044 
3045 unlock_and_exit:
3046 	workq_unlock(wq);
3047 free_and_exit:
3048 	zfree(workq_zone_threadreq, req);
3049 exit:
3050 	return ret;
3051 }
3052 
3053 bool
workq_kern_threadreq_initiate(struct proc * p,workq_threadreq_t req,struct turnstile * workloop_ts,thread_qos_t qos,workq_kern_threadreq_flags_t flags)3054 workq_kern_threadreq_initiate(struct proc *p, workq_threadreq_t req,
3055     struct turnstile *workloop_ts, thread_qos_t qos,
3056     workq_kern_threadreq_flags_t flags)
3057 {
3058 	struct workqueue *wq = proc_get_wqptr_fast(p);
3059 	struct uthread *uth = NULL;
3060 
3061 	assert(req->tr_flags & (WORKQ_TR_FLAG_WORKLOOP | WORKQ_TR_FLAG_KEVENT));
3062 
3063 	/*
3064 	 * For any new initialization changes done to workqueue thread request below,
3065 	 * please also consider if they are relevant to permanently bound thread
3066 	 * request. See workq_kern_threadreq_permanent_bind.
3067 	 */
3068 	if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
3069 		workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
3070 		qos = thread_workq_qos_for_pri(trp.trp_pri);
3071 		if (qos == THREAD_QOS_UNSPECIFIED) {
3072 			qos = WORKQ_THREAD_QOS_ABOVEUI;
3073 		}
3074 	}
3075 
3076 	assert(req->tr_state == WORKQ_TR_STATE_IDLE);
3077 	priority_queue_entry_init(&req->tr_entry);
3078 	req->tr_count = 1;
3079 	req->tr_state = WORKQ_TR_STATE_NEW;
3080 	req->tr_qos   = qos;
3081 
3082 	WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq,
3083 	    workq_trace_req_id(req), qos, 1);
3084 
3085 	if (flags & WORKQ_THREADREQ_ATTEMPT_REBIND) {
3086 		/*
3087 		 * we're called back synchronously from the context of
3088 		 * kqueue_threadreq_unbind from within workq_thread_return()
3089 		 * we can try to match up this thread with this request !
3090 		 */
3091 		uth = current_uthread();
3092 		assert(uth->uu_kqr_bound == NULL);
3093 	}
3094 
3095 	workq_lock_spin(wq);
3096 	if (_wq_exiting(wq)) {
3097 		req->tr_state = WORKQ_TR_STATE_IDLE;
3098 		workq_unlock(wq);
3099 		return false;
3100 	}
3101 
3102 	if (uth && workq_threadreq_admissible(wq, uth, req)) {
3103 		/* This is the case of the rebind - we were about to park and unbind
3104 		 * when more events came so keep the binding.
3105 		 */
3106 		assert(uth != wq->wq_creator);
3107 
3108 		if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
3109 			_wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
3110 			workq_thread_reset_pri(wq, uth, req, /*unpark*/ false);
3111 		}
3112 		/*
3113 		 * We're called from workq_kern_threadreq_initiate()
3114 		 * due to an unbind, with the kq req held.
3115 		 */
3116 		WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
3117 		    workq_trace_req_id(req), req->tr_flags, 0);
3118 		wq->wq_fulfilled++;
3119 
3120 		kqueue_threadreq_bind(p, req, get_machthread(uth), 0);
3121 	} else {
3122 		if (workloop_ts) {
3123 			workq_perform_turnstile_operation_locked(wq, ^{
3124 				turnstile_update_inheritor(workloop_ts, wq->wq_turnstile,
3125 				TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
3126 				turnstile_update_inheritor_complete(workloop_ts,
3127 				TURNSTILE_INTERLOCK_HELD);
3128 			});
3129 		}
3130 
3131 		bool reevaluate_creator_thread_group = false;
3132 #if CONFIG_PREADOPT_TG
3133 		reevaluate_creator_thread_group = (flags & WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG);
3134 #endif
3135 		/* We enqueued the highest priority item or we may need to reevaluate if
3136 		 * the creator needs a thread group pre-adoption */
3137 		if (workq_threadreq_enqueue(wq, req) || reevaluate_creator_thread_group) {
3138 			workq_schedule_creator(p, wq, flags);
3139 		}
3140 	}
3141 
3142 	workq_unlock(wq);
3143 
3144 	return true;
3145 }
3146 
3147 void
workq_kern_threadreq_modify(struct proc * p,workq_threadreq_t req,thread_qos_t qos,workq_kern_threadreq_flags_t flags)3148 workq_kern_threadreq_modify(struct proc *p, workq_threadreq_t req,
3149     thread_qos_t qos, workq_kern_threadreq_flags_t flags)
3150 {
3151 	struct workqueue *wq = proc_get_wqptr_fast(p);
3152 	bool make_overcommit = false;
3153 
3154 	if (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
3155 		/* Requests outside-of-QoS shouldn't accept modify operations */
3156 		return;
3157 	}
3158 
3159 	workq_lock_spin(wq);
3160 
3161 	assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
3162 	assert(req->tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP));
3163 
3164 	if (req->tr_state == WORKQ_TR_STATE_BINDING) {
3165 		kqueue_threadreq_bind(p, req, req->tr_thread, 0);
3166 		workq_unlock(wq);
3167 		return;
3168 	}
3169 
3170 	if (flags & WORKQ_THREADREQ_MAKE_OVERCOMMIT) {
3171 		/* TODO (rokhinip): We come into this code path for kqwl thread
3172 		 * requests. kqwl requests cannot be cooperative.
3173 		 */
3174 		assert(!workq_threadreq_is_cooperative(req));
3175 
3176 		make_overcommit = workq_threadreq_is_nonovercommit(req);
3177 	}
3178 
3179 	if (_wq_exiting(wq) || (req->tr_qos == qos && !make_overcommit)) {
3180 		workq_unlock(wq);
3181 		return;
3182 	}
3183 
3184 	assert(req->tr_count == 1);
3185 	if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
3186 		panic("Invalid thread request (%p) state %d", req, req->tr_state);
3187 	}
3188 
3189 	WQ_TRACE_WQ(TRACE_wq_thread_request_modify | DBG_FUNC_NONE, wq,
3190 	    workq_trace_req_id(req), qos, 0);
3191 
3192 	struct priority_queue_sched_max *pq = workq_priority_queue_for_req(wq, req);
3193 	workq_threadreq_t req_max;
3194 
3195 	/*
3196 	 * Stage 1: Dequeue the request from its priority queue.
3197 	 *
3198 	 * If we dequeue the root item of the constrained priority queue,
3199 	 * maintain the best constrained request qos invariant.
3200 	 */
3201 	if (priority_queue_remove(pq, &req->tr_entry)) {
3202 		if (workq_threadreq_is_nonovercommit(req)) {
3203 			_wq_thactive_refresh_best_constrained_req_qos(wq);
3204 		}
3205 	}
3206 
3207 	/*
3208 	 * Stage 2: Apply changes to the thread request
3209 	 *
3210 	 * If the item will not become the root of the priority queue it belongs to,
3211 	 * then we need to wait in line, just enqueue and return quickly.
3212 	 */
3213 	if (__improbable(make_overcommit)) {
3214 		req->tr_flags ^= WORKQ_TR_FLAG_OVERCOMMIT;
3215 		pq = workq_priority_queue_for_req(wq, req);
3216 	}
3217 	req->tr_qos = qos;
3218 
3219 	req_max = priority_queue_max(pq, struct workq_threadreq_s, tr_entry);
3220 	if (req_max && req_max->tr_qos >= qos) {
3221 		priority_queue_entry_set_sched_pri(pq, &req->tr_entry,
3222 		    workq_priority_for_req(req), false);
3223 		priority_queue_insert(pq, &req->tr_entry);
3224 		workq_unlock(wq);
3225 		return;
3226 	}
3227 
3228 	/*
3229 	 * Stage 3: Reevaluate whether we should run the thread request.
3230 	 *
3231 	 * Pretend the thread request is new again:
3232 	 * - adjust wq_reqcount to not count it anymore.
3233 	 * - make its state WORKQ_TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
3234 	 *   properly attempts a synchronous bind)
3235 	 */
3236 	wq->wq_reqcount--;
3237 	req->tr_state = WORKQ_TR_STATE_NEW;
3238 
3239 	/* We enqueued the highest priority item or we may need to reevaluate if
3240 	 * the creator needs a thread group pre-adoption if the request got a new TG */
3241 	bool reevaluate_creator_tg = false;
3242 
3243 #if CONFIG_PREADOPT_TG
3244 	reevaluate_creator_tg = (flags & WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG);
3245 #endif
3246 
3247 	if (workq_threadreq_enqueue(wq, req) || reevaluate_creator_tg) {
3248 		workq_schedule_creator(p, wq, flags);
3249 	}
3250 	workq_unlock(wq);
3251 }
3252 
3253 void
workq_kern_bound_thread_reset_pri(workq_threadreq_t req,struct uthread * uth)3254 workq_kern_bound_thread_reset_pri(workq_threadreq_t req, struct uthread *uth)
3255 {
3256 	assert(workq_thread_is_permanently_bound(uth));
3257 
3258 	if (req && (req->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS)) {
3259 		/*
3260 		 * For requests outside-of-QoS, we set the scheduling policy and
3261 		 * absolute priority for the bound thread right at the initialization
3262 		 * time. See workq_kern_threadreq_permanent_bind.
3263 		 */
3264 		return;
3265 	}
3266 
3267 	struct workqueue *wq = proc_get_wqptr_fast(current_proc());
3268 	if (req) {
3269 		assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
3270 		workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
3271 	} else {
3272 		thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
3273 		if (qos > WORKQ_THREAD_QOS_CLEANUP) {
3274 			workq_thread_reset_pri(wq, uth, NULL, /*unpark*/ true);
3275 		} else {
3276 			uth->uu_save.uus_workq_park_data.qos = qos;
3277 		}
3278 	}
3279 }
3280 
3281 void
workq_kern_threadreq_lock(struct proc * p)3282 workq_kern_threadreq_lock(struct proc *p)
3283 {
3284 	workq_lock_spin(proc_get_wqptr_fast(p));
3285 }
3286 
3287 void
workq_kern_threadreq_unlock(struct proc * p)3288 workq_kern_threadreq_unlock(struct proc *p)
3289 {
3290 	workq_unlock(proc_get_wqptr_fast(p));
3291 }
3292 
3293 void
workq_kern_threadreq_update_inheritor(struct proc * p,workq_threadreq_t req,thread_t owner,struct turnstile * wl_ts,turnstile_update_flags_t flags)3294 workq_kern_threadreq_update_inheritor(struct proc *p, workq_threadreq_t req,
3295     thread_t owner, struct turnstile *wl_ts,
3296     turnstile_update_flags_t flags)
3297 {
3298 	struct workqueue *wq = proc_get_wqptr_fast(p);
3299 	turnstile_inheritor_t inheritor;
3300 
3301 	assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
3302 	assert(req->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
3303 	workq_lock_held(wq);
3304 
3305 	if (req->tr_state == WORKQ_TR_STATE_BINDING) {
3306 		kqueue_threadreq_bind(p, req, req->tr_thread,
3307 		    KQUEUE_THREADREQ_BIND_NO_INHERITOR_UPDATE);
3308 		return;
3309 	}
3310 
3311 	if (_wq_exiting(wq)) {
3312 		inheritor = TURNSTILE_INHERITOR_NULL;
3313 	} else {
3314 		if (req->tr_state != WORKQ_TR_STATE_QUEUED) {
3315 			panic("Invalid thread request (%p) state %d", req, req->tr_state);
3316 		}
3317 
3318 		if (owner) {
3319 			inheritor = owner;
3320 			flags |= TURNSTILE_INHERITOR_THREAD;
3321 		} else {
3322 			inheritor = wq->wq_turnstile;
3323 			flags |= TURNSTILE_INHERITOR_TURNSTILE;
3324 		}
3325 	}
3326 
3327 	workq_perform_turnstile_operation_locked(wq, ^{
3328 		turnstile_update_inheritor(wl_ts, inheritor, flags);
3329 	});
3330 }
3331 
3332 /*
3333  * An entry point for kevent to request a newly created workqueue thread
3334  * and bind it permanently to the given workqueue thread request.
3335  *
3336  * It currently only supports fixed scheduler priority thread requests.
3337  *
3338  * The newly created thread counts towards wq_nthreads. This function returns
3339  * an error if we are above that limit. There is no concept of delayed thread
3340  * creation for such specially configured kqworkloops.
3341  *
3342  * If successful, the newly created thread will be parked in
3343  * workq_bound_thread_initialize_and_unpark_continue waiting for
3344  * new incoming events.
3345  */
3346 kern_return_t
workq_kern_threadreq_permanent_bind(struct proc * p,struct workq_threadreq_s * kqr)3347 workq_kern_threadreq_permanent_bind(struct proc *p, struct workq_threadreq_s *kqr)
3348 {
3349 	kern_return_t ret = 0;
3350 	thread_t new_thread = NULL;
3351 	struct workqueue *wq = proc_get_wqptr_fast(p);
3352 
3353 	workq_lock_spin(wq);
3354 
3355 	if (wq->wq_nthreads >= wq_max_threads) {
3356 		ret = EDOM;
3357 	} else {
3358 		if (kqr->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
3359 			workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
3360 			/*
3361 			 * For requests outside-of-QoS, we fully initialize the thread
3362 			 * request here followed by preadopting the scheduling properties
3363 			 * on the newly created bound thread.
3364 			 */
3365 			thread_qos_t qos = thread_workq_qos_for_pri(trp.trp_pri);
3366 			if (qos == THREAD_QOS_UNSPECIFIED) {
3367 				qos = WORKQ_THREAD_QOS_ABOVEUI;
3368 			}
3369 			kqr->tr_qos = qos;
3370 		}
3371 		kqr->tr_count = 1;
3372 
3373 		/* workq_lock dropped and retaken around thread creation below. */
3374 		ret = workq_add_new_idle_thread(p, wq,
3375 		    workq_bound_thread_initialize_and_unpark_continue,
3376 		    true, &new_thread);
3377 		if (ret == KERN_SUCCESS) {
3378 			struct uthread *uth = get_bsdthread_info(new_thread);
3379 			if (kqr->tr_flags & WORKQ_TR_FLAG_WL_OUTSIDE_QOS) {
3380 				workq_thread_reset_pri(wq, uth, kqr, /*unpark*/ true);
3381 			}
3382 			/*
3383 			 * The newly created thread goes through a full bind to the kqwl
3384 			 * right upon creation.
3385 			 * It then falls back to soft bind/unbind upon wakeup/park.
3386 			 */
3387 			kqueue_threadreq_bind_prepost(p, kqr, uth);
3388 			uth->uu_workq_flags |= UT_WORKQ_PERMANENT_BIND;
3389 		}
3390 	}
3391 
3392 	workq_unlock(wq);
3393 
3394 	if (ret == KERN_SUCCESS) {
3395 		kqueue_threadreq_bind_commit(p, new_thread);
3396 	}
3397 	return ret;
3398 }
3399 
3400 /*
3401  * Called with kqlock held. It does not need to take the process wide
3402  * global workq lock -> making it faster.
3403  */
3404 void
workq_kern_bound_thread_wakeup(struct workq_threadreq_s * kqr)3405 workq_kern_bound_thread_wakeup(struct workq_threadreq_s *kqr)
3406 {
3407 	struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
3408 	workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
3409 
3410 	/*
3411 	 * See "Locking model for accessing uu_workq_flags" for more information
3412 	 * on how access to uu_workq_flags for the bound thread is synchronized.
3413 	 */
3414 	assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)) == 0);
3415 
3416 	if (trp.trp_flags & TRP_RELEASED) {
3417 		uth->uu_workq_flags |= UT_WORKQ_DYING;
3418 	} else {
3419 		uth->uu_workq_flags |= UT_WORKQ_RUNNING;
3420 	}
3421 
3422 	workq_thread_wakeup(uth);
3423 }
3424 
3425 /*
3426  * Called with kqlock held. Dropped before parking.
3427  * It does not need to take process wide global workqueue
3428  * lock -> making it faster.
3429  */
3430 __attribute__((noreturn, noinline))
3431 void
workq_kern_bound_thread_park(struct workq_threadreq_s * kqr)3432 workq_kern_bound_thread_park(struct workq_threadreq_s *kqr)
3433 {
3434 	struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
3435 	assert(uth == current_uthread());
3436 
3437 	/*
3438 	 * See "Locking model for accessing uu_workq_flags" for more information
3439 	 * on how access to uu_workq_flags for the bound thread is synchronized.
3440 	 */
3441 	uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING);
3442 
3443 	thread_disarm_workqueue_quantum(get_machthread(uth));
3444 
3445 	/*
3446 	 * TODO (pavhad) We could do the reusable userspace stack performance
3447 	 * optimization here.
3448 	 */
3449 
3450 	kqworkloop_bound_thread_park_prepost(kqr);
3451 	/* KQ_SLEEP bit is set and kqlock is dropped. */
3452 
3453 	__assert_only kern_return_t kr;
3454 	kr = thread_set_voucher_name(MACH_PORT_NULL);
3455 	assert(kr == KERN_SUCCESS);
3456 
3457 	kqworkloop_bound_thread_park_commit(kqr,
3458 	    workq_parked_wait_event(uth), workq_bound_thread_unpark_continue);
3459 
3460 	__builtin_unreachable();
3461 }
3462 
3463 /*
3464  * To terminate the permenantly bound workqueue thread. It unbinds itself
3465  * with the kqwl during uthread_cleanup -> kqueue_threadreq_unbind.
3466  * It is also when it will release its reference on the kqwl.
3467  */
3468 __attribute__((noreturn, noinline))
3469 void
workq_kern_bound_thread_terminate(struct workq_threadreq_s * kqr)3470 workq_kern_bound_thread_terminate(struct workq_threadreq_s *kqr)
3471 {
3472 	proc_t p = current_proc();
3473 	struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
3474 	uint16_t uu_workq_flags_orig;
3475 
3476 	assert(uth == current_uthread());
3477 
3478 	/*
3479 	 * See "Locking model for accessing uu_workq_flags" for more information
3480 	 * on how access to uu_workq_flags for the bound thread is synchronized.
3481 	 */
3482 	kqworkloop_bound_thread_terminate(kqr, &uu_workq_flags_orig);
3483 
3484 	if (uu_workq_flags_orig & UT_WORKQ_WORK_INTERVAL_JOINED) {
3485 		__assert_only kern_return_t kr;
3486 		kr = kern_work_interval_join(get_machthread(uth), MACH_PORT_NULL);
3487 		/* The bound thread un-joins the work interval and drops its +1 ref. */
3488 		assert(kr == KERN_SUCCESS);
3489 	}
3490 
3491 	/*
3492 	 * Drop the voucher now that we are on our way to termination.
3493 	 */
3494 	__assert_only kern_return_t kr;
3495 	kr = thread_set_voucher_name(MACH_PORT_NULL);
3496 	assert(kr == KERN_SUCCESS);
3497 
3498 	uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
3499 	upcall_flags |= uth->uu_save.uus_workq_park_data.qos |
3500 	    WQ_FLAG_THREAD_PRIO_QOS;
3501 
3502 	thread_t th = get_machthread(uth);
3503 	vm_map_t vmap = get_task_map(proc_task(p));
3504 
3505 	if ((uu_workq_flags_orig & UT_WORKQ_NEW) == 0) {
3506 		upcall_flags |= WQ_FLAG_THREAD_REUSE;
3507 	}
3508 
3509 	pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
3510 	    uth->uu_workq_thport, 0, WQ_SETUP_EXIT_THREAD, upcall_flags);
3511 	__builtin_unreachable();
3512 }
3513 
3514 void
workq_kern_threadreq_redrive(struct proc * p,workq_kern_threadreq_flags_t flags)3515 workq_kern_threadreq_redrive(struct proc *p, workq_kern_threadreq_flags_t flags)
3516 {
3517 	struct workqueue *wq = proc_get_wqptr_fast(p);
3518 
3519 	workq_lock_spin(wq);
3520 	workq_schedule_creator(p, wq, flags);
3521 	workq_unlock(wq);
3522 }
3523 
3524 /*
3525  * Always called at AST by the thread on itself
3526  *
3527  * Upon quantum expiry, the workqueue subsystem evaluates its state and decides
3528  * on what the thread should do next. The TSD value is always set by the thread
3529  * on itself in the kernel and cleared either by userspace when it acks the TSD
3530  * value and takes action, or by the thread in the kernel when the quantum
3531  * expires again.
3532  */
3533 void
workq_kern_quantum_expiry_reevaluate(proc_t proc,thread_t thread)3534 workq_kern_quantum_expiry_reevaluate(proc_t proc, thread_t thread)
3535 {
3536 	struct uthread *uth = get_bsdthread_info(thread);
3537 
3538 	if (uth->uu_workq_flags & UT_WORKQ_DYING) {
3539 		return;
3540 	}
3541 
3542 	if (!thread_supports_cooperative_workqueue(thread)) {
3543 		panic("Quantum expired for thread that doesn't support cooperative workqueue");
3544 	}
3545 
3546 	thread_qos_t qos = uth->uu_workq_pri.qos_bucket;
3547 	if (qos == THREAD_QOS_UNSPECIFIED) {
3548 		panic("Thread should not have workq bucket of QoS UN");
3549 	}
3550 
3551 	assert(thread_has_expired_workqueue_quantum(thread, false));
3552 
3553 	struct workqueue *wq = proc_get_wqptr(proc);
3554 	assert(wq != NULL);
3555 
3556 	/*
3557 	 * For starters, we're just going to evaluate and see if we need to narrow
3558 	 * the pool and tell this thread to park if needed. In the future, we'll
3559 	 * evaluate and convey other workqueue state information like needing to
3560 	 * pump kevents, etc.
3561 	 */
3562 	uint64_t flags = 0;
3563 
3564 	workq_lock_spin(wq);
3565 
3566 	if (workq_thread_is_cooperative(uth)) {
3567 		if (!workq_cooperative_allowance(wq, qos, uth, false)) {
3568 			flags |= PTHREAD_WQ_QUANTUM_EXPIRY_NARROW;
3569 		} else {
3570 			/* In the future, when we have kevent hookups for the cooperative
3571 			 * pool, we need fancier logic for what userspace should do. But
3572 			 * right now, only userspace thread requests exist - so we'll just
3573 			 * tell userspace to shuffle work items */
3574 			flags |= PTHREAD_WQ_QUANTUM_EXPIRY_SHUFFLE;
3575 		}
3576 	} else if (workq_thread_is_nonovercommit(uth)) {
3577 		if (!workq_constrained_allowance(wq, qos, uth, false, false)) {
3578 			flags |= PTHREAD_WQ_QUANTUM_EXPIRY_NARROW;
3579 		}
3580 	}
3581 	workq_unlock(wq);
3582 
3583 	WQ_TRACE(TRACE_wq_quantum_expiry_reevaluate, flags, 0, 0, 0);
3584 
3585 	kevent_set_workq_quantum_expiry_user_tsd(proc, thread, flags);
3586 
3587 	/* We have conveyed to userspace about what it needs to do upon quantum
3588 	 * expiry, now rearm the workqueue quantum again */
3589 	thread_arm_workqueue_quantum(get_machthread(uth));
3590 }
3591 
3592 void
workq_schedule_creator_turnstile_redrive(struct workqueue * wq,bool locked)3593 workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked)
3594 {
3595 	if (locked) {
3596 		workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_NONE);
3597 	} else {
3598 		workq_schedule_immediate_thread_creation(wq);
3599 	}
3600 }
3601 
3602 static int
workq_thread_return(struct proc * p,struct workq_kernreturn_args * uap,struct workqueue * wq)3603 workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
3604     struct workqueue *wq)
3605 {
3606 	thread_t th = current_thread();
3607 	struct uthread *uth = get_bsdthread_info(th);
3608 	workq_threadreq_t kqr = uth->uu_kqr_bound;
3609 	workq_threadreq_param_t trp = { };
3610 	int nevents = uap->affinity, error;
3611 	user_addr_t eventlist = uap->item;
3612 
3613 	if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
3614 	    (uth->uu_workq_flags & UT_WORKQ_DYING)) {
3615 		return EINVAL;
3616 	}
3617 
3618 	if (eventlist && nevents && kqr == NULL) {
3619 		return EINVAL;
3620 	}
3621 
3622 	/*
3623 	 * Reset signal mask on the workqueue thread to default state,
3624 	 * but do not touch any signals that are marked for preservation.
3625 	 */
3626 	sigset_t resettable = uth->uu_sigmask & ~p->p_workq_allow_sigmask;
3627 	if (resettable != (sigset_t)~workq_threadmask) {
3628 		proc_lock(p);
3629 		uth->uu_sigmask |= ~workq_threadmask & ~p->p_workq_allow_sigmask;
3630 		proc_unlock(p);
3631 	}
3632 
3633 	if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_WL_PARAMS) {
3634 		/*
3635 		 * Ensure we store the threadreq param before unbinding
3636 		 * the kqr from this thread.
3637 		 */
3638 		trp = kqueue_threadreq_workloop_param(kqr);
3639 	}
3640 
3641 	if (kqr && kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) {
3642 		goto handle_stack_events;
3643 	}
3644 
3645 	/*
3646 	 * Freeze the base pri while we decide the fate of this thread.
3647 	 *
3648 	 * Either:
3649 	 * - we return to user and kevent_cleanup will have unfrozen the base pri,
3650 	 * - or we proceed to workq_select_threadreq_or_park_and_unlock() who will.
3651 	 */
3652 	thread_freeze_base_pri(th);
3653 
3654 handle_stack_events:
3655 
3656 	if (kqr) {
3657 		uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE;
3658 		if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
3659 			upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
3660 		} else {
3661 			upcall_flags |= WQ_FLAG_THREAD_KEVENT;
3662 		}
3663 		if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
3664 			upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
3665 		} else {
3666 			if (workq_thread_is_overcommit(uth)) {
3667 				upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
3668 			}
3669 			if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
3670 				upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
3671 			} else {
3672 				upcall_flags |= uth->uu_workq_pri.qos_req |
3673 				    WQ_FLAG_THREAD_PRIO_QOS;
3674 			}
3675 		}
3676 		error = pthread_functions->workq_handle_stack_events(p, th,
3677 		    get_task_map(proc_task(p)), uth->uu_workq_stackaddr,
3678 		    uth->uu_workq_thport, eventlist, nevents, upcall_flags);
3679 		if (error) {
3680 			assert(uth->uu_kqr_bound == kqr);
3681 			return error;
3682 		}
3683 
3684 		// pthread is supposed to pass KEVENT_FLAG_PARKING here
3685 		// which should cause the above call to either:
3686 		// - not return
3687 		// - return an error
3688 		// - return 0 and have unbound properly
3689 		assert(uth->uu_kqr_bound == NULL);
3690 	}
3691 
3692 	WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, uap->options, 0, 0);
3693 
3694 	thread_sched_call(th, NULL);
3695 	thread_will_park_or_terminate(th);
3696 #if CONFIG_WORKLOOP_DEBUG
3697 	UU_KEVENT_HISTORY_WRITE_ENTRY(uth, { .uu_error = -1, });
3698 #endif
3699 
3700 	workq_lock_spin(wq);
3701 	WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0);
3702 	uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
3703 	workq_select_threadreq_or_park_and_unlock(p, wq, uth,
3704 	    WQ_SETUP_CLEAR_VOUCHER);
3705 	__builtin_unreachable();
3706 }
3707 
3708 /**
3709  * Multiplexed call to interact with the workqueue mechanism
3710  */
3711 int
workq_kernreturn(struct proc * p,struct workq_kernreturn_args * uap,int32_t * retval)3712 workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval)
3713 {
3714 	int options = uap->options;
3715 	int arg2 = uap->affinity;
3716 	int arg3 = uap->prio;
3717 	struct workqueue *wq = proc_get_wqptr(p);
3718 	int error = 0;
3719 
3720 	if ((p->p_lflag & P_LREGISTER) == 0) {
3721 		return EINVAL;
3722 	}
3723 
3724 	switch (options) {
3725 	case WQOPS_QUEUE_NEWSPISUPP: {
3726 		/*
3727 		 * arg2 = offset of serialno into dispatch queue
3728 		 * arg3 = kevent support
3729 		 */
3730 		int offset = arg2;
3731 		if (arg3 & 0x01) {
3732 			// If we get here, then userspace has indicated support for kevent delivery.
3733 		}
3734 
3735 		p->p_dispatchqueue_serialno_offset = (uint64_t)offset;
3736 		break;
3737 	}
3738 	case WQOPS_QUEUE_REQTHREADS: {
3739 		/*
3740 		 * arg2 = number of threads to start
3741 		 * arg3 = priority
3742 		 */
3743 		error = workq_reqthreads(p, arg2, arg3, false);
3744 		break;
3745 	}
3746 	/* For requesting threads for the cooperative pool */
3747 	case WQOPS_QUEUE_REQTHREADS2: {
3748 		/*
3749 		 * arg2 = number of threads to start
3750 		 * arg3 = priority
3751 		 */
3752 		error = workq_reqthreads(p, arg2, arg3, true);
3753 		break;
3754 	}
3755 	case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
3756 		/*
3757 		 * arg2 = priority for the manager thread
3758 		 *
3759 		 * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set,
3760 		 * the low bits of the value contains a scheduling priority
3761 		 * instead of a QOS value
3762 		 */
3763 		pthread_priority_t pri = arg2;
3764 
3765 		if (wq == NULL) {
3766 			error = EINVAL;
3767 			break;
3768 		}
3769 
3770 		/*
3771 		 * Normalize the incoming priority so that it is ordered numerically.
3772 		 */
3773 		if (_pthread_priority_has_sched_pri(pri)) {
3774 			pri &= (_PTHREAD_PRIORITY_SCHED_PRI_MASK |
3775 			    _PTHREAD_PRIORITY_SCHED_PRI_FLAG);
3776 		} else {
3777 			thread_qos_t qos = _pthread_priority_thread_qos(pri);
3778 			int relpri = _pthread_priority_relpri(pri);
3779 			if (relpri > 0 || relpri < THREAD_QOS_MIN_TIER_IMPORTANCE ||
3780 			    qos == THREAD_QOS_UNSPECIFIED) {
3781 				error = EINVAL;
3782 				break;
3783 			}
3784 			pri &= ~_PTHREAD_PRIORITY_FLAGS_MASK;
3785 		}
3786 
3787 		/*
3788 		 * If userspace passes a scheduling priority, that wins over any QoS.
3789 		 * Userspace should takes care not to lower the priority this way.
3790 		 */
3791 		workq_lock_spin(wq);
3792 		if (wq->wq_event_manager_priority < (uint32_t)pri) {
3793 			wq->wq_event_manager_priority = (uint32_t)pri;
3794 		}
3795 		workq_unlock(wq);
3796 		break;
3797 	}
3798 	case WQOPS_THREAD_KEVENT_RETURN:
3799 	case WQOPS_THREAD_WORKLOOP_RETURN:
3800 	case WQOPS_THREAD_RETURN: {
3801 		error = workq_thread_return(p, uap, wq);
3802 		break;
3803 	}
3804 
3805 	case WQOPS_SHOULD_NARROW: {
3806 		/*
3807 		 * arg2 = priority to test
3808 		 * arg3 = unused
3809 		 */
3810 		thread_t th = current_thread();
3811 		struct uthread *uth = get_bsdthread_info(th);
3812 		if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
3813 		    (uth->uu_workq_flags & (UT_WORKQ_DYING | UT_WORKQ_OVERCOMMIT))) {
3814 			error = EINVAL;
3815 			break;
3816 		}
3817 
3818 		thread_qos_t qos = _pthread_priority_thread_qos(arg2);
3819 		if (qos == THREAD_QOS_UNSPECIFIED) {
3820 			error = EINVAL;
3821 			break;
3822 		}
3823 		workq_lock_spin(wq);
3824 		bool should_narrow = !workq_constrained_allowance(wq, qos, uth, false, false);
3825 		workq_unlock(wq);
3826 
3827 		*retval = should_narrow;
3828 		break;
3829 	}
3830 	case WQOPS_SETUP_DISPATCH: {
3831 		/*
3832 		 * item = pointer to workq_dispatch_config structure
3833 		 * arg2 = sizeof(item)
3834 		 */
3835 		struct workq_dispatch_config cfg;
3836 		bzero(&cfg, sizeof(cfg));
3837 
3838 		error = copyin(uap->item, &cfg, MIN(sizeof(cfg), (unsigned long) arg2));
3839 		if (error) {
3840 			break;
3841 		}
3842 
3843 		if (cfg.wdc_flags & ~WORKQ_DISPATCH_SUPPORTED_FLAGS ||
3844 		    cfg.wdc_version < WORKQ_DISPATCH_MIN_SUPPORTED_VERSION) {
3845 			error = ENOTSUP;
3846 			break;
3847 		}
3848 
3849 		/* Load fields from version 1 */
3850 		p->p_dispatchqueue_serialno_offset = cfg.wdc_queue_serialno_offs;
3851 
3852 		/* Load fields from version 2 */
3853 		if (cfg.wdc_version >= 2) {
3854 			p->p_dispatchqueue_label_offset = cfg.wdc_queue_label_offs;
3855 		}
3856 
3857 		break;
3858 	}
3859 	default:
3860 		error = EINVAL;
3861 		break;
3862 	}
3863 
3864 	return error;
3865 }
3866 
3867 /*
3868  * We have no work to do, park ourselves on the idle list.
3869  *
3870  * Consumes the workqueue lock and does not return.
3871  */
3872 __attribute__((noreturn, noinline))
3873 static void
workq_park_and_unlock(proc_t p,struct workqueue * wq,struct uthread * uth,uint32_t setup_flags)3874 workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth,
3875     uint32_t setup_flags)
3876 {
3877 	assert(uth == current_uthread());
3878 	assert(uth->uu_kqr_bound == NULL);
3879 	workq_push_idle_thread(p, wq, uth, setup_flags); // may not return
3880 
3881 	workq_thread_reset_cpupercent(NULL, uth);
3882 
3883 #if CONFIG_PREADOPT_TG
3884 	/* Clear the preadoption thread group on the thread.
3885 	 *
3886 	 * Case 1:
3887 	 *		Creator thread which never picked up a thread request. We set a
3888 	 *		preadoption thread group on creator threads but if it never picked
3889 	 *		up a thread request and didn't go to userspace, then the thread will
3890 	 *		park with a preadoption thread group but no explicitly adopted
3891 	 *		voucher or work interval.
3892 	 *
3893 	 *		We drop the preadoption thread group here before proceeding to park.
3894 	 *		Note - we may get preempted when we drop the workq lock below.
3895 	 *
3896 	 * Case 2:
3897 	 *		Thread picked up a thread request and bound to it and returned back
3898 	 *		from userspace and is parking. At this point, preadoption thread
3899 	 *		group should be NULL since the thread has unbound from the thread
3900 	 *		request. So this operation should be a no-op.
3901 	 */
3902 	thread_set_preadopt_thread_group(get_machthread(uth), NULL);
3903 #endif
3904 
3905 	if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) &&
3906 	    !(uth->uu_workq_flags & UT_WORKQ_DYING)) {
3907 		workq_unlock(wq);
3908 
3909 		/*
3910 		 * workq_push_idle_thread() will unset `has_stack`
3911 		 * if it wants us to free the stack before parking.
3912 		 */
3913 		if (!uth->uu_save.uus_workq_park_data.has_stack) {
3914 			pthread_functions->workq_markfree_threadstack(p,
3915 			    get_machthread(uth), get_task_map(proc_task(p)),
3916 			    uth->uu_workq_stackaddr);
3917 		}
3918 
3919 		/*
3920 		 * When we remove the voucher from the thread, we may lose our importance
3921 		 * causing us to get preempted, so we do this after putting the thread on
3922 		 * the idle list.  Then, when we get our importance back we'll be able to
3923 		 * use this thread from e.g. the kevent call out to deliver a boosting
3924 		 * message.
3925 		 *
3926 		 * Note that setting the voucher to NULL will not clear the preadoption
3927 		 * thread since this thread could have become the creator again and
3928 		 * perhaps acquired a preadoption thread group.
3929 		 */
3930 		__assert_only kern_return_t kr;
3931 		kr = thread_set_voucher_name(MACH_PORT_NULL);
3932 		assert(kr == KERN_SUCCESS);
3933 
3934 		workq_lock_spin(wq);
3935 		uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
3936 		setup_flags &= ~WQ_SETUP_CLEAR_VOUCHER;
3937 	}
3938 
3939 	WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0);
3940 
3941 	if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
3942 		/*
3943 		 * While we'd dropped the lock to unset our voucher, someone came
3944 		 * around and made us runnable.  But because we weren't waiting on the
3945 		 * event their thread_wakeup() was ineffectual.  To correct for that,
3946 		 * we just run the continuation ourselves.
3947 		 */
3948 		workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
3949 		__builtin_unreachable();
3950 	}
3951 
3952 	if (uth->uu_workq_flags & UT_WORKQ_DYING) {
3953 		workq_unpark_for_death_and_unlock(p, wq, uth,
3954 		    WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags);
3955 		__builtin_unreachable();
3956 	}
3957 
3958 	/* Disarm the workqueue quantum since the thread is now idle */
3959 	thread_disarm_workqueue_quantum(get_machthread(uth));
3960 
3961 	thread_set_pending_block_hint(get_machthread(uth), kThreadWaitParkedWorkQueue);
3962 	assert_wait(workq_parked_wait_event(uth), THREAD_INTERRUPTIBLE);
3963 	workq_unlock(wq);
3964 	thread_block(workq_unpark_continue);
3965 	__builtin_unreachable();
3966 }
3967 
3968 static inline bool
workq_may_start_event_mgr_thread(struct workqueue * wq,struct uthread * uth)3969 workq_may_start_event_mgr_thread(struct workqueue *wq, struct uthread *uth)
3970 {
3971 	/*
3972 	 * There's an event manager request and either:
3973 	 * - no event manager currently running
3974 	 * - we are re-using the event manager
3975 	 */
3976 	return wq->wq_thscheduled_count[_wq_bucket(WORKQ_THREAD_QOS_MANAGER)] == 0 ||
3977 	       (uth && uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER);
3978 }
3979 
3980 /* Called with workq lock held. */
3981 static uint32_t
workq_constrained_allowance(struct workqueue * wq,thread_qos_t at_qos,struct uthread * uth,bool may_start_timer,bool record_failed_allowance)3982 workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
3983     struct uthread *uth, bool may_start_timer, bool record_failed_allowance)
3984 {
3985 	assert(at_qos != WORKQ_THREAD_QOS_MANAGER);
3986 	uint32_t allowance_passed = 0;
3987 	uint32_t count = 0;
3988 
3989 	uint32_t max_count = wq->wq_constrained_threads_scheduled;
3990 	if (uth && workq_thread_is_nonovercommit(uth)) {
3991 		/*
3992 		 * don't count the current thread as scheduled
3993 		 */
3994 		assert(max_count > 0);
3995 		max_count--;
3996 	}
3997 	if (max_count >= wq_max_constrained_threads) {
3998 		WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
3999 		    wq->wq_constrained_threads_scheduled,
4000 		    wq_max_constrained_threads);
4001 		/*
4002 		 * we need 1 or more constrained threads to return to the kernel before
4003 		 * we can dispatch additional work
4004 		 */
4005 		allowance_passed = 0;
4006 		goto out;
4007 	}
4008 	max_count -= wq_max_constrained_threads;
4009 
4010 	/*
4011 	 * Compute a metric for many how many threads are active.  We find the
4012 	 * highest priority request outstanding and then add up the number of active
4013 	 * threads in that and all higher-priority buckets.  We'll also add any
4014 	 * "busy" threads which are not currently active but blocked recently enough
4015 	 * that we can't be sure that they won't be unblocked soon and start
4016 	 * being active again.
4017 	 *
4018 	 * We'll then compare this metric to our max concurrency to decide whether
4019 	 * to add a new thread.
4020 	 */
4021 
4022 	uint32_t busycount, thactive_count;
4023 
4024 	thactive_count = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
4025 	    at_qos, &busycount, NULL);
4026 
4027 	if (uth && uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER &&
4028 	    at_qos <= uth->uu_workq_pri.qos_bucket) {
4029 		/*
4030 		 * Don't count this thread as currently active, but only if it's not
4031 		 * a manager thread, as _wq_thactive_aggregate_downto_qos ignores active
4032 		 * managers.
4033 		 */
4034 		assert(thactive_count > 0);
4035 		thactive_count--;
4036 	}
4037 
4038 	count = wq_max_parallelism[_wq_bucket(at_qos)];
4039 	if (count > thactive_count + busycount) {
4040 		count -= thactive_count + busycount;
4041 		WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
4042 		    thactive_count, busycount);
4043 		allowance_passed = MIN(count, max_count);
4044 		goto out;
4045 	} else {
4046 		WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
4047 		    thactive_count, busycount);
4048 		allowance_passed = 0;
4049 	}
4050 
4051 	if (may_start_timer) {
4052 		/*
4053 		 * If this is called from the add timer, we won't have another timer
4054 		 * fire when the thread exits the "busy" state, so rearm the timer.
4055 		 */
4056 		workq_schedule_delayed_thread_creation(wq, 0);
4057 	}
4058 
4059 out:
4060 	if (record_failed_allowance) {
4061 		wq->wq_exceeded_active_constrained_thread_limit = !allowance_passed;
4062 	}
4063 	return allowance_passed;
4064 }
4065 
4066 static bool
workq_threadreq_admissible(struct workqueue * wq,struct uthread * uth,workq_threadreq_t req)4067 workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
4068     workq_threadreq_t req)
4069 {
4070 	if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
4071 		return workq_may_start_event_mgr_thread(wq, uth);
4072 	}
4073 	if (workq_threadreq_is_cooperative(req)) {
4074 		return workq_cooperative_allowance(wq, req->tr_qos, uth, true);
4075 	}
4076 	if (workq_threadreq_is_nonovercommit(req)) {
4077 		return workq_constrained_allowance(wq, req->tr_qos, uth, true, true);
4078 	}
4079 
4080 	return true;
4081 }
4082 
4083 /*
4084  * Called from the context of selecting thread requests for threads returning
4085  * from userspace or creator thread
4086  */
4087 static workq_threadreq_t
workq_cooperative_queue_best_req(struct workqueue * wq,struct uthread * uth)4088 workq_cooperative_queue_best_req(struct workqueue *wq, struct uthread *uth)
4089 {
4090 	workq_lock_held(wq);
4091 
4092 	/*
4093 	 * If the current thread is cooperative, we need to exclude it as part of
4094 	 * cooperative schedule count since this thread is looking for a new
4095 	 * request. Change in the schedule count for cooperative pool therefore
4096 	 * requires us to reeevaluate the next best request for it.
4097 	 */
4098 	if (uth && workq_thread_is_cooperative(uth)) {
4099 		_wq_cooperative_queue_scheduled_count_dec(wq, uth->uu_workq_pri.qos_req);
4100 
4101 		(void) _wq_cooperative_queue_refresh_best_req_qos(wq);
4102 
4103 		_wq_cooperative_queue_scheduled_count_inc(wq, uth->uu_workq_pri.qos_req);
4104 	} else {
4105 		/*
4106 		 * The old value that was already precomputed should be safe to use -
4107 		 * add an assert that asserts that the best req QoS doesn't change in
4108 		 * this case
4109 		 */
4110 		assert(_wq_cooperative_queue_refresh_best_req_qos(wq) == false);
4111 	}
4112 
4113 	thread_qos_t qos = wq->wq_cooperative_queue_best_req_qos;
4114 
4115 	/* There are no eligible requests in the cooperative pool */
4116 	if (qos == THREAD_QOS_UNSPECIFIED) {
4117 		return NULL;
4118 	}
4119 	assert(qos != WORKQ_THREAD_QOS_ABOVEUI);
4120 	assert(qos != WORKQ_THREAD_QOS_MANAGER);
4121 
4122 	uint8_t bucket = _wq_bucket(qos);
4123 	assert(!STAILQ_EMPTY(&wq->wq_cooperative_queue[bucket]));
4124 
4125 	return STAILQ_FIRST(&wq->wq_cooperative_queue[bucket]);
4126 }
4127 
4128 static workq_threadreq_t
workq_threadreq_select_for_creator(struct workqueue * wq)4129 workq_threadreq_select_for_creator(struct workqueue *wq)
4130 {
4131 	workq_threadreq_t req_qos, req_pri, req_tmp, req_mgr;
4132 	thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
4133 	uint8_t pri = 0;
4134 
4135 	/*
4136 	 * Compute the best priority request, and ignore the turnstile for now
4137 	 */
4138 
4139 	req_pri = priority_queue_max(&wq->wq_special_queue,
4140 	    struct workq_threadreq_s, tr_entry);
4141 	if (req_pri) {
4142 		pri = (uint8_t)priority_queue_entry_sched_pri(&wq->wq_special_queue,
4143 		    &req_pri->tr_entry);
4144 	}
4145 
4146 	/*
4147 	 * Handle the manager thread request. The special queue might yield
4148 	 * a higher priority, but the manager always beats the QoS world.
4149 	 */
4150 
4151 	req_mgr = wq->wq_event_manager_threadreq;
4152 	if (req_mgr && workq_may_start_event_mgr_thread(wq, NULL)) {
4153 		uint32_t mgr_pri = wq->wq_event_manager_priority;
4154 
4155 		if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
4156 			mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
4157 		} else {
4158 			mgr_pri = thread_workq_pri_for_qos(
4159 				_pthread_priority_thread_qos(mgr_pri));
4160 		}
4161 
4162 		return mgr_pri >= pri ? req_mgr : req_pri;
4163 	}
4164 
4165 	/*
4166 	 * Compute the best QoS Request, and check whether it beats the "pri" one
4167 	 *
4168 	 * Start by comparing the overcommit and the cooperative pool
4169 	 */
4170 	req_qos = priority_queue_max(&wq->wq_overcommit_queue,
4171 	    struct workq_threadreq_s, tr_entry);
4172 	if (req_qos) {
4173 		qos = req_qos->tr_qos;
4174 	}
4175 
4176 	req_tmp = workq_cooperative_queue_best_req(wq, NULL);
4177 	if (req_tmp && qos <= req_tmp->tr_qos) {
4178 		/*
4179 		 * Cooperative TR is better between overcommit and cooperative.  Note
4180 		 * that if qos is same between overcommit and cooperative, we choose
4181 		 * cooperative.
4182 		 *
4183 		 * Pick cooperative pool if it passes the admissions check
4184 		 */
4185 		if (workq_cooperative_allowance(wq, req_tmp->tr_qos, NULL, true)) {
4186 			req_qos = req_tmp;
4187 			qos = req_qos->tr_qos;
4188 		}
4189 	}
4190 
4191 	/*
4192 	 * Compare the best QoS so far - either from overcommit or from cooperative
4193 	 * pool - and compare it with the constrained pool
4194 	 */
4195 	req_tmp = priority_queue_max(&wq->wq_constrained_queue,
4196 	    struct workq_threadreq_s, tr_entry);
4197 
4198 	if (req_tmp && qos < req_tmp->tr_qos) {
4199 		/*
4200 		 * Constrained pool is best in QoS between overcommit, cooperative
4201 		 * and constrained. Now check how it fairs against the priority case
4202 		 */
4203 		if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
4204 			return req_pri;
4205 		}
4206 
4207 		if (workq_constrained_allowance(wq, req_tmp->tr_qos, NULL, true, true)) {
4208 			/*
4209 			 * If the constrained thread request is the best one and passes
4210 			 * the admission check, pick it.
4211 			 */
4212 			return req_tmp;
4213 		}
4214 	}
4215 
4216 	/*
4217 	 * Compare the best of the QoS world with the priority
4218 	 */
4219 	if (pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
4220 		return req_pri;
4221 	}
4222 
4223 	if (req_qos) {
4224 		return req_qos;
4225 	}
4226 
4227 	/*
4228 	 * If we had no eligible request but we have a turnstile push,
4229 	 * it must be a non overcommit thread request that failed
4230 	 * the admission check.
4231 	 *
4232 	 * Just fake a BG thread request so that if the push stops the creator
4233 	 * priority just drops to 4.
4234 	 */
4235 	if (turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, NULL)) {
4236 		static struct workq_threadreq_s workq_sync_push_fake_req = {
4237 			.tr_qos = THREAD_QOS_BACKGROUND,
4238 		};
4239 
4240 		return &workq_sync_push_fake_req;
4241 	}
4242 
4243 	return NULL;
4244 }
4245 
4246 /*
4247  * Returns true if this caused a change in the schedule counts of the
4248  * cooperative pool
4249  */
4250 static bool
workq_adjust_cooperative_constrained_schedule_counts(struct workqueue * wq,struct uthread * uth,thread_qos_t old_thread_qos,workq_tr_flags_t tr_flags)4251 workq_adjust_cooperative_constrained_schedule_counts(struct workqueue *wq,
4252     struct uthread *uth, thread_qos_t old_thread_qos, workq_tr_flags_t tr_flags)
4253 {
4254 	workq_lock_held(wq);
4255 
4256 	/*
4257 	 * Row: thread type
4258 	 * Column: Request type
4259 	 *
4260 	 *					overcommit		non-overcommit		cooperative
4261 	 * overcommit			X				case 1				case 2
4262 	 * cooperative		case 3				case 4				case 5
4263 	 * non-overcommit	case 6					X				case 7
4264 	 *
4265 	 * Move the thread to the right bucket depending on what state it currently
4266 	 * has and what state the thread req it picks, is going to have.
4267 	 *
4268 	 * Note that the creator thread is an overcommit thread.
4269 	 */
4270 	thread_qos_t new_thread_qos = uth->uu_workq_pri.qos_req;
4271 
4272 	/*
4273 	 * Anytime a cooperative bucket's schedule count changes, we need to
4274 	 * potentially refresh the next best QoS for that pool when we determine
4275 	 * the next request for the creator
4276 	 */
4277 	bool cooperative_pool_sched_count_changed = false;
4278 
4279 	if (workq_thread_is_overcommit(uth)) {
4280 		if (workq_tr_is_nonovercommit(tr_flags)) {
4281 			// Case 1: thread is overcommit, req is non-overcommit
4282 			wq->wq_constrained_threads_scheduled++;
4283 		} else if (workq_tr_is_cooperative(tr_flags)) {
4284 			// Case 2: thread is overcommit, req is cooperative
4285 			_wq_cooperative_queue_scheduled_count_inc(wq, new_thread_qos);
4286 			cooperative_pool_sched_count_changed = true;
4287 		}
4288 	} else if (workq_thread_is_cooperative(uth)) {
4289 		if (workq_tr_is_overcommit(tr_flags)) {
4290 			// Case 3: thread is cooperative, req is overcommit
4291 			_wq_cooperative_queue_scheduled_count_dec(wq, old_thread_qos);
4292 		} else if (workq_tr_is_nonovercommit(tr_flags)) {
4293 			// Case 4: thread is cooperative, req is non-overcommit
4294 			_wq_cooperative_queue_scheduled_count_dec(wq, old_thread_qos);
4295 			wq->wq_constrained_threads_scheduled++;
4296 		} else {
4297 			// Case 5: thread is cooperative, req is also cooperative
4298 			assert(workq_tr_is_cooperative(tr_flags));
4299 			_wq_cooperative_queue_scheduled_count_dec(wq, old_thread_qos);
4300 			_wq_cooperative_queue_scheduled_count_inc(wq, new_thread_qos);
4301 		}
4302 		cooperative_pool_sched_count_changed = true;
4303 	} else {
4304 		if (workq_tr_is_overcommit(tr_flags)) {
4305 			// Case 6: Thread is non-overcommit, req is overcommit
4306 			wq->wq_constrained_threads_scheduled--;
4307 		} else if (workq_tr_is_cooperative(tr_flags)) {
4308 			// Case 7: Thread is non-overcommit, req is cooperative
4309 			wq->wq_constrained_threads_scheduled--;
4310 			_wq_cooperative_queue_scheduled_count_inc(wq, new_thread_qos);
4311 			cooperative_pool_sched_count_changed = true;
4312 		}
4313 	}
4314 
4315 	return cooperative_pool_sched_count_changed;
4316 }
4317 
4318 static workq_threadreq_t
workq_threadreq_select(struct workqueue * wq,struct uthread * uth)4319 workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
4320 {
4321 	workq_threadreq_t req_qos, req_pri, req_tmp, req_mgr;
4322 	uintptr_t proprietor;
4323 	thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
4324 	uint8_t pri = 0;
4325 
4326 	if (uth == wq->wq_creator) {
4327 		uth = NULL;
4328 	}
4329 
4330 	/*
4331 	 * Compute the best priority request (special or turnstile)
4332 	 */
4333 
4334 	pri = (uint8_t)turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile,
4335 	    &proprietor);
4336 	if (pri) {
4337 		struct kqworkloop *kqwl = (struct kqworkloop *)proprietor;
4338 		req_pri = &kqwl->kqwl_request;
4339 		if (req_pri->tr_state != WORKQ_TR_STATE_QUEUED) {
4340 			panic("Invalid thread request (%p) state %d",
4341 			    req_pri, req_pri->tr_state);
4342 		}
4343 	} else {
4344 		req_pri = NULL;
4345 	}
4346 
4347 	req_tmp = priority_queue_max(&wq->wq_special_queue,
4348 	    struct workq_threadreq_s, tr_entry);
4349 	if (req_tmp && pri < priority_queue_entry_sched_pri(&wq->wq_special_queue,
4350 	    &req_tmp->tr_entry)) {
4351 		req_pri = req_tmp;
4352 		pri = (uint8_t)priority_queue_entry_sched_pri(&wq->wq_special_queue,
4353 		    &req_tmp->tr_entry);
4354 	}
4355 
4356 	/*
4357 	 * Handle the manager thread request. The special queue might yield
4358 	 * a higher priority, but the manager always beats the QoS world.
4359 	 */
4360 
4361 	req_mgr = wq->wq_event_manager_threadreq;
4362 	if (req_mgr && workq_may_start_event_mgr_thread(wq, uth)) {
4363 		uint32_t mgr_pri = wq->wq_event_manager_priority;
4364 
4365 		if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
4366 			mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
4367 		} else {
4368 			mgr_pri = thread_workq_pri_for_qos(
4369 				_pthread_priority_thread_qos(mgr_pri));
4370 		}
4371 
4372 		return mgr_pri >= pri ? req_mgr : req_pri;
4373 	}
4374 
4375 	/*
4376 	 * Compute the best QoS Request, and check whether it beats the "pri" one
4377 	 */
4378 
4379 	req_qos = priority_queue_max(&wq->wq_overcommit_queue,
4380 	    struct workq_threadreq_s, tr_entry);
4381 	if (req_qos) {
4382 		qos = req_qos->tr_qos;
4383 	}
4384 
4385 	req_tmp = workq_cooperative_queue_best_req(wq, uth);
4386 	if (req_tmp && qos <= req_tmp->tr_qos) {
4387 		/*
4388 		 * Cooperative TR is better between overcommit and cooperative.  Note
4389 		 * that if qos is same between overcommit and cooperative, we choose
4390 		 * cooperative.
4391 		 *
4392 		 * Pick cooperative pool if it passes the admissions check
4393 		 */
4394 		if (workq_cooperative_allowance(wq, req_tmp->tr_qos, uth, true)) {
4395 			req_qos = req_tmp;
4396 			qos = req_qos->tr_qos;
4397 		}
4398 	}
4399 
4400 	/*
4401 	 * Compare the best QoS so far - either from overcommit or from cooperative
4402 	 * pool - and compare it with the constrained pool
4403 	 */
4404 	req_tmp = priority_queue_max(&wq->wq_constrained_queue,
4405 	    struct workq_threadreq_s, tr_entry);
4406 
4407 	if (req_tmp && qos < req_tmp->tr_qos) {
4408 		/*
4409 		 * Constrained pool is best in QoS between overcommit, cooperative
4410 		 * and constrained. Now check how it fairs against the priority case
4411 		 */
4412 		if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
4413 			return req_pri;
4414 		}
4415 
4416 		if (workq_constrained_allowance(wq, req_tmp->tr_qos, uth, true, true)) {
4417 			/*
4418 			 * If the constrained thread request is the best one and passes
4419 			 * the admission check, pick it.
4420 			 */
4421 			return req_tmp;
4422 		}
4423 	}
4424 
4425 	if (req_pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
4426 		return req_pri;
4427 	}
4428 
4429 	return req_qos;
4430 }
4431 
4432 /*
4433  * The creator is an anonymous thread that is counted as scheduled,
4434  * but otherwise without its scheduler callback set or tracked as active
4435  * that is used to make other threads.
4436  *
4437  * When more requests are added or an existing one is hurried along,
4438  * a creator is elected and setup, or the existing one overridden accordingly.
4439  *
4440  * While this creator is in flight, because no request has been dequeued,
4441  * already running threads have a chance at stealing thread requests avoiding
4442  * useless context switches, and the creator once scheduled may not find any
4443  * work to do and will then just park again.
4444  *
4445  * The creator serves the dual purpose of informing the scheduler of work that
4446  * hasn't be materialized as threads yet, and also as a natural pacing mechanism
4447  * for thread creation.
4448  *
4449  * By being anonymous (and not bound to anything) it means that thread requests
4450  * can be stolen from this creator by threads already on core yielding more
4451  * efficient scheduling and reduced context switches.
4452  */
4453 static void
workq_schedule_creator(proc_t p,struct workqueue * wq,workq_kern_threadreq_flags_t flags)4454 workq_schedule_creator(proc_t p, struct workqueue *wq,
4455     workq_kern_threadreq_flags_t flags)
4456 {
4457 	workq_threadreq_t req;
4458 	struct uthread *uth;
4459 	bool needs_wakeup;
4460 
4461 	workq_lock_held(wq);
4462 	assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0);
4463 
4464 again:
4465 	uth = wq->wq_creator;
4466 
4467 	if (!wq->wq_reqcount) {
4468 		/*
4469 		 * There is no thread request left.
4470 		 *
4471 		 * If there is a creator, leave everything in place, so that it cleans
4472 		 * up itself in workq_push_idle_thread().
4473 		 *
4474 		 * Else, make sure the turnstile state is reset to no inheritor.
4475 		 */
4476 		if (uth == NULL) {
4477 			workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
4478 		}
4479 		return;
4480 	}
4481 
4482 	req = workq_threadreq_select_for_creator(wq);
4483 	if (req == NULL) {
4484 		/*
4485 		 * There isn't a thread request that passes the admission check.
4486 		 *
4487 		 * If there is a creator, do not touch anything, the creator will sort
4488 		 * it out when it runs.
4489 		 *
4490 		 * Else, set the inheritor to "WORKQ" so that the turnstile propagation
4491 		 * code calls us if anything changes.
4492 		 */
4493 		if (uth == NULL) {
4494 			workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
4495 		}
4496 		return;
4497 	}
4498 
4499 
4500 	if (uth) {
4501 		/*
4502 		 * We need to maybe override the creator we already have
4503 		 */
4504 		if (workq_thread_needs_priority_change(req, uth)) {
4505 			WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
4506 			    wq, 1, uthread_tid(uth), req->tr_qos);
4507 			workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
4508 		}
4509 		assert(wq->wq_inheritor == get_machthread(uth));
4510 	} else if (wq->wq_thidlecount) {
4511 		/*
4512 		 * We need to unpark a creator thread
4513 		 */
4514 		wq->wq_creator = uth = workq_pop_idle_thread(wq, UT_WORKQ_OVERCOMMIT,
4515 		    &needs_wakeup);
4516 		/* Always reset the priorities on the newly chosen creator */
4517 		workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
4518 		workq_turnstile_update_inheritor(wq, get_machthread(uth),
4519 		    TURNSTILE_INHERITOR_THREAD);
4520 		WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
4521 		    wq, 2, uthread_tid(uth), req->tr_qos);
4522 		uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
4523 		uth->uu_save.uus_workq_park_data.yields = 0;
4524 		if (needs_wakeup) {
4525 			workq_thread_wakeup(uth);
4526 		}
4527 	} else {
4528 		/*
4529 		 * We need to allocate a thread...
4530 		 */
4531 		if (__improbable(wq->wq_nthreads >= wq_max_threads)) {
4532 			/* out of threads, just go away */
4533 			flags = WORKQ_THREADREQ_NONE;
4534 		} else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) {
4535 			act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
4536 		} else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) {
4537 			/* This can drop the workqueue lock, and take it again */
4538 			workq_schedule_immediate_thread_creation(wq);
4539 		} else if ((workq_add_new_idle_thread(p, wq,
4540 		    workq_unpark_continue, false, NULL) == KERN_SUCCESS)) {
4541 			goto again;
4542 		} else {
4543 			workq_schedule_delayed_thread_creation(wq, 0);
4544 		}
4545 
4546 		/*
4547 		 * If the current thread is the inheritor:
4548 		 *
4549 		 * If we set the AST, then the thread will stay the inheritor until
4550 		 * either the AST calls workq_kern_threadreq_redrive(), or it parks
4551 		 * and calls workq_push_idle_thread().
4552 		 *
4553 		 * Else, the responsibility of the thread creation is with a thread-call
4554 		 * and we need to clear the inheritor.
4555 		 */
4556 		if ((flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) == 0 &&
4557 		    wq->wq_inheritor == current_thread()) {
4558 			workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
4559 		}
4560 	}
4561 }
4562 
4563 /**
4564  * Same as workq_unpark_select_threadreq_or_park_and_unlock,
4565  * but do not allow early binds.
4566  *
4567  * Called with the base pri frozen, will unfreeze it.
4568  */
4569 __attribute__((noreturn, noinline))
4570 static void
workq_select_threadreq_or_park_and_unlock(proc_t p,struct workqueue * wq,struct uthread * uth,uint32_t setup_flags)4571 workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
4572     struct uthread *uth, uint32_t setup_flags)
4573 {
4574 	workq_threadreq_t req = NULL;
4575 	bool is_creator = (wq->wq_creator == uth);
4576 	bool schedule_creator = false;
4577 
4578 	if (__improbable(_wq_exiting(wq))) {
4579 		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0);
4580 		goto park;
4581 	}
4582 
4583 	if (wq->wq_reqcount == 0) {
4584 		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0);
4585 		goto park;
4586 	}
4587 
4588 	req = workq_threadreq_select(wq, uth);
4589 	if (__improbable(req == NULL)) {
4590 		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0);
4591 		goto park;
4592 	}
4593 
4594 	struct uu_workq_policy old_pri = uth->uu_workq_pri;
4595 	uint8_t tr_flags = req->tr_flags;
4596 	struct turnstile *req_ts = kqueue_threadreq_get_turnstile(req);
4597 
4598 	/*
4599 	 * Attempt to setup ourselves as the new thing to run, moving all priority
4600 	 * pushes to ourselves.
4601 	 *
4602 	 * If the current thread is the creator, then the fact that we are presently
4603 	 * running is proof that we'll do something useful, so keep going.
4604 	 *
4605 	 * For other cases, peek at the AST to know whether the scheduler wants
4606 	 * to preempt us, if yes, park instead, and move the thread request
4607 	 * turnstile back to the workqueue.
4608 	 */
4609 	if (req_ts) {
4610 		workq_perform_turnstile_operation_locked(wq, ^{
4611 			turnstile_update_inheritor(req_ts, get_machthread(uth),
4612 			TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
4613 			turnstile_update_inheritor_complete(req_ts,
4614 			TURNSTILE_INTERLOCK_HELD);
4615 		});
4616 	}
4617 
4618 	/* accounting changes of aggregate thscheduled_count and thactive which has
4619 	 * to be paired with the workq_thread_reset_pri below so that we have
4620 	 * uth->uu_workq_pri match with thactive.
4621 	 *
4622 	 * This is undone when the thread parks */
4623 	if (is_creator) {
4624 		WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
4625 		    uth->uu_save.uus_workq_park_data.yields);
4626 		wq->wq_creator = NULL;
4627 		_wq_thactive_inc(wq, req->tr_qos);
4628 		wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
4629 	} else if (old_pri.qos_bucket != req->tr_qos) {
4630 		_wq_thactive_move(wq, old_pri.qos_bucket, req->tr_qos);
4631 	}
4632 	workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
4633 
4634 	/*
4635 	 * Make relevant accounting changes for pool specific counts.
4636 	 *
4637 	 * The schedule counts changing can affect what the next best request
4638 	 * for cooperative thread pool is if this request is dequeued.
4639 	 */
4640 	bool cooperative_sched_count_changed =
4641 	    workq_adjust_cooperative_constrained_schedule_counts(wq, uth,
4642 	    old_pri.qos_req, tr_flags);
4643 
4644 	if (workq_tr_is_overcommit(tr_flags)) {
4645 		workq_thread_set_type(uth, UT_WORKQ_OVERCOMMIT);
4646 	} else if (workq_tr_is_cooperative(tr_flags)) {
4647 		workq_thread_set_type(uth, UT_WORKQ_COOPERATIVE);
4648 	} else {
4649 		workq_thread_set_type(uth, 0);
4650 	}
4651 
4652 	if (__improbable(thread_unfreeze_base_pri(get_machthread(uth)) && !is_creator)) {
4653 		if (req_ts) {
4654 			workq_perform_turnstile_operation_locked(wq, ^{
4655 				turnstile_update_inheritor(req_ts, wq->wq_turnstile,
4656 				TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
4657 				turnstile_update_inheritor_complete(req_ts,
4658 				TURNSTILE_INTERLOCK_HELD);
4659 			});
4660 		}
4661 		WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0);
4662 
4663 		/*
4664 		 * If a cooperative thread was the one which picked up the manager
4665 		 * thread request, we need to reevaluate the cooperative pool before
4666 		 * it goes and parks.
4667 		 *
4668 		 * For every other of thread request that it picks up, the logic in
4669 		 * workq_threadreq_select should have done this refresh.
4670 		 * See workq_push_idle_thread.
4671 		 */
4672 		if (cooperative_sched_count_changed) {
4673 			if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
4674 				_wq_cooperative_queue_refresh_best_req_qos(wq);
4675 			}
4676 		}
4677 		goto park_thawed;
4678 	}
4679 
4680 	/*
4681 	 * We passed all checks, dequeue the request, bind to it, and set it up
4682 	 * to return to user.
4683 	 */
4684 	WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
4685 	    workq_trace_req_id(req), tr_flags, 0);
4686 	wq->wq_fulfilled++;
4687 	schedule_creator = workq_threadreq_dequeue(wq, req,
4688 	    cooperative_sched_count_changed);
4689 
4690 	workq_thread_reset_cpupercent(req, uth);
4691 
4692 	if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
4693 		kqueue_threadreq_bind_prepost(p, req, uth);
4694 		req = NULL;
4695 	} else if (req->tr_count > 0) {
4696 		req = NULL;
4697 	}
4698 
4699 	if (uth->uu_workq_flags & UT_WORKQ_NEW) {
4700 		uth->uu_workq_flags ^= UT_WORKQ_NEW;
4701 		setup_flags |= WQ_SETUP_FIRST_USE;
4702 	}
4703 
4704 	/* If one of the following is true, call workq_schedule_creator (which also
4705 	 * adjusts priority of existing creator):
4706 	 *
4707 	 *	  - We are the creator currently so the wq may need a new creator
4708 	 *	  - The request we're binding to is the highest priority one, existing
4709 	 *	  creator's priority might need to be adjusted to reflect the next
4710 	 *	  highest TR
4711 	 */
4712 	if (is_creator || schedule_creator) {
4713 		/* This can drop the workqueue lock, and take it again */
4714 		workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
4715 	}
4716 
4717 	workq_unlock(wq);
4718 
4719 	if (req) {
4720 		zfree(workq_zone_threadreq, req);
4721 	}
4722 
4723 	/*
4724 	 * Run Thread, Run!
4725 	 */
4726 	uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
4727 	if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
4728 		upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
4729 	} else if (workq_tr_is_overcommit(tr_flags)) {
4730 		upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
4731 	} else if (workq_tr_is_cooperative(tr_flags)) {
4732 		upcall_flags |= WQ_FLAG_THREAD_COOPERATIVE;
4733 	}
4734 	if (tr_flags & WORKQ_TR_FLAG_KEVENT) {
4735 		upcall_flags |= WQ_FLAG_THREAD_KEVENT;
4736 		assert((upcall_flags & WQ_FLAG_THREAD_COOPERATIVE) == 0);
4737 	}
4738 
4739 	if (tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
4740 		upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
4741 	}
4742 	uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
4743 
4744 	if (tr_flags & (WORKQ_TR_FLAG_KEVENT | WORKQ_TR_FLAG_WORKLOOP)) {
4745 		kqueue_threadreq_bind_commit(p, get_machthread(uth));
4746 	} else {
4747 #if CONFIG_PREADOPT_TG
4748 		/*
4749 		 * The thread may have a preadopt thread group on it already because it
4750 		 * got tagged with it as a creator thread. So we need to make sure to
4751 		 * clear that since we don't have preadoption for anonymous thread
4752 		 * requests
4753 		 */
4754 		thread_set_preadopt_thread_group(get_machthread(uth), NULL);
4755 #endif
4756 	}
4757 
4758 	workq_setup_and_run(p, uth, setup_flags);
4759 	__builtin_unreachable();
4760 
4761 park:
4762 	thread_unfreeze_base_pri(get_machthread(uth));
4763 park_thawed:
4764 	workq_park_and_unlock(p, wq, uth, setup_flags);
4765 }
4766 
4767 /**
4768  * Runs a thread request on a thread
4769  *
4770  * - if thread is THREAD_NULL, will find a thread and run the request there.
4771  *   Otherwise, the thread must be the current thread.
4772  *
4773  * - if req is NULL, will find the highest priority request and run that.  If
4774  *   it is not NULL, it must be a threadreq object in state NEW.  If it can not
4775  *   be run immediately, it will be enqueued and moved to state QUEUED.
4776  *
4777  *   Either way, the thread request object serviced will be moved to state
4778  *   BINDING and attached to the uthread.
4779  *
4780  * Should be called with the workqueue lock held.  Will drop it.
4781  * Should be called with the base pri not frozen.
4782  */
4783 __attribute__((noreturn, noinline))
4784 static void
workq_unpark_select_threadreq_or_park_and_unlock(proc_t p,struct workqueue * wq,struct uthread * uth,uint32_t setup_flags)4785 workq_unpark_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
4786     struct uthread *uth, uint32_t setup_flags)
4787 {
4788 	if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) {
4789 		if (uth->uu_workq_flags & UT_WORKQ_NEW) {
4790 			setup_flags |= WQ_SETUP_FIRST_USE;
4791 		}
4792 		uth->uu_workq_flags &= ~(UT_WORKQ_NEW | UT_WORKQ_EARLY_BOUND);
4793 		/*
4794 		 * This pointer is possibly freed and only used for tracing purposes.
4795 		 */
4796 		workq_threadreq_t req = uth->uu_save.uus_workq_park_data.thread_request;
4797 		workq_unlock(wq);
4798 		WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
4799 		    VM_KERNEL_ADDRHIDE(req), 0, 0);
4800 		(void)req;
4801 
4802 		workq_setup_and_run(p, uth, setup_flags);
4803 		__builtin_unreachable();
4804 	}
4805 
4806 	thread_freeze_base_pri(get_machthread(uth));
4807 	workq_select_threadreq_or_park_and_unlock(p, wq, uth, setup_flags);
4808 }
4809 
4810 static bool
workq_creator_should_yield(struct workqueue * wq,struct uthread * uth)4811 workq_creator_should_yield(struct workqueue *wq, struct uthread *uth)
4812 {
4813 	thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
4814 
4815 	if (qos >= THREAD_QOS_USER_INTERACTIVE) {
4816 		return false;
4817 	}
4818 
4819 	uint32_t snapshot = uth->uu_save.uus_workq_park_data.fulfilled_snapshot;
4820 	if (wq->wq_fulfilled == snapshot) {
4821 		return false;
4822 	}
4823 
4824 	uint32_t cnt = 0, conc = wq_max_parallelism[_wq_bucket(qos)];
4825 	if (wq->wq_fulfilled - snapshot > conc) {
4826 		/* we fulfilled more than NCPU requests since being dispatched */
4827 		WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 1,
4828 		    wq->wq_fulfilled, snapshot);
4829 		return true;
4830 	}
4831 
4832 	for (uint8_t i = _wq_bucket(qos); i < WORKQ_NUM_QOS_BUCKETS; i++) {
4833 		cnt += wq->wq_thscheduled_count[i];
4834 	}
4835 	if (conc <= cnt) {
4836 		/* We fulfilled requests and have more than NCPU scheduled threads */
4837 		WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 2,
4838 		    wq->wq_fulfilled, snapshot);
4839 		return true;
4840 	}
4841 
4842 	return false;
4843 }
4844 
4845 /**
4846  * parked idle thread wakes up
4847  */
4848 __attribute__((noreturn, noinline))
4849 static void
workq_unpark_continue(void * parameter __unused,wait_result_t wr __unused)4850 workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
4851 {
4852 	thread_t th = current_thread();
4853 	struct uthread *uth = get_bsdthread_info(th);
4854 	proc_t p = current_proc();
4855 	struct workqueue *wq = proc_get_wqptr_fast(p);
4856 
4857 	workq_lock_spin(wq);
4858 
4859 	if (wq->wq_creator == uth && workq_creator_should_yield(wq, uth)) {
4860 		/*
4861 		 * If the number of threads we have out are able to keep up with the
4862 		 * demand, then we should avoid sending this creator thread to
4863 		 * userspace.
4864 		 */
4865 		uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
4866 		uth->uu_save.uus_workq_park_data.yields++;
4867 		workq_unlock(wq);
4868 		thread_yield_with_continuation(workq_unpark_continue, NULL);
4869 		__builtin_unreachable();
4870 	}
4871 
4872 	if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) {
4873 		workq_unpark_select_threadreq_or_park_and_unlock(p, wq, uth, WQ_SETUP_NONE);
4874 		__builtin_unreachable();
4875 	}
4876 
4877 	if (__probable(wr == THREAD_AWAKENED)) {
4878 		/*
4879 		 * We were set running, but for the purposes of dying.
4880 		 */
4881 		assert(uth->uu_workq_flags & UT_WORKQ_DYING);
4882 		assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0);
4883 	} else {
4884 		/*
4885 		 * workaround for <rdar://problem/38647347>,
4886 		 * in case we do hit userspace, make sure calling
4887 		 * workq_thread_terminate() does the right thing here,
4888 		 * and if we never call it, that workq_exit() will too because it sees
4889 		 * this thread on the runlist.
4890 		 */
4891 		assert(wr == THREAD_INTERRUPTED);
4892 		wq->wq_thdying_count++;
4893 		uth->uu_workq_flags |= UT_WORKQ_DYING;
4894 	}
4895 
4896 	workq_unpark_for_death_and_unlock(p, wq, uth,
4897 	    WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE);
4898 	__builtin_unreachable();
4899 }
4900 
4901 __attribute__((noreturn, noinline))
4902 static void
workq_setup_and_run(proc_t p,struct uthread * uth,int setup_flags)4903 workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags)
4904 {
4905 	thread_t th = get_machthread(uth);
4906 	vm_map_t vmap = get_task_map(proc_task(p));
4907 
4908 	if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
4909 		/*
4910 		 * For preemption reasons, we want to reset the voucher as late as
4911 		 * possible, so we do it in two places:
4912 		 *   - Just before parking (i.e. in workq_park_and_unlock())
4913 		 *   - Prior to doing the setup for the next workitem (i.e. here)
4914 		 *
4915 		 * Those two places are sufficient to ensure we always reset it before
4916 		 * it goes back out to user space, but be careful to not break that
4917 		 * guarantee.
4918 		 *
4919 		 * Note that setting the voucher to NULL will not clear the preadoption
4920 		 * thread group on this thread
4921 		 */
4922 		__assert_only kern_return_t kr;
4923 		kr = thread_set_voucher_name(MACH_PORT_NULL);
4924 		assert(kr == KERN_SUCCESS);
4925 	}
4926 
4927 	uint32_t upcall_flags = uth->uu_save.uus_workq_park_data.upcall_flags;
4928 	if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
4929 		upcall_flags |= WQ_FLAG_THREAD_REUSE;
4930 	}
4931 
4932 	if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
4933 		/*
4934 		 * For threads that have an outside-of-QoS thread priority, indicate
4935 		 * to userspace that setting QoS should only affect the TSD and not
4936 		 * change QOS in the kernel.
4937 		 */
4938 		upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
4939 	} else {
4940 		/*
4941 		 * Put the QoS class value into the lower bits of the reuse_thread
4942 		 * register, this is where the thread priority used to be stored
4943 		 * anyway.
4944 		 */
4945 		upcall_flags |= uth->uu_save.uus_workq_park_data.qos |
4946 		    WQ_FLAG_THREAD_PRIO_QOS;
4947 	}
4948 
4949 	if (uth->uu_workq_thport == MACH_PORT_NULL) {
4950 		/* convert_thread_to_port_pinned() consumes a reference */
4951 		thread_reference(th);
4952 		/* Convert to immovable/pinned thread port, but port is not pinned yet */
4953 		ipc_port_t port = convert_thread_to_port_pinned(th);
4954 		/* Atomically, pin and copy out the port */
4955 		uth->uu_workq_thport = ipc_port_copyout_send_pinned(port, get_task_ipcspace(proc_task(p)));
4956 	}
4957 
4958 	/* Thread has been set up to run, arm its next workqueue quantum or disarm
4959 	 * if it is no longer supporting that */
4960 	if (thread_supports_cooperative_workqueue(th)) {
4961 		thread_arm_workqueue_quantum(th);
4962 	} else {
4963 		thread_disarm_workqueue_quantum(th);
4964 	}
4965 
4966 	/*
4967 	 * Call out to pthread, this sets up the thread, pulls in kevent structs
4968 	 * onto the stack, sets up the thread state and then returns to userspace.
4969 	 */
4970 	WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START,
4971 	    proc_get_wqptr_fast(p), 0, 0, 0);
4972 
4973 	if (workq_thread_is_cooperative(uth) || workq_thread_is_permanently_bound(uth)) {
4974 		thread_sched_call(th, NULL);
4975 	} else {
4976 		thread_sched_call(th, workq_sched_callback);
4977 	}
4978 
4979 	pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
4980 	    uth->uu_workq_thport, 0, setup_flags, upcall_flags);
4981 
4982 	__builtin_unreachable();
4983 }
4984 
4985 /**
4986  * A wrapper around workq_setup_and_run for permanently bound thread.
4987  */
4988 __attribute__((noreturn, noinline))
4989 static void
workq_bound_thread_setup_and_run(struct uthread * uth,int setup_flags)4990 workq_bound_thread_setup_and_run(struct uthread *uth, int setup_flags)
4991 {
4992 	struct workq_threadreq_s * kqr = uth->uu_kqr_bound;
4993 
4994 	uint32_t upcall_flags = (WQ_FLAG_THREAD_NEWSPI |
4995 	    WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT);
4996 	if (workq_tr_is_overcommit(kqr->tr_flags)) {
4997 		workq_thread_set_type(uth, UT_WORKQ_OVERCOMMIT);
4998 		upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
4999 	}
5000 	uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
5001 	workq_setup_and_run(current_proc(), uth, setup_flags);
5002 	__builtin_unreachable();
5003 }
5004 
5005 /**
5006  * A parked bound thread wakes up for the first time.
5007  */
5008 __attribute__((noreturn, noinline))
5009 static void
workq_bound_thread_initialize_and_unpark_continue(void * parameter __unused,wait_result_t wr)5010 workq_bound_thread_initialize_and_unpark_continue(void *parameter __unused,
5011     wait_result_t wr)
5012 {
5013 	/*
5014 	 * Locking model for accessing uu_workq_flags :
5015 	 *
5016 	 * The concurrent access to uu_workq_flags is synchronized with workq lock
5017 	 * until a thread gets permanently bound to a kqwl. Post that, kqlock
5018 	 * is used for subsequent synchronizations. This gives us a significant
5019 	 * benefit by avoiding having to take a process wide workq lock on every
5020 	 * wakeup of the bound thread.
5021 	 * This flip in locking model is tracked with UT_WORKQ_PERMANENT_BIND flag.
5022 	 *
5023 	 * There is one more optimization we can perform for when the thread is
5024 	 * awakened for running (i.e THREAD_AWAKENED) until it parks.
5025 	 * During this window, we know KQ_SLEEP bit is reset so there should not
5026 	 * be any concurrent attempts to modify uu_workq_flags by
5027 	 * kqworkloop_bound_thread_wakeup because the thread is already "awake".
5028 	 * So we can safely access uu_workq_flags within this window without having
5029 	 * to take kqlock. This KQ_SLEEP is later set by the bound thread under
5030 	 * kqlock on its way to parking.
5031 	 */
5032 	struct uthread *uth = get_bsdthread_info(current_thread());
5033 
5034 	if (__probable(wr == THREAD_AWAKENED)) {
5035 		/* At most one flag. */
5036 		assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING))
5037 		    != (UT_WORKQ_RUNNING | UT_WORKQ_DYING));
5038 
5039 		assert(workq_thread_is_permanently_bound(uth));
5040 
5041 		if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
5042 			assert(uth->uu_workq_flags & UT_WORKQ_NEW);
5043 			uth->uu_workq_flags &= ~UT_WORKQ_NEW;
5044 
5045 			struct workq_threadreq_s * kqr = uth->uu_kqr_bound;
5046 			if (kqr->tr_work_interval) {
5047 				kern_return_t kr;
5048 				kr = kern_work_interval_explicit_join(get_machthread(uth),
5049 				    kqr->tr_work_interval);
5050 				/*
5051 				 * The work interval functions requires to be called on the
5052 				 * current thread. If we fail here, we record the fact and
5053 				 * continue.
5054 				 * In the future, we can preflight checking that this join will
5055 				 * always be successful when the paird kqwl is configured; but,
5056 				 * for now, this should be a rare case (e.g. if you have passed
5057 				 * invalid arguments to the join).
5058 				 */
5059 				if (kr == KERN_SUCCESS) {
5060 					uth->uu_workq_flags |= UT_WORKQ_WORK_INTERVAL_JOINED;
5061 					/* Thread and kqwl both have +1 ref on the work interval. */
5062 				} else {
5063 					uth->uu_workq_flags |= UT_WORKQ_WORK_INTERVAL_FAILED;
5064 				}
5065 			}
5066 			workq_thread_reset_cpupercent(kqr, uth);
5067 			workq_bound_thread_setup_and_run(uth, WQ_SETUP_FIRST_USE);
5068 			__builtin_unreachable();
5069 		} else {
5070 			/*
5071 			 * The permanently bound kqworkloop is getting destroyed so we
5072 			 * are woken up to cleanly unbind ourselves from it and terminate.
5073 			 * See KQ_WORKLOOP_DESTROY -> workq_kern_bound_thread_wakeup.
5074 			 *
5075 			 * The actual full unbind happens from
5076 			 * uthread_cleanup -> kqueue_threadreq_unbind.
5077 			 */
5078 			assert(uth->uu_workq_flags & UT_WORKQ_DYING);
5079 		}
5080 	} else {
5081 		/*
5082 		 * The process is getting terminated so we are woken up to die.
5083 		 * E.g. SIGKILL'd.
5084 		 */
5085 		assert(wr == THREAD_INTERRUPTED);
5086 		/*
5087 		 * It is possible we started running as the process is aborted
5088 		 * due to termination; but, workq_kern_threadreq_permanent_bind
5089 		 * has not had a chance to bind us to the kqwl yet.
5090 		 *
5091 		 * We synchronize with it using workq lock.
5092 		 */
5093 		proc_t p = current_proc();
5094 		struct workqueue *wq = proc_get_wqptr_fast(p);
5095 		workq_lock_spin(wq);
5096 		assert(workq_thread_is_permanently_bound(uth));
5097 		workq_unlock(wq);
5098 
5099 		/*
5100 		 * We do the bind commit ourselves if workq_kern_threadreq_permanent_bind
5101 		 * has not done it for us yet so our state is aligned with what the
5102 		 * termination path below expects.
5103 		 */
5104 		kqueue_threadreq_bind_commit(p, get_machthread(uth));
5105 	}
5106 	workq_kern_bound_thread_terminate(uth->uu_kqr_bound);
5107 	__builtin_unreachable();
5108 }
5109 
5110 /**
5111  * A parked bound thread wakes up. Not the first time.
5112  */
5113 __attribute__((noreturn, noinline))
5114 static void
workq_bound_thread_unpark_continue(void * parameter __unused,wait_result_t wr)5115 workq_bound_thread_unpark_continue(void *parameter __unused, wait_result_t wr)
5116 {
5117 	struct uthread *uth = get_bsdthread_info(current_thread());
5118 	assert(workq_thread_is_permanently_bound(uth));
5119 
5120 	if (__probable(wr == THREAD_AWAKENED)) {
5121 		/* At most one flag. */
5122 		assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING))
5123 		    != (UT_WORKQ_RUNNING | UT_WORKQ_DYING));
5124 		if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
5125 			workq_bound_thread_setup_and_run(uth, WQ_SETUP_NONE);
5126 		} else {
5127 			assert(uth->uu_workq_flags & UT_WORKQ_DYING);
5128 		}
5129 	} else {
5130 		assert(wr == THREAD_INTERRUPTED);
5131 	}
5132 	workq_kern_bound_thread_terminate(uth->uu_kqr_bound);
5133 	__builtin_unreachable();
5134 }
5135 
5136 #pragma mark misc
5137 
5138 int
fill_procworkqueue(proc_t p,struct proc_workqueueinfo * pwqinfo)5139 fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
5140 {
5141 	struct workqueue *wq = proc_get_wqptr(p);
5142 	int error = 0;
5143 	int     activecount;
5144 
5145 	if (wq == NULL) {
5146 		return EINVAL;
5147 	}
5148 
5149 	/*
5150 	 * This is sometimes called from interrupt context by the kperf sampler.
5151 	 * In that case, it's not safe to spin trying to take the lock since we
5152 	 * might already hold it.  So, we just try-lock it and error out if it's
5153 	 * already held.  Since this is just a debugging aid, and all our callers
5154 	 * are able to handle an error, that's fine.
5155 	 */
5156 	bool locked = workq_lock_try(wq);
5157 	if (!locked) {
5158 		return EBUSY;
5159 	}
5160 
5161 	wq_thactive_t act = _wq_thactive(wq);
5162 	activecount = _wq_thactive_aggregate_downto_qos(wq, act,
5163 	    WORKQ_THREAD_QOS_MIN, NULL, NULL);
5164 	if (act & _wq_thactive_offset_for_qos(WORKQ_THREAD_QOS_MANAGER)) {
5165 		activecount++;
5166 	}
5167 	pwqinfo->pwq_nthreads = wq->wq_nthreads;
5168 	pwqinfo->pwq_runthreads = activecount;
5169 	pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
5170 	pwqinfo->pwq_state = 0;
5171 
5172 	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
5173 		pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
5174 	}
5175 
5176 	if (wq->wq_nthreads >= wq_max_threads) {
5177 		pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
5178 	}
5179 
5180 	uint64_t total_cooperative_threads;
5181 	total_cooperative_threads = workq_num_cooperative_threads_scheduled_total(wq);
5182 	if ((total_cooperative_threads == wq_cooperative_queue_max_size(wq)) &&
5183 	    workq_has_cooperative_thread_requests(wq)) {
5184 		pwqinfo->pwq_state |= WQ_EXCEEDED_COOPERATIVE_THREAD_LIMIT;
5185 	}
5186 
5187 	if (wq->wq_exceeded_active_constrained_thread_limit) {
5188 		pwqinfo->pwq_state |= WQ_EXCEEDED_ACTIVE_CONSTRAINED_THREAD_LIMIT;
5189 	}
5190 
5191 	workq_unlock(wq);
5192 	return error;
5193 }
5194 
5195 boolean_t
workqueue_get_pwq_exceeded(void * v,boolean_t * exceeded_total,boolean_t * exceeded_constrained)5196 workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total,
5197     boolean_t *exceeded_constrained)
5198 {
5199 	proc_t p = v;
5200 	struct proc_workqueueinfo pwqinfo;
5201 	int err;
5202 
5203 	assert(p != NULL);
5204 	assert(exceeded_total != NULL);
5205 	assert(exceeded_constrained != NULL);
5206 
5207 	err = fill_procworkqueue(p, &pwqinfo);
5208 	if (err) {
5209 		return FALSE;
5210 	}
5211 	if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) {
5212 		return FALSE;
5213 	}
5214 
5215 	*exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT);
5216 	*exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT);
5217 
5218 	return TRUE;
5219 }
5220 
5221 uint64_t
workqueue_get_task_ss_flags_from_pwq_state_kdp(void * v)5222 workqueue_get_task_ss_flags_from_pwq_state_kdp(void * v)
5223 {
5224 	static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) ==
5225 	    kTaskWqExceededConstrainedThreadLimit);
5226 	static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) ==
5227 	    kTaskWqExceededTotalThreadLimit);
5228 	static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable);
5229 	static_assert(((uint64_t)WQ_EXCEEDED_COOPERATIVE_THREAD_LIMIT << 34) ==
5230 	    (uint64_t)kTaskWqExceededCooperativeThreadLimit);
5231 	static_assert(((uint64_t)WQ_EXCEEDED_ACTIVE_CONSTRAINED_THREAD_LIMIT << 34) ==
5232 	    (uint64_t)kTaskWqExceededActiveConstrainedThreadLimit);
5233 	static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT |
5234 	    WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT |
5235 	    WQ_EXCEEDED_COOPERATIVE_THREAD_LIMIT |
5236 	    WQ_EXCEEDED_ACTIVE_CONSTRAINED_THREAD_LIMIT) == 0x1F);
5237 
5238 	if (v == NULL) {
5239 		return 0;
5240 	}
5241 
5242 	proc_t p = v;
5243 	struct workqueue *wq = proc_get_wqptr(p);
5244 
5245 	if (wq == NULL || workq_lock_is_acquired_kdp(wq)) {
5246 		return 0;
5247 	}
5248 
5249 	uint64_t ss_flags = kTaskWqFlagsAvailable;
5250 
5251 	if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
5252 		ss_flags |= kTaskWqExceededConstrainedThreadLimit;
5253 	}
5254 
5255 	if (wq->wq_nthreads >= wq_max_threads) {
5256 		ss_flags |= kTaskWqExceededTotalThreadLimit;
5257 	}
5258 
5259 	uint64_t total_cooperative_threads;
5260 	total_cooperative_threads = workq_num_cooperative_threads_scheduled_to_qos_internal(wq,
5261 	    WORKQ_THREAD_QOS_MIN);
5262 	if ((total_cooperative_threads == wq_cooperative_queue_max_size(wq)) &&
5263 	    workq_has_cooperative_thread_requests(wq)) {
5264 		ss_flags |= kTaskWqExceededCooperativeThreadLimit;
5265 	}
5266 
5267 	if (wq->wq_exceeded_active_constrained_thread_limit) {
5268 		ss_flags |= kTaskWqExceededActiveConstrainedThreadLimit;
5269 	}
5270 
5271 	return ss_flags;
5272 }
5273 
5274 void
workq_init(void)5275 workq_init(void)
5276 {
5277 	clock_interval_to_absolutetime_interval(wq_stalled_window.usecs,
5278 	    NSEC_PER_USEC, &wq_stalled_window.abstime);
5279 	clock_interval_to_absolutetime_interval(wq_reduce_pool_window.usecs,
5280 	    NSEC_PER_USEC, &wq_reduce_pool_window.abstime);
5281 	clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs,
5282 	    NSEC_PER_USEC, &wq_max_timer_interval.abstime);
5283 
5284 	thread_deallocate_daemon_register_queue(&workq_deallocate_queue,
5285 	    workq_deallocate_queue_invoke);
5286 }
5287