xref: /xnu-11215/bsd/kern/kern_event.c (revision d4514f0b)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  *
28  */
29 /*-
30  * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 /*
55  *	@(#)kern_event.c       1.0 (3/31/2000)
56  */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59 
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91 
92 #include <kern/thread_group.h>
93 #include <kern/locks.h>
94 #include <kern/clock.h>
95 #include <kern/cpu_data.h>
96 #include <kern/policy_internal.h>
97 #include <kern/thread_call.h>
98 #include <kern/sched_prim.h>
99 #include <kern/waitq.h>
100 #include <kern/zalloc.h>
101 #include <kern/kalloc.h>
102 #include <kern/assert.h>
103 #include <kern/ast.h>
104 #include <kern/thread.h>
105 #include <kern/kcdata.h>
106 #include <kern/work_interval.h>
107 
108 #include <pthread/priority_private.h>
109 #include <pthread/workqueue_syscalls.h>
110 #include <pthread/workqueue_internal.h>
111 #include <libkern/libkern.h>
112 
113 #include <os/log.h>
114 
115 #include "mach/kern_return.h"
116 #include "net/net_str_id.h"
117 
118 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
119 #include <skywalk/lib/net_filter_event.h>
120 
121 extern bool net_check_compatible_alf(void);
122 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
123 
124 #include <mach/task.h>
125 #include <libkern/section_keywords.h>
126 
127 #if CONFIG_MEMORYSTATUS
128 #include <sys/kern_memorystatus.h>
129 #endif
130 
131 #if DEVELOPMENT || DEBUG
132 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
133 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
134 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
135 #endif
136 
137 /* Enable bound thread support for kqworkloop. */
138 static TUNABLE(int, bootarg_thread_bound_kqwl_support_enabled,
139     "enable_thread_bound_kqwl_support", 0);
140 SYSCTL_NODE(_kern, OID_AUTO, kern_event, CTLFLAG_RD | CTLFLAG_LOCKED, 0, NULL);
141 SYSCTL_INT(_kern_kern_event, OID_AUTO, thread_bound_kqwl_support_enabled,
142     CTLFLAG_RD | CTLFLAG_LOCKED,
143     &bootarg_thread_bound_kqwl_support_enabled, 0,
144     "Whether thread bound kqwl support is enabled");
145 
146 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
147 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
148     VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
149 
150 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
151 extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
152 
153 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
154 
155 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
156     vfs_context_t ctx);
157 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
158 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
159     struct kevent_qos_s *kev);
160 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
161 
162 static const struct fileops kqueueops = {
163 	.fo_type     = DTYPE_KQUEUE,
164 	.fo_read     = fo_no_read,
165 	.fo_write    = fo_no_write,
166 	.fo_ioctl    = fo_no_ioctl,
167 	.fo_select   = kqueue_select,
168 	.fo_close    = kqueue_close,
169 	.fo_drain    = kqueue_drain,
170 	.fo_kqfilter = kqueue_kqfilter,
171 };
172 
173 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
174 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
175 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
176     thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
177 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
178 static void kevent_register_wait_cleanup(struct knote *kn);
179 
180 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
181 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
182 
183 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
184 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
185 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
186 static void kqueue_update_iotier_override(kqueue_t kqu);
187 
188 static void kqworkloop_unbind(struct kqworkloop *kqwl);
189 
190 enum kqwl_unbind_locked_mode {
191 	KQWL_OVERRIDE_DROP_IMMEDIATELY,
192 	KQWL_OVERRIDE_DROP_DELAYED,
193 };
194 // The soft unbinding of kqworkloop only applies to kqwls configured
195 // with a permanently bound thread.
196 #define KQUEUE_THREADREQ_UNBIND_SOFT 0x1
197 static void kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
198     enum kqwl_unbind_locked_mode how, unsigned int flags);
199 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
200 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
201 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
202 static void kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread);
203 static void kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl);
204 
205 enum {
206 	KQWL_UTQ_NONE,
207 	/*
208 	 * The wakeup qos is the qos of QUEUED knotes.
209 	 *
210 	 * This QoS is accounted for with the events override in the
211 	 * kqr_override_index field. It is raised each time a new knote is queued at
212  * a given QoS. The kqwl_wakeup_qos field is a superset of the non-empty
213 	 * knote buckets and is recomputed after each event delivery.
214 	 */
215 	KQWL_UTQ_UPDATE_WAKEUP_QOS,
216 	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
217 	KQWL_UTQ_UNBINDING, /* attempt to rebind */
218 	KQWL_UTQ_PARKING,
219 	/*
220 	 * The wakeup override is for suppressed knotes that have fired again at
221 	 * a higher QoS than the one for which they are suppressed already.
222 	 * This override is cleared when the knote suppressed list becomes empty.
223 	 */
224 	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
225 	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
226 	/*
227 	 * The QoS is the maximum QoS of an event enqueued on this workloop in
228 	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
229 	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
230 	 * such knote, this QoS is 0.
231 	 */
232 	KQWL_UTQ_SET_QOS_INDEX,
233 	KQWL_UTQ_REDRIVE_EVENTS,
234 };
235 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
236 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
237 
238 static struct knote *knote_alloc(void);
239 static void knote_free(struct knote *kn);
240 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
241     struct knote_lock_ctx *knlc, struct proc *p);
242 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
243     struct kevent_qos_s *kev, bool is_fd, struct proc *p);
244 
245 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
246 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
247 
248 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
249     struct kevent_qos_s *kev, int result);
250 static void knote_suppress(kqueue_t kqu, struct knote *kn);
251 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
252 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
253 
254 // both these functions may dequeue the knote and it is up to the caller
255 // to enqueue the knote back
256 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
257 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
258 
259 static ZONE_DEFINE(knote_zone, "knote zone",
260     sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
261 static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
262     sizeof(struct kqfile), ZC_ZFREE_CLEARMEM | ZC_NO_TBI_TAG);
263 static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
264     sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM | ZC_NO_TBI_TAG);
265 static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
266     sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM | ZC_NO_TBI_TAG);
267 
268 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
269 
270 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
271 static void filt_no_detach(struct knote *kn);
272 static int filt_bad_event(struct knote *kn, long hint);
273 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
274 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
275 
276 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
277 	.f_attach  = filt_no_attach,
278 	.f_detach  = filt_no_detach,
279 	.f_event   = filt_bad_event,
280 	.f_touch   = filt_bad_touch,
281 	.f_process = filt_bad_process,
282 };
283 
284 #if CONFIG_MEMORYSTATUS
285 extern const struct filterops memorystatus_filtops;
286 #endif /* CONFIG_MEMORYSTATUS */
287 extern const struct filterops fs_filtops;
288 extern const struct filterops sig_filtops;
289 extern const struct filterops machport_attach_filtops;
290 extern const struct filterops mach_port_filtops;
291 extern const struct filterops mach_port_set_filtops;
292 extern const struct filterops pipe_nfiltops;
293 extern const struct filterops pipe_rfiltops;
294 extern const struct filterops pipe_wfiltops;
295 extern const struct filterops ptsd_kqops;
296 extern const struct filterops ptmx_kqops;
297 extern const struct filterops soread_filtops;
298 extern const struct filterops sowrite_filtops;
299 extern const struct filterops sock_filtops;
300 extern const struct filterops soexcept_filtops;
301 extern const struct filterops spec_filtops;
302 extern const struct filterops bpfread_filtops;
303 extern const struct filterops necp_fd_rfiltops;
304 #if SKYWALK
305 extern const struct filterops skywalk_channel_rfiltops;
306 extern const struct filterops skywalk_channel_wfiltops;
307 extern const struct filterops skywalk_channel_efiltops;
308 #endif /* SKYWALK */
309 extern const struct filterops fsevent_filtops;
310 extern const struct filterops vnode_filtops;
311 extern const struct filterops tty_filtops;
312 
313 const static struct filterops file_filtops;
314 const static struct filterops kqread_filtops;
315 const static struct filterops proc_filtops;
316 const static struct filterops timer_filtops;
317 const static struct filterops user_filtops;
318 const static struct filterops workloop_filtops;
319 #if CONFIG_EXCLAVES
320 extern const struct filterops exclaves_notification_filtops;
321 #endif /* CONFIG_EXCLAVES */
322 
323 /*
324  *
325  * Rules for adding new filters to the system:
326  * Public filters:
327  * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
328  *   in the exported section of the header
329  * - Update the EVFILT_SYSCOUNT value to reflect the new addition
330  * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
331  *   of the Public Filters section in the array.
332  * Private filters:
333  * - Add a new "EVFILT_" value to bsd/sys/event_private.h (typically a positive value)
334  * - Update the EVFILTID_MAX value to reflect the new addition
335  * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
336  *   the Private filters section of the array.
337  */
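/*
 * For example, wiring up a hypothetical private filter (all names below are
 * invented for illustration and are not part of this file) would roughly be:
 *
 *   // bsd/sys/event_private.h: claim the next free EVFILTID_* slot and
 *   // bump EVFILTID_MAX so the sysfilt_ops table below grows with it
 *   #define EVFILTID_EXAMPLE   <next free id>
 *
 *   // bsd/kern/kern_event.c: declare the ops and register them at the end
 *   // of the Private filters section (EVFILTID_DETACHED must stay last)
 *   extern const struct filterops example_filtops;
 *   ...
 *   [EVFILTID_EXAMPLE]              = &example_filtops,
 */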
338 static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
339 static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
340 	/* Public Filters */
341 	[~EVFILT_READ]                  = &file_filtops,
342 	[~EVFILT_WRITE]                 = &file_filtops,
343 	[~EVFILT_AIO]                   = &bad_filtops,
344 	[~EVFILT_VNODE]                 = &file_filtops,
345 	[~EVFILT_PROC]                  = &proc_filtops,
346 	[~EVFILT_SIGNAL]                = &sig_filtops,
347 	[~EVFILT_TIMER]                 = &timer_filtops,
348 	[~EVFILT_MACHPORT]              = &machport_attach_filtops,
349 	[~EVFILT_FS]                    = &fs_filtops,
350 	[~EVFILT_USER]                  = &user_filtops,
351 	[~EVFILT_UNUSED_11]             = &bad_filtops,
352 	[~EVFILT_VM]                    = &bad_filtops,
353 	[~EVFILT_SOCK]                  = &file_filtops,
354 #if CONFIG_MEMORYSTATUS
355 	[~EVFILT_MEMORYSTATUS]          = &memorystatus_filtops,
356 #else
357 	[~EVFILT_MEMORYSTATUS]          = &bad_filtops,
358 #endif
359 	[~EVFILT_EXCEPT]                = &file_filtops,
360 #if SKYWALK
361 	[~EVFILT_NW_CHANNEL]            = &file_filtops,
362 #else /* !SKYWALK */
363 	[~EVFILT_NW_CHANNEL]            = &bad_filtops,
364 #endif /* !SKYWALK */
365 	[~EVFILT_WORKLOOP]              = &workloop_filtops,
366 #if CONFIG_EXCLAVES
367 	[~EVFILT_EXCLAVES_NOTIFICATION] = &exclaves_notification_filtops,
368 #else /* !CONFIG_EXCLAVES */
369 	[~EVFILT_EXCLAVES_NOTIFICATION] = &bad_filtops,
370 #endif /* CONFIG_EXCLAVES */
371 
372 	/* Private filters */
373 	[EVFILTID_KQREAD]               = &kqread_filtops,
374 	[EVFILTID_PIPE_N]               = &pipe_nfiltops,
375 	[EVFILTID_PIPE_R]               = &pipe_rfiltops,
376 	[EVFILTID_PIPE_W]               = &pipe_wfiltops,
377 	[EVFILTID_PTSD]                 = &ptsd_kqops,
378 	[EVFILTID_SOREAD]               = &soread_filtops,
379 	[EVFILTID_SOWRITE]              = &sowrite_filtops,
380 	[EVFILTID_SCK]                  = &sock_filtops,
381 	[EVFILTID_SOEXCEPT]             = &soexcept_filtops,
382 	[EVFILTID_SPEC]                 = &spec_filtops,
383 	[EVFILTID_BPFREAD]              = &bpfread_filtops,
384 	[EVFILTID_NECP_FD]              = &necp_fd_rfiltops,
385 #if SKYWALK
386 	[EVFILTID_SKYWALK_CHANNEL_W]    = &skywalk_channel_wfiltops,
387 	[EVFILTID_SKYWALK_CHANNEL_R]    = &skywalk_channel_rfiltops,
388 	[EVFILTID_SKYWALK_CHANNEL_E]    = &skywalk_channel_efiltops,
389 #else /* !SKYWALK */
390 	[EVFILTID_SKYWALK_CHANNEL_W]    = &bad_filtops,
391 	[EVFILTID_SKYWALK_CHANNEL_R]    = &bad_filtops,
392 	[EVFILTID_SKYWALK_CHANNEL_E]    = &bad_filtops,
393 #endif /* !SKYWALK */
394 	[EVFILTID_FSEVENT]              = &fsevent_filtops,
395 	[EVFILTID_VN]                   = &vnode_filtops,
396 	[EVFILTID_TTY]                  = &tty_filtops,
397 	[EVFILTID_PTMX]                 = &ptmx_kqops,
398 	[EVFILTID_MACH_PORT]            = &mach_port_filtops,
399 	[EVFILTID_MACH_PORT_SET]        = &mach_port_set_filtops,
400 
401 	/* fake filter for detached knotes, keep last */
402 	[EVFILTID_DETACHED]             = &bad_filtops,
403 };
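/*
 * Index mapping example (illustrative): the public filter constants are small
 * negative values, so the bitwise NOT used in the designators above folds them
 * onto the front of the table:
 *
 *   EVFILT_READ  == -1   =>   ~EVFILT_READ  == 0
 *   EVFILT_WRITE == -2   =>   ~EVFILT_WRITE == 1
 *
 * Private filters index the table directly with their positive EVFILTID_*
 * value, which is why the static_assert above insists that EVFILTID_MAX fits
 * in the 8-bit kn_filtid field.
 */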
404 
405 static inline bool
406 kqr_thread_bound(workq_threadreq_t kqr)
407 {
408 	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
409 }
410 
411 static inline bool
412 kqr_thread_permanently_bound(workq_threadreq_t kqr)
413 {
414 	return kqr_thread_bound(kqr) && (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
415 }
416 
417 static inline bool
418 kqr_thread_requested_pending(workq_threadreq_t kqr)
419 {
420 	workq_tr_state_t tr_state = kqr->tr_state;
421 	return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
422 }
423 
424 static inline bool
425 kqr_thread_requested(workq_threadreq_t kqr)
426 {
427 	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
428 }
429 
430 static inline thread_t
431 kqr_thread_fast(workq_threadreq_t kqr)
432 {
433 	assert(kqr_thread_bound(kqr));
434 	return kqr->tr_thread;
435 }
436 
437 static inline thread_t
438 kqr_thread(workq_threadreq_t kqr)
439 {
440 	return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
441 }
442 
443 static inline struct kqworkloop *
444 kqr_kqworkloop(workq_threadreq_t kqr)
445 {
446 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
447 		return __container_of(kqr, struct kqworkloop, kqwl_request);
448 	}
449 	return NULL;
450 }
451 
452 static inline kqueue_t
453 kqr_kqueue(proc_t p, workq_threadreq_t kqr)
454 {
455 	kqueue_t kqu;
456 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
457 		kqu.kqwl = kqr_kqworkloop(kqr);
458 	} else {
459 		kqu.kqwq = p->p_fd.fd_wqkqueue;
460 		assert(kqr >= kqu.kqwq->kqwq_request &&
461 		    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
462 	}
463 	return kqu;
464 }
465 
466 #if CONFIG_PREADOPT_TG
467 /* There are no guarantees about which locks are held when this is called */
468 inline thread_group_qos_t
469 kqr_preadopt_thread_group(workq_threadreq_t req)
470 {
471 	struct kqworkloop *kqwl = kqr_kqworkloop(req);
472 	return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
473 }
474 
475 /* There are no guarantees about which locks are held when this is called */
476 inline _Atomic(thread_group_qos_t) *
477 kqr_preadopt_thread_group_addr(workq_threadreq_t req)
478 {
479 	struct kqworkloop *kqwl = kqr_kqworkloop(req);
480 	return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
481 }
482 #endif
483 
484 /*
485  * kqueue/note lock implementations
486  *
487  *	The kqueue lock guards the kq state, the state of its queues,
488  *	and the kqueue-aware status and locks of individual knotes.
489  *
490  *	The kqueue workq lock is used to protect state guarding the
491  *	interaction of the kqueue with the workq.  This state cannot
492  *	be guarded by the kq lock - as it needs to be taken when we
493  *	already have the waitq set lock held (during the waitq hook
494  *	callback).  It might be better to use the waitq lock itself
495  *	for this, but the IRQ requirements make that difficult).
496  *
497  *	Knote flags, filter flags, and associated data are protected
498  *	by the underlying object lock - and are only ever looked at
499  *	by calling the filter to get a [consistent] snapshot of that
500  *	data.
501  */
502 
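/*
 * Typical usage (illustrative sketch): code that needs a consistent view of
 * kq-protected state brackets the access with the spinlock helpers below,
 * e.g. the way filt_kqtouch() samples the event count:
 *
 *   kqlock(kq);
 *   int res = (kq->kq_count > 0);   // any state guarded by kq_lock
 *   kqunlock(kq);
 */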
503 static inline void
504 kqlock(kqueue_t kqu)
505 {
506 	lck_spin_lock(&kqu.kq->kq_lock);
507 }
508 
509 static inline void
510 kqlock_held(__assert_only kqueue_t kqu)
511 {
512 	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
513 }
514 
515 static inline void
516 kqunlock(kqueue_t kqu)
517 {
518 	lck_spin_unlock(&kqu.kq->kq_lock);
519 }
520 
521 static inline void
522 knhash_lock(struct filedesc *fdp)
523 {
524 	lck_mtx_lock(&fdp->fd_knhashlock);
525 }
526 
527 static inline void
528 knhash_unlock(struct filedesc *fdp)
529 {
530 	lck_mtx_unlock(&fdp->fd_knhashlock);
531 }
532 
533 /* wait event for knote locks */
534 static inline event_t
535 knote_lock_wev(struct knote *kn)
536 {
537 	return (event_t)(&kn->kn_hook);
538 }
539 
540 /* wait event for kevent_register_wait_* */
541 static inline event64_t
542 knote_filt_wev64(struct knote *kn)
543 {
544 	/* kdp_workloop_sync_wait_find_owner knows about this */
545 	return CAST_EVENT64_T(kn);
546 }
547 
548 /* wait event for knote_post/knote_drop */
549 static inline event_t
550 knote_post_wev(struct knote *kn)
551 {
552 	return &kn->kn_kevent;
553 }
554 
555 /*!
556  * @function knote_has_qos
557  *
558  * @brief
559  * Whether the knote has a regular QoS.
560  *
561  * @discussion
562  * kn_qos_override is:
563  * - 0 on kqfiles
564  * - THREAD_QOS_LAST for special buckets (manager)
565  *
566  * Other values mean the knote participates in QoS propagation.
567  */
568 static inline bool
569 knote_has_qos(struct knote *kn)
570 {
571 	return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
572 }
573 
574 #pragma mark knote locks
575 
576 /*
577  * Enum used by the knote_lock_* functions.
578  *
579  * KNOTE_KQ_LOCK_ALWAYS
580  *   The function will always return with the kq lock held.
581  *
582  * KNOTE_KQ_LOCK_ON_SUCCESS
583  *   The function will return with the kq lock held if it was successful
584  *   (knote_lock() is the only function that can fail).
585  *
586  * KNOTE_KQ_LOCK_ON_FAILURE
587  *   The function will return with the kq lock held if it was unsuccessful
588  *   (knote_lock() is the only function that can fail).
589  *
590  * KNOTE_KQ_UNLOCK:
591  *   The function returns with the kq unlocked.
592  */
593 enum kqlocking {
594 	KNOTE_KQ_LOCK_ALWAYS,
595 	KNOTE_KQ_LOCK_ON_SUCCESS,
596 	KNOTE_KQ_LOCK_ON_FAILURE,
597 	KNOTE_KQ_UNLOCK,
598 };
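/*
 * Usage sketch (illustrative): a caller that must end up with the kq lock
 * held whether or not it wins the knote lock would pass KNOTE_KQ_LOCK_ALWAYS;
 * `knlc` is a caller-provided struct knote_lock_ctx:
 *
 *   kqlock(kq);
 *   if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
 *           // the knote is being dropped; the kq lock is still held here
 *           kqunlock(kq);
 *           return;
 *   }
 *   // ... operate on the knote ...
 *   knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
 */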
599 
600 static struct knote_lock_ctx *
601 knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
602 {
603 	struct knote_lock_ctx *ctx;
604 	LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
605 		if (ctx->knlc_knote == kn) {
606 			return ctx;
607 		}
608 	}
609 	panic("knote lock context not found: %p", kn);
610 	__builtin_trap();
611 }
612 
613 /* slowpath of knote_lock() */
614 __attribute__((noinline))
615 static bool __result_use_check
616 knote_lock_slow(kqueue_t kqu, struct knote *kn,
617     struct knote_lock_ctx *knlc, int kqlocking)
618 {
619 	struct knote_lock_ctx *owner_lc;
620 	struct uthread *uth = current_uthread();
621 	wait_result_t wr;
622 
623 	kqlock_held(kqu);
624 
625 	owner_lc = knote_lock_ctx_find(kqu, kn);
626 #if MACH_ASSERT
627 	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
628 #endif
629 	owner_lc->knlc_waiters++;
630 
631 	/*
632 	 * Make our lock context visible to knote_unlock()
633 	 */
634 	uth->uu_knlock = knlc;
635 
636 	wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
637 	    knote_lock_wev(kn), owner_lc->knlc_thread,
638 	    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
639 
640 	if (wr == THREAD_RESTART) {
641 		/*
642 		 * We haven't been woken up by knote_unlock() but by knote_unlock_cancel().
643 		 * We need to clean up the state since no one else did.
644 		 */
645 		uth->uu_knlock = NULL;
646 #if MACH_ASSERT
647 		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
648 		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
649 #endif
650 
651 		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
652 		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
653 			kqlock(kqu);
654 		}
655 		return false;
656 	} else {
657 		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
658 		    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
659 			kqlock(kqu);
660 			/*
661 			 * This state is set under the lock so we can't
662 			 * really assert this unless we hold the lock.
663 			 */
664 			assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
665 		}
666 		return true;
667 	}
668 }
669 
670 /*
671  * Attempts to take the "knote" lock.
672  *
673  * Called with the kqueue lock held.
674  *
675  * Returns true if the knote lock is acquired, false if it has been dropped
676  */
677 static bool __result_use_check
678 knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
679     enum kqlocking kqlocking)
680 {
681 	kqlock_held(kqu);
682 
683 #if MACH_ASSERT
684 	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
685 #endif
686 	knlc->knlc_knote = kn;
687 	knlc->knlc_thread = current_thread();
688 	knlc->knlc_waiters = 0;
689 
690 	if (__improbable(kn->kn_status & KN_LOCKED)) {
691 		return knote_lock_slow(kqu, kn, knlc, kqlocking);
692 	}
693 
694 	/*
695 	 * When the knote will be dropped, the knote lock is taken before
696 	 * KN_DROPPING is set, and then the knote will be removed from any
697 	 * hash table that references it before the lock is canceled.
698 	 */
699 	assert((kn->kn_status & KN_DROPPING) == 0);
700 	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
701 	kn->kn_status |= KN_LOCKED;
702 #if MACH_ASSERT
703 	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
704 #endif
705 
706 	if (kqlocking == KNOTE_KQ_UNLOCK ||
707 	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
708 		kqunlock(kqu);
709 	}
710 	return true;
711 }
712 
713 /*
714  * Unlocks a knote successfully locked with knote_lock().
715  *
716  * Called with the kqueue lock held.
717  *
718  * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
719  */
720 static void
721 knote_unlock(kqueue_t kqu, struct knote *kn,
722     struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
723 {
724 	kqlock_held(kqu);
725 
726 	assert(knlc->knlc_knote == kn);
727 	assert(kn->kn_status & KN_LOCKED);
728 	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
729 
730 	LIST_REMOVE(knlc, knlc_link);
731 
732 	if (knlc->knlc_waiters) {
733 		thread_t thread = THREAD_NULL;
734 
735 		wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
736 		    LCK_WAKE_DEFAULT, &thread);
737 
738 		/*
739 		 * knote_lock_slow() publishes the lock context of waiters
740 		 * in uthread::uu_knlock.
741 		 *
742 		 * Reach out and make this context the new owner.
743 		 */
744 		struct uthread *ut = get_bsdthread_info(thread);
745 		struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;
746 
747 		assert(next_owner_lc->knlc_knote == kn);
748 		next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
749 		LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
750 #if MACH_ASSERT
751 		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
752 #endif
753 		ut->uu_knlock = NULL;
754 		thread_deallocate_safe(thread);
755 	} else {
756 		kn->kn_status &= ~KN_LOCKED;
757 	}
758 
759 	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
760 		/*
761 		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
762 		 *
763 		 * See knote_adjust_qos()
764 		 */
765 		kn->kn_status &= ~KN_MERGE_QOS;
766 	}
767 	if (kqlocking == KNOTE_KQ_UNLOCK) {
768 		kqunlock(kqu);
769 	}
770 #if MACH_ASSERT
771 	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
772 #endif
773 }
774 
775 /*
776  * Aborts all waiters for a knote lock, and unlocks the knote.
777  *
778  * Called with the kqueue lock held.
779  *
780  * Returns with the kqueue unlocked.
781  */
782 static void
783 knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
784     struct knote_lock_ctx *knlc)
785 {
786 	kqlock_held(kq);
787 
788 	assert(knlc->knlc_knote == kn);
789 	assert(kn->kn_status & KN_LOCKED);
790 	assert(kn->kn_status & KN_DROPPING);
791 
792 	LIST_REMOVE(knlc, knlc_link);
793 	kn->kn_status &= ~KN_LOCKED;
794 	kqunlock(kq);
795 
796 	if (knlc->knlc_waiters) {
797 		wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
798 	}
799 #if MACH_ASSERT
800 	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
801 #endif
802 }
803 
804 /*
805  * Call the f_event hook of a given filter.
806  *
807  * Takes a use count to protect against concurrent drops.
808  * Called with the object lock held.
809  */
810 static void
811 knote_post(struct knote *kn, long hint)
812 {
813 	struct kqueue *kq = knote_get_kq(kn);
814 	int dropping, result;
815 
816 	kqlock(kq);
817 
818 	if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
819 		return kqunlock(kq);
820 	}
821 
822 	if (__improbable(kn->kn_status & KN_POSTING)) {
823 		panic("KNOTE() called concurrently on knote %p", kn);
824 	}
825 
826 	kn->kn_status |= KN_POSTING;
827 
828 	kqunlock(kq);
829 	result = filter_call(knote_fops(kn), f_event(kn, hint));
830 	kqlock(kq);
831 
832 	/* Someone dropped the knote, or the monitored object vanished, while we
833 	 * were in f_event; swallow the side effects of the post.
834 	 */
835 	dropping = (kn->kn_status & (KN_DROPPING | KN_VANISHED));
836 
837 	if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
838 		kqueue_update_iotier_override(kq);
839 	}
840 
841 	if (!dropping && (result & FILTER_ACTIVE)) {
842 		knote_activate(kq, kn, result);
843 	}
844 
845 	if ((kn->kn_status & KN_LOCKED) == 0) {
846 		/*
847 		 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
848 		 *
849 		 * See knote_adjust_qos()
850 		 */
851 		kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
852 	} else {
853 		kn->kn_status &= ~KN_POSTING;
854 	}
855 
856 	if (__improbable(dropping)) {
857 		thread_wakeup(knote_post_wev(kn));
858 	}
859 
860 	kqunlock(kq);
861 }
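/*
 * Illustrative call path: event sources fire their klist through the KNOTE()
 * macro while holding their own object lock, and each attached knote lands in
 * knote_post() above with the same hint, e.g. (sketch of the proc case):
 *
 *   KNOTE(&p->p_klist, NOTE_EXIT);   // notify attached EVFILT_PROC knotes
 */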
862 
863 /*
864  * Called by knote_drop() and knote_fdclose() to wait for the last f_event()
865  * caller to be done.
866  *
867  *	- kq locked at entry
868  *	- kq unlocked at exit
869  */
870 static void
871 knote_wait_for_post(struct kqueue *kq, struct knote *kn)
872 {
873 	kqlock_held(kq);
874 
875 	assert(kn->kn_status & (KN_DROPPING | KN_VANISHED));
876 
877 	if (kn->kn_status & KN_POSTING) {
878 		lck_spin_sleep(&kq->kq_lock, LCK_SLEEP_UNLOCK, knote_post_wev(kn),
879 		    THREAD_UNINT | THREAD_WAIT_NOREPORT);
880 	} else {
881 		kqunlock(kq);
882 	}
883 }
884 
885 #pragma mark knote helpers for filters
886 
887 OS_ALWAYS_INLINE
888 void *
889 knote_kn_hook_get_raw(struct knote *kn)
890 {
891 	uintptr_t *addr = &kn->kn_hook;
892 
893 	void *hook = (void *) *addr;
894 #if __has_feature(ptrauth_calls)
895 	if (hook) {
896 		uint16_t blend = kn->kn_filter;
897 		blend |= (kn->kn_filtid << 8);
898 		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
899 
900 		hook = ptrauth_auth_data(hook, ptrauth_key_process_independent_data,
901 		    ptrauth_blend_discriminator(addr, blend));
902 	}
903 #endif
904 
905 	return hook;
906 }
907 
908 OS_ALWAYS_INLINE void
909 knote_kn_hook_set_raw(struct knote *kn, void *kn_hook)
910 {
911 	uintptr_t *addr = &kn->kn_hook;
912 #if __has_feature(ptrauth_calls)
913 	if (kn_hook) {
914 		uint16_t blend = kn->kn_filter;
915 		blend |= (kn->kn_filtid << 8);
916 		blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
917 
918 		kn_hook = ptrauth_sign_unauthenticated(kn_hook,
919 		    ptrauth_key_process_independent_data,
920 		    ptrauth_blend_discriminator(addr, blend));
921 	}
922 #endif
923 	*addr = (uintptr_t) kn_hook;
924 }
925 
926 OS_ALWAYS_INLINE
927 void
928 knote_set_error(struct knote *kn, int error)
929 {
930 	kn->kn_flags |= EV_ERROR;
931 	kn->kn_sdata = error;
932 }
933 
934 OS_ALWAYS_INLINE
935 int64_t
936 knote_low_watermark(const struct knote *kn)
937 {
938 	return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
939 }
940 
941 /*!
942  * @function knote_fill_kevent_with_sdata
943  *
944  * @brief
945  * Fills in a kevent from the current content of a knote.
946  *
947  * @discussion
948  * This is meant to be called from filter's f_process hooks.
949  * The kevent data is filled with kn->kn_sdata.
950  *
951  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
952  *
953  * Using knote_fill_kevent is typically preferred.
954  */
955 OS_ALWAYS_INLINE
956 void
957 knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
958 {
959 #define knote_assert_aliases(name1, offs1, name2) \
960 	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
961 	    offsetof(struct kevent_internal_s, name2), \
962 	        "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 " need to alias")
963 	/*
964 	 * All the code makes assumptions on these aliasing,
965 	 * so make sure we fail the build if we ever ever ever break them.
966 	 */
967 	knote_assert_aliases(ident, 0, kei_ident);
968 #ifdef __LITTLE_ENDIAN__
969 	knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
970 	knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
971 #else
972 	knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
973 	knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
974 #endif
975 	knote_assert_aliases(flags, 0, kei_flags);
976 	knote_assert_aliases(qos, 0, kei_qos);
977 	knote_assert_aliases(udata, 0, kei_udata);
978 	knote_assert_aliases(fflags, 0, kei_fflags);
979 	knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
980 	knote_assert_aliases(data, 0, kei_sdata);     // non trivial overlap
981 	knote_assert_aliases(ext, 0, kei_ext);
982 #undef knote_assert_aliases
983 
984 	/*
985 	 * Fix the differences between kevent_qos_s and kevent_internal_s:
986 	 * - xflags is where kn_sfflags lives, we need to zero it
987 	 * - fixup the high bits of `filter` where kn_filtid lives
988 	 */
989 	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;
990 	kev->xflags = 0;
991 	kev->filter |= 0xff00;
992 	if (kn->kn_flags & EV_CLEAR) {
993 		kn->kn_fflags = 0;
994 	}
995 }
996 
997 /*!
998  * @function knote_fill_kevent
999  *
1000  * @brief
1001  * Fills in a kevent from the current content of a knote.
1002  *
1003  * @discussion
1004  * This is meant to be called from filter's f_process hooks.
1005  * The kevent data is filled with the passed in data.
1006  *
1007  * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
1008  */
1009 OS_ALWAYS_INLINE
1010 void
1011 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
1012 {
1013 	knote_fill_kevent_with_sdata(kn, kev);
1014 	kev->filter = kn->kn_filter;
1015 	kev->data = data;
1016 }
1017 
1018 
1019 #pragma mark file_filtops
1020 
1021 static int
1022 filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
1023 {
1024 	return fo_kqfilter(kn->kn_fp, kn, kev);
1025 }
1026 
1027 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
1028 	.f_isfd = 1,
1029 	.f_attach = filt_fileattach,
1030 };
1031 
1032 #pragma mark kqread_filtops
1033 
1034 #define f_flag fp_glob->fg_flag
1035 #define f_ops fp_glob->fg_ops
1036 #define f_lflags fp_glob->fg_lflags
1037 
1038 static void
1039 filt_kqdetach(struct knote *kn)
1040 {
1041 	struct kqfile *kqf = (struct kqfile *)fp_get_data(kn->kn_fp);
1042 	struct kqueue *kq = &kqf->kqf_kqueue;
1043 
1044 	kqlock(kq);
1045 	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
1046 	kqunlock(kq);
1047 }
1048 
1049 static int
1050 filt_kqueue(struct knote *kn, __unused long hint)
1051 {
1052 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1053 
1054 	return kq->kq_count > 0;
1055 }
1056 
1057 static int
1058 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
1059 {
1060 #pragma unused(kev)
1061 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1062 	int res;
1063 
1064 	kqlock(kq);
1065 	res = (kq->kq_count > 0);
1066 	kqunlock(kq);
1067 
1068 	return res;
1069 }
1070 
1071 static int
1072 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
1073 {
1074 	struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1075 	int res = 0;
1076 
1077 	kqlock(kq);
1078 	if (kq->kq_count) {
1079 		knote_fill_kevent(kn, kev, kq->kq_count);
1080 		res = 1;
1081 	}
1082 	kqunlock(kq);
1083 
1084 	return res;
1085 }
1086 
1087 SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
1088 	.f_isfd = 1,
1089 	.f_detach = filt_kqdetach,
1090 	.f_event = filt_kqueue,
1091 	.f_touch = filt_kqtouch,
1092 	.f_process = filt_kqprocess,
1093 };
1094 
1095 #pragma mark proc_filtops
1096 
1097 static int
1098 filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1099 {
1100 	struct proc *p;
1101 
1102 	assert(PID_MAX < NOTE_PDATAMASK);
1103 
1104 	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
1105 		knote_set_error(kn, ENOTSUP);
1106 		return 0;
1107 	}
1108 
1109 	p = proc_find((int)kn->kn_id);
1110 	if (p == NULL) {
1111 		knote_set_error(kn, ESRCH);
1112 		return 0;
1113 	}
1114 
1115 	const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
1116 
1117 	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
1118 		do {
1119 			pid_t selfpid = proc_selfpid();
1120 
1121 			if (p->p_ppid == selfpid) {
1122 				break;  /* parent => ok */
1123 			}
1124 			if ((p->p_lflag & P_LTRACED) != 0 &&
1125 			    (p->p_oppid == selfpid)) {
1126 				break;  /* parent-in-waiting => ok */
1127 			}
1128 			if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
1129 				break; /* allowed to signal => ok */
1130 			}
1131 			proc_rele(p);
1132 			knote_set_error(kn, EACCES);
1133 			return 0;
1134 		} while (0);
1135 	}
1136 
1137 	kn->kn_proc = p;
1138 	kn->kn_flags |= EV_CLEAR;       /* automatically set */
1139 	kn->kn_sdata = 0;               /* incoming data is ignored */
1140 
1141 	proc_klist_lock();
1142 
1143 	KNOTE_ATTACH(&p->p_klist, kn);
1144 
1145 	proc_klist_unlock();
1146 
1147 	proc_rele(p);
1148 
1149 	/*
1150 	 * only captures edge-triggered events after this point
1151 	 * so it can't already be fired.
1152 	 */
1153 	return 0;
1154 }
1155 
1156 
1157 /*
1158  * The knote may be attached to a different process, which may exit,
1159  * leaving nothing for the knote to be attached to.  In that case,
1160  * the pointer to the process will have already been nulled out.
1161  */
1162 static void
1163 filt_procdetach(struct knote *kn)
1164 {
1165 	struct proc *p;
1166 
1167 	proc_klist_lock();
1168 
1169 	p = kn->kn_proc;
1170 	if (p != PROC_NULL) {
1171 		kn->kn_proc = PROC_NULL;
1172 		KNOTE_DETACH(&p->p_klist, kn);
1173 	}
1174 
1175 	proc_klist_unlock();
1176 }
1177 
1178 static int
1179 filt_procevent(struct knote *kn, long hint)
1180 {
1181 	u_int event;
1182 
1183 	/* ALWAYS CALLED WITH proc_klist_lock */
1184 
1185 	/*
1186 	 * Note: a lot of bits in hint may be obtained from the knote.
1187 	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
1188 	 * bits in hint for filt_procevent
1189 	 *
1190 	 * mask off extra data
1191 	 */
1192 	event = (u_int)hint & NOTE_PCTRLMASK;
1193 
1194 	/*
1195 	 * termination lifecycle events can happen while a debugger
1196 	 * has reparented a process, in which case notifications
1197 	 * should be quashed except to the tracing parent. When
1198 	 * the debugger reaps the child (either via wait4(2) or
1199 	 * process exit), the child will be reparented to the original
1200 	 * parent and these knotes re-fired.
1201 	 */
1202 	if (event & NOTE_EXIT) {
1203 		if ((kn->kn_proc->p_oppid != 0)
1204 		    && (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
1205 			/*
1206 			 * This knote is not for the current ptrace(2) parent, ignore.
1207 			 */
1208 			return 0;
1209 		}
1210 	}
1211 
1212 	/*
1213 	 * if the user is interested in this event, record it.
1214 	 */
1215 	if (kn->kn_sfflags & event) {
1216 		kn->kn_fflags |= event;
1217 	}
1218 
1219 #pragma clang diagnostic push
1220 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
1221 	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1222 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1223 	}
1224 #pragma clang diagnostic pop
1225 
1226 
1227 	/*
1228 	 * The kernel has a wrapper in place that returns the same data
1229 	 * as is collected here, in kn_hook32.  Any changes to how
1230 	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1231 	 * should also be reflected in the proc_pidnoteexit() wrapper.
1232 	 */
1233 	if (event == NOTE_EXIT) {
1234 		kn->kn_hook32 = 0;
1235 		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
1236 			kn->kn_fflags |= NOTE_EXITSTATUS;
1237 			kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
1238 		}
1239 		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
1240 			kn->kn_fflags |= NOTE_EXIT_DETAIL;
1241 			if ((kn->kn_proc->p_lflag &
1242 			    P_LTERM_DECRYPTFAIL) != 0) {
1243 				kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
1244 			}
1245 			if ((kn->kn_proc->p_lflag &
1246 			    P_LTERM_JETSAM) != 0) {
1247 				kn->kn_hook32 |= NOTE_EXIT_MEMORY;
1248 				switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
1249 				case P_JETSAM_VMPAGESHORTAGE:
1250 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1251 					break;
1252 				case P_JETSAM_VMTHRASHING:
1253 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
1254 					break;
1255 				case P_JETSAM_FCTHRASHING:
1256 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
1257 					break;
1258 				case P_JETSAM_VNODE:
1259 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
1260 					break;
1261 				case P_JETSAM_HIWAT:
1262 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
1263 					break;
1264 				case P_JETSAM_PID:
1265 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
1266 					break;
1267 				case P_JETSAM_IDLEEXIT:
1268 					kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
1269 					break;
1270 				}
1271 			}
1272 			if ((proc_getcsflags(kn->kn_proc) &
1273 			    CS_KILLED) != 0) {
1274 				kn->kn_hook32 |= NOTE_EXIT_CSERROR;
1275 			}
1276 		}
1277 	}
1278 
1279 	/* if we have any matching state, activate the knote */
1280 	return kn->kn_fflags != 0;
1281 }
1282 
1283 static int
1284 filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
1285 {
1286 	int res;
1287 
1288 	proc_klist_lock();
1289 
1290 	/* accept new filter flags and mask off output events no longer interesting */
1291 	kn->kn_sfflags = kev->fflags;
1292 
1293 	/* restrict the current results to the (smaller?) set of new interest */
1294 	/*
1295 	 * For compatibility with previous implementations, we leave kn_fflags
1296 	 * as they were before.
1297 	 */
1298 	//kn->kn_fflags &= kn->kn_sfflags;
1299 
1300 	res = (kn->kn_fflags != 0);
1301 
1302 	proc_klist_unlock();
1303 
1304 	return res;
1305 }
1306 
1307 static int
1308 filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1309 {
1310 	int res = 0;
1311 
1312 	proc_klist_lock();
1313 	if (kn->kn_fflags) {
1314 		knote_fill_kevent(kn, kev, kn->kn_hook32);
1315 		kn->kn_hook32 = 0;
1316 		res = 1;
1317 	}
1318 	proc_klist_unlock();
1319 	return res;
1320 }
1321 
1322 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
1323 	.f_attach  = filt_procattach,
1324 	.f_detach  = filt_procdetach,
1325 	.f_event   = filt_procevent,
1326 	.f_touch   = filt_proctouch,
1327 	.f_process = filt_procprocess,
1328 };
1329 
1330 #pragma mark timer_filtops
1331 
1332 struct filt_timer_params {
1333 	uint64_t deadline; /* deadline in abs/cont time
1334 	                    *                      (or 0 if NOTE_ABSOLUTE and deadline is in past) */
1335 	uint64_t leeway;   /* leeway in abstime, or 0 if none */
1336 	uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
1337 };
1338 
1339 /*
1340  * Values stored in the knote at rest (using Mach absolute time units)
1341  *
1342  * kn->kn_thcall        where the thread_call object is stored
1343  * kn->kn_ext[0]        next deadline or 0 if immediate expiration
1344  * kn->kn_ext[1]        leeway value
1345  * kn->kn_sdata         interval timer: the interval
1346  *                      absolute/deadline timer: 0
1347  * kn->kn_hook32        timer state (with gencount)
1348  *
1349  * TIMER_IDLE:
1350  *   The timer has either never been scheduled or been cancelled.
1351  *   It is safe to schedule a new one in this state.
1352  *
1353  * TIMER_ARMED:
1354  *   The timer has been scheduled
1355  *
1356  * TIMER_FIRED
1357  *   The timer has fired and an event needs to be delivered.
1358  *   When in this state, the callout may still be running.
1359  *
1360  * TIMER_IMMEDIATE
1361  *   The timer has fired at registration time, and the callout was never
1362  *   dispatched.
1363  */
1364 #define TIMER_IDLE       0x0
1365 #define TIMER_ARMED      0x1
1366 #define TIMER_FIRED      0x2
1367 #define TIMER_IMMEDIATE  0x3
1368 #define TIMER_STATE_MASK 0x3
1369 #define TIMER_GEN_INC    0x4
1370 
1371 static void
1372 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1373 {
1374 	kn->kn_ext[0] = params->deadline;
1375 	kn->kn_ext[1] = params->leeway;
1376 	kn->kn_sdata  = params->interval;
1377 }
1378 
1379 /*
1380  * filt_timervalidate - process data from user
1381  *
1382  * Sets up the deadline, interval, and leeway from the provided user data
1383  *
1384  * Input:
1385  *      kn_sdata        timer deadline or interval time
1386  *      kn_sfflags      style of timer, unit of measurement
1387  *
1388  * Output:
1389  *      struct filt_timer_params to apply to the filter with
1390  *      filt_timer_set_params when changes are ready to be committed.
1391  *
1392  * Returns:
1393  *      EINVAL          Invalid user data parameters
1394  *      ERANGE          Various overflows with the parameters
1395  *
1396  * Called with timer filter lock held.
1397  */
1398 static int
1399 filt_timervalidate(const struct kevent_qos_s *kev,
1400     struct filt_timer_params *params)
1401 {
1402 	/*
1403 	 * There are 5 knobs that need to be chosen for a timer registration:
1404 	 *
1405 	 * A) Units of time (what is the time duration of the specified number)
1406 	 *      Absolute and interval take:
1407 	 *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1408 	 *      Defaults to milliseconds if not specified
1409 	 *
1410 	 * B) Clock epoch (what is the zero point of the specified number)
1411 	 *      For interval, there is none
1412 	 *      For absolute, defaults to the gettimeofday/calendar epoch
1413 	 *      With NOTE_MACHTIME, uses mach_absolute_time()
1414 	 *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1415 	 *
1416 	 * C) The knote's behavior on delivery
1417 	 *      Interval timer causes the knote to arm for the next interval unless one-shot is set
1418 	 *      Absolute is a forced one-shot timer which deletes on delivery
1419 	 *      TODO: Add a way for absolute to be not forced one-shot
1420 	 *
1421 	 * D) Whether the time duration is relative to now or absolute
1422 	 *      Interval fires at now + duration when it is set up
1423 	 *      Absolute fires at now + difference between now walltime and passed in walltime
1424 	 *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1425 	 *
1426 	 * E) Whether the timer continues to tick across sleep
1427 	 *      By default all three do not.
1428 	 *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1429 	 *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
1430 	 *              expires when mach_continuous_time() is > the passed in value.
1431 	 */
1432 
1433 	uint64_t multiplier;
1434 
1435 	boolean_t use_abstime = FALSE;
1436 
1437 	switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
1438 	case NOTE_SECONDS:
1439 		multiplier = NSEC_PER_SEC;
1440 		break;
1441 	case NOTE_USECONDS:
1442 		multiplier = NSEC_PER_USEC;
1443 		break;
1444 	case NOTE_NSECONDS:
1445 		multiplier = 1;
1446 		break;
1447 	case NOTE_MACHTIME:
1448 		multiplier = 0;
1449 		use_abstime = TRUE;
1450 		break;
1451 	case 0: /* milliseconds (default) */
1452 		multiplier = NSEC_PER_SEC / 1000;
1453 		break;
1454 	default:
1455 		return EINVAL;
1456 	}
1457 
1458 	/* transform the leeway in kn_ext[1] to same time scale */
1459 	if (kev->fflags & NOTE_LEEWAY) {
1460 		uint64_t leeway_abs;
1461 
1462 		if (use_abstime) {
1463 			leeway_abs = (uint64_t)kev->ext[1];
1464 		} else {
1465 			uint64_t leeway_ns;
1466 			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
1467 				return ERANGE;
1468 			}
1469 
1470 			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1471 		}
1472 
1473 		params->leeway = leeway_abs;
1474 	} else {
1475 		params->leeway = 0;
1476 	}
1477 
1478 	if (kev->fflags & NOTE_ABSOLUTE) {
1479 		uint64_t deadline_abs;
1480 
1481 		if (use_abstime) {
1482 			deadline_abs = (uint64_t)kev->data;
1483 		} else {
1484 			uint64_t calendar_deadline_ns;
1485 
1486 			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
1487 				return ERANGE;
1488 			}
1489 
1490 			/* calendar_deadline_ns is in nanoseconds since the epoch */
1491 
1492 			clock_sec_t seconds;
1493 			clock_nsec_t nanoseconds;
1494 
1495 			/*
1496 			 * Note that the conversion through wall-time is only done once.
1497 			 *
1498 			 * If the relationship between MAT and gettimeofday changes,
1499 			 * the underlying timer does not update.
1500 			 *
1501 			 * TODO: build a wall-time denominated timer_call queue
1502 			 * and a flag to request DTRTing with wall-time timers
1503 			 */
1504 			clock_get_calendar_nanotime(&seconds, &nanoseconds);
1505 
1506 			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1507 
1508 			/* if deadline is in the future */
1509 			if (calendar_now_ns < calendar_deadline_ns) {
1510 				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1511 				uint64_t interval_abs;
1512 
1513 				nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1514 
1515 				/*
1516 				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1517 				 * causes the timer to keep ticking across sleep, but
1518 				 * it does not change the calendar timebase.
1519 				 */
1520 
1521 				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1522 					clock_continuoustime_interval_to_deadline(interval_abs,
1523 					    &deadline_abs);
1524 				} else {
1525 					clock_absolutetime_interval_to_deadline(interval_abs,
1526 					    &deadline_abs);
1527 				}
1528 			} else {
1529 				deadline_abs = 0; /* cause immediate expiration */
1530 			}
1531 		}
1532 
1533 		params->deadline = deadline_abs;
1534 		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
1535 	} else if (kev->data < 0) {
1536 		/*
1537 		 * Negative interval timers fire immediately, once.
1538 		 *
1539 		 * Ideally a negative interval would be an error, but certain clients
1540 		 * pass negative values by accident, and expect an event back.
1541 		 *
1542 		 * In the old implementation the timer would repeat with no delay
1543 		 * N times until mach_absolute_time() + (N * interval) underflowed,
1544 		 * then it would wait ~forever by accidentally arming a timer for the far future.
1545 		 *
1546 		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
1547 		 */
1548 
1549 		params->deadline = 0; /* expire immediately */
1550 		params->interval = 0; /* non-repeating */
1551 	} else {
1552 		uint64_t interval_abs = 0;
1553 
1554 		if (use_abstime) {
1555 			interval_abs = (uint64_t)kev->data;
1556 		} else {
1557 			uint64_t interval_ns;
1558 			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
1559 				return ERANGE;
1560 			}
1561 
1562 			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1563 		}
1564 
1565 		uint64_t deadline = 0;
1566 
1567 		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1568 			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1569 		} else {
1570 			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1571 		}
1572 
1573 		params->deadline = deadline;
1574 		params->interval = interval_abs;
1575 	}
1576 
1577 	return 0;
1578 }
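/*
 * Userspace example (illustrative sketch, not part of this file): the knobs
 * described above correspond to registrations such as:
 *
 *   struct kevent kev;
 *
 *   // relative, repeating 500ms timer (milliseconds are the default unit)
 *   EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *
 *   // one-shot absolute deadline ~60 seconds from now (calendar epoch)
 *   EV_SET(&kev, 2, EVFILT_TIMER, EV_ADD,
 *       NOTE_ABSOLUTE | NOTE_SECONDS, time(NULL) + 60, NULL);
 */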
1579 
1580 /*
1581  * filt_timerexpire - the timer callout routine
1582  */
1583 static void
1584 filt_timerexpire(void *knx, void *state_on_arm)
1585 {
1586 	struct knote *kn = knx;
1587 
1588 	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
1589 	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;
1590 
1591 	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
1592 		// our f_event always would say FILTER_ACTIVE,
1593 		// so be leaner and just do it.
1594 		struct kqueue *kq = knote_get_kq(kn);
1595 		kqlock(kq);
1596 		knote_activate(kq, kn, FILTER_ACTIVE);
1597 		kqunlock(kq);
1598 	} else {
1599 		/*
1600 		 * The timer has been reprogrammed or canceled since it was armed,
1601 		 * and this is a late firing for the timer, just ignore it.
1602 		 * and this is a late firing for the timer; just ignore it.
1603 	}
1604 }
1605 
1606 /*
1607  * Does this deadline needs a timer armed for it, or has it expired?
1608  */
1609 static bool
1610 filt_timer_is_ready(struct knote *kn)
1611 {
1612 	uint64_t now, deadline = kn->kn_ext[0];
1613 
1614 	if (deadline == 0) {
1615 		return true;
1616 	}
1617 
1618 	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1619 		now = mach_continuous_time();
1620 	} else {
1621 		now = mach_absolute_time();
1622 	}
1623 	return deadline <= now;
1624 }
1625 
1626 /*
1627  * Arm a timer
1628  *
1629  * It is the responsibility of the caller to make sure the timer call
1630  * has completed or been cancelled properly prior to arming it.
1631  */
1632 static void
1633 filt_timerarm(struct knote *kn)
1634 {
1635 	uint64_t deadline = kn->kn_ext[0];
1636 	uint64_t leeway   = kn->kn_ext[1];
1637 	uint32_t state;
1638 
1639 	int filter_flags = kn->kn_sfflags;
1640 	unsigned int timer_flags = 0;
1641 
1642 	if (filter_flags & NOTE_CRITICAL) {
1643 		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1644 	} else if (filter_flags & NOTE_BACKGROUND) {
1645 		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1646 	} else {
1647 		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1648 	}
1649 
1650 	if (filter_flags & NOTE_LEEWAY) {
1651 		timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1652 	}
1653 
1654 	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
1655 		timer_flags |= THREAD_CALL_CONTINUOUS;
1656 	}
1657 
1658 	/*
1659 	 * Move to ARMED.
1660 	 *
1661 	 * We increase the gencount, and set up the thread call with this expected
1662 	 * state. It means that if there was a previous generation of the timer in
1663 	 * flight that needs to be ignored, then 3 things are possible:
1664 	 *
1665 	 * - the timer fires first and filt_timerexpire() sets the state to FIRED,
1666 	 *   but we clobber it with ARMED and a new gencount. The knote will still
1667 	 *   be activated, but filt_timerprocess() which is serialized with this
1668 	 *   call will not see the FIRED bit set and will not deliver an event.
1669 	 *
1670 	 * - this code runs first, but filt_timerexpire() comes second. Because it
1671 	 *   knows an old gencount, it will debounce and not activate the knote.
1672 	 *
1673 	 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
1674 	 *   will just cancel it properly.
1675 	 *
1676 	 * This is important as userspace expects to never be woken up for past
1677 	 * timers after filt_timertouch ran.
1678 	 */
1679 	state = os_atomic_load(&kn->kn_hook32, relaxed);
1680 	state &= ~TIMER_STATE_MASK;
1681 	state += TIMER_GEN_INC + TIMER_ARMED;
1682 	os_atomic_store(&kn->kn_hook32, state, relaxed);
1683 
1684 	thread_call_enter_delayed_with_leeway(kn->kn_thcall,
1685 	    (void *)(uintptr_t)state, deadline, leeway, timer_flags);
1686 }
1687 
1688 /*
1689  * Mark a timer as "already fired" when it is being reprogrammed
1690  *
1691  * If there is a timer in flight, this will make a best-effort attempt to cancel it,
1692  * but will not wait. If the thread call was in flight, having set the
1693  * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1694  * cancelation.
1695  */
1696 static void
1697 filt_timerfire_immediate(struct knote *kn)
1698 {
1699 	uint32_t state;
1700 
1701 	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
1702 	    "validate that this atomic or will transition to IMMEDIATE");
1703 	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1704 
1705 	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
1706 		thread_call_cancel(kn->kn_thcall);
1707 	}
1708 }
1709 
1710 /*
1711  * Allocate a thread call for the knote's lifetime, and kick off the timer.
1712  */
1713 static int
1714 filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
1715 {
1716 	thread_call_t callout;
1717 	struct filt_timer_params params;
1718 	int error;
1719 
1720 	if ((error = filt_timervalidate(kev, &params)) != 0) {
1721 		knote_set_error(kn, error);
1722 		return 0;
1723 	}
1724 
1725 	callout = thread_call_allocate_with_options(filt_timerexpire,
1726 	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1727 	    THREAD_CALL_OPTIONS_ONCE);
1728 
1729 	if (NULL == callout) {
1730 		knote_set_error(kn, ENOMEM);
1731 		return 0;
1732 	}
1733 
1734 	filt_timer_set_params(kn, &params);
1735 	kn->kn_thcall = callout;
1736 	kn->kn_flags |= EV_CLEAR;
1737 	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
1738 
1739 	/* NOTE_ABSOLUTE implies EV_ONESHOT */
1740 	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1741 		kn->kn_flags |= EV_ONESHOT;
1742 	}
1743 
1744 	if (filt_timer_is_ready(kn)) {
1745 		os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1746 		return FILTER_ACTIVE;
1747 	} else {
1748 		filt_timerarm(kn);
1749 		return 0;
1750 	}
1751 }
1752 
1753 /*
1754  * Shut down the timer if it's running, and free the callout.
1755  */
1756 static void
1757 filt_timerdetach(struct knote *kn)
1758 {
1759 	__assert_only boolean_t freed;
1760 
1761 	/*
1762 	 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
1763 	 * running anymore.
1764 	 */
1765 	thread_call_cancel_wait(kn->kn_thcall);
1766 	freed = thread_call_free(kn->kn_thcall);
1767 	assert(freed);
1768 }
1769 
1770 /*
1771  * filt_timertouch - update timer knote with new user input
1772  *
1773  * Cancel and restart the timer based on new user data. When
1774  * the user picks up a knote, clear the count of how many timer
1775  * pops have gone off (in kn_data).
1776  */
1777 static int
1778 filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
1779 {
1780 	struct filt_timer_params params;
1781 	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
1782 	int error;
1783 
1784 	if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
1785 	    !_pthread_priority_thread_qos(kev->qos)) {
1786 		/* validate usage of FILTER_UPDATE_REQ_QOS */
1787 		kev->flags |= EV_ERROR;
1788 		kev->data = ERANGE;
1789 		return 0;
1790 	}
1791 
1792 	if (changed_flags & NOTE_ABSOLUTE) {
1793 		kev->flags |= EV_ERROR;
1794 		kev->data = EINVAL;
1795 		return 0;
1796 	}
1797 
1798 	if ((error = filt_timervalidate(kev, &params)) != 0) {
1799 		kev->flags |= EV_ERROR;
1800 		kev->data = error;
1801 		return 0;
1802 	}
1803 
1804 	/* capture the new values used to compute deadline */
1805 	filt_timer_set_params(kn, &params);
1806 	kn->kn_sfflags = kev->fflags;
1807 
1808 	if (filt_timer_is_ready(kn)) {
1809 		filt_timerfire_immediate(kn);
1810 		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
1811 	} else {
1812 		filt_timerarm(kn);
1813 		return FILTER_UPDATE_REQ_QOS;
1814 	}
1815 }
1816 
1817 /*
1818  * filt_timerprocess - query state of knote and snapshot event data
1819  *
1820  * Determine if the timer has fired in the past, snapshot the state
1821  * of the kevent for returning to user-space, and clear pending event
1822  * counters for the next time.
1823  */
1824 static int
1825 filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
1826 {
1827 	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);
1828 
1829 	/*
1830 	 * filt_timerprocess is serialized with any filter routine except for
1831 	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1832 	 * transition, and on success, activates the knote.
1833 	 *
1834 	 * Hence, we don't need atomic modifications of the state, only to peek at
1835 	 * whether we see any of the "FIRED" state, and if we do, it is safe to
1836 	 * do simple state machine transitions.
1837 	 */
1838 	switch (state & TIMER_STATE_MASK) {
1839 	case TIMER_IDLE:
1840 	case TIMER_ARMED:
1841 		/*
1842 		 * This can happen if a touch resets a timer that had fired
1843 		 * without being processed
1844 		 */
1845 		return 0;
1846 	}
1847 
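	/*
	 * Only the TIMER_FIRED and TIMER_IMMEDIATE states reach this point; the
	 * store below clears the state bits while preserving the generation
	 * count, so a late filt_timerexpire() for an older arming stays ignored.
	 */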
1848 	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);
1849 
1850 	/*
1851 	 * Copy out the interesting kevent state,
1852 	 * but don't leak out the raw time calculations.
1853 	 *
1854 	 * TODO: potential enhancements - tell the user about:
1855 	 *      - deadline to which this timer thought it was expiring
1856 	 *      - return kn_sfflags in the fflags field so the client can know
1857 	 *        under what flags the timer fired
1858 	 */
1859 	knote_fill_kevent(kn, kev, 1);
1860 	kev->ext[0] = 0;
1861 	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */
1862 
1863 	if (kn->kn_sdata != 0) {
1864 		/*
1865 		 * This is a 'repeating' timer, so we have to emit
1866 		 * how many intervals expired between the arm
1867 		 * and the process.
1868 		 *
1869 		 * A very strange style of interface, because
1870 		 * this could easily be done in the client...
1871 		 */
1872 
1873 		uint64_t now;
1874 
1875 		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1876 			now = mach_continuous_time();
1877 		} else {
1878 			now = mach_absolute_time();
1879 		}
1880 
1881 		uint64_t first_deadline = kn->kn_ext[0];
1882 		uint64_t interval_abs   = kn->kn_sdata;
1883 		uint64_t orig_arm_time  = first_deadline - interval_abs;
1884 
1885 		assert(now > orig_arm_time);
1886 		assert(now > first_deadline);
1887 
1888 		uint64_t elapsed = now - orig_arm_time;
1889 
1890 		uint64_t num_fired = elapsed / interval_abs;
1891 
1892 		/*
1893 		 * To reach this code, we must have seen the timer pop
1894 		 * and be in repeating mode, so therefore it must have been
1895 		 * more than 'interval' time since the attach or last
1896 		 * successful touch.
1897 		 */
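		/*
		 * Worked example (illustrative numbers): with a 100ms interval and
		 * 350ms elapsed since the original arm time, num_fired == 3, the
		 * knote reports 3 expirations, and (below) the next deadline is
		 * re-armed at first_deadline + 3 * interval, i.e. 50ms from "now".
		 */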
1898 		assert(num_fired > 0);
1899 
1900 		/* report how many intervals have elapsed to the user */
1901 		kev->data = (int64_t)num_fired;
1902 
1903 		/* We only need to re-arm the timer if it's not about to be destroyed */
1904 		if ((kn->kn_flags & EV_ONESHOT) == 0) {
1905 			/* fire at the end of the next interval */
1906 			uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1907 
1908 			assert(new_deadline > now);
1909 
1910 			kn->kn_ext[0] = new_deadline;
1911 
1912 			/*
1913 			 * This can't shortcut setting up the thread call, because
1914 			 * knote_process deactivates EV_CLEAR knotes unconditionally.
1915 			 */
1916 			filt_timerarm(kn);
1917 		}
1918 	}
1919 
1920 	return FILTER_ACTIVE;
1921 }
1922 
1923 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1924 	.f_extended_codes = true,
1925 	.f_attach   = filt_timerattach,
1926 	.f_detach   = filt_timerdetach,
1927 	.f_event    = filt_bad_event,
1928 	.f_touch    = filt_timertouch,
1929 	.f_process  = filt_timerprocess,
1930 };
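
/*
 * Illustrative userspace usage (a hedged sketch, not part of this file): a
 * repeating one-second timer. EVFILT_TIMER knotes are EV_CLEAR, and the
 * returned kev.data carries the number of intervals that elapsed since the
 * last collection, as computed in filt_timerprocess() above.
 *
 *	struct kevent kev, out;
 *	EV_SET(&kev, 2, EVFILT_TIMER, EV_ADD | EV_ENABLE, NOTE_SECONDS, 1, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	for (;;) {
 *		if (kevent(kq, NULL, 0, &out, 1, NULL) == 1) {
 *			printf("%lld interval(s) elapsed\n", (long long)out.data);
 *		}
 *	}
 */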
1931 
1932 #pragma mark user_filtops
1933 
1934 static int
1935 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1936 {
1937 	if (kn->kn_sfflags & NOTE_TRIGGER) {
1938 		kn->kn_hook32 = FILTER_ACTIVE;
1939 	} else {
1940 		kn->kn_hook32 = 0;
1941 	}
1942 	return kn->kn_hook32;
1943 }
1944 
1945 static int
1946 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1947 {
1948 	uint32_t ffctrl;
1949 	int fflags;
1950 
1951 	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1952 	fflags = kev->fflags & NOTE_FFLAGSMASK;
1953 	switch (ffctrl) {
1954 	case NOTE_FFNOP:
1955 		break;
1956 	case NOTE_FFAND:
1957 		kn->kn_sfflags &= fflags;
1958 		break;
1959 	case NOTE_FFOR:
1960 		kn->kn_sfflags |= fflags;
1961 		break;
1962 	case NOTE_FFCOPY:
1963 		kn->kn_sfflags = fflags;
1964 		break;
1965 	}
1966 	kn->kn_sdata = kev->data;
1967 
1968 	if (kev->fflags & NOTE_TRIGGER) {
1969 		kn->kn_hook32 = FILTER_ACTIVE;
1970 	}
1971 	return (int)kn->kn_hook32;
1972 }
1973 
1974 static int
1975 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1976 {
1977 	int result = (int)kn->kn_hook32;
1978 
1979 	if (result) {
1980 		/* EVFILT_USER returns the data that was passed in */
1981 		knote_fill_kevent_with_sdata(kn, kev);
1982 		kev->fflags = kn->kn_sfflags;
1983 		if (kn->kn_flags & EV_CLEAR) {
1984 			/* knote_fill_kevent cleared kn_fflags */
1985 			kn->kn_hook32 = 0;
1986 		}
1987 	}
1988 
1989 	return result;
1990 }
1991 
1992 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
1993 	.f_extended_codes = true,
1994 	.f_attach  = filt_userattach,
1995 	.f_detach  = filt_no_detach,
1996 	.f_event   = filt_bad_event,
1997 	.f_touch   = filt_usertouch,
1998 	.f_process = filt_userprocess,
1999 };
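
/*
 * Illustrative userspace usage (a hedged sketch, not part of this file):
 * EVFILT_USER lets one thread wake up another thread's kevent loop. The knote
 * is registered once, then triggered with NOTE_TRIGGER; filt_usertouch()
 * above latches the trigger and filt_userprocess() delivers it.
 *
 *	struct kevent kev;
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// later, from any thread:
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */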
2000 
2001 #pragma mark workloop_filtops
2002 
2003 #define EPREEMPTDISABLED (-1)
2004 
2005 static inline void
2006 filt_wllock(struct kqworkloop *kqwl)
2007 {
2008 	lck_spin_lock(&kqwl->kqwl_statelock);
2009 }
2010 
2011 static inline void
2012 filt_wlunlock(struct kqworkloop *kqwl)
2013 {
2014 	lck_spin_unlock(&kqwl->kqwl_statelock);
2015 }
2016 
2017 /*
2018  * Returns true when the interlock for the turnstile is the workqueue lock
2019  *
2020  * When this is the case, all turnstile operations are delegated
2021  * to the workqueue subsystem.
2022  *
2023  * This is required because kqueue_threadreq_bind_prepost only holds the
2024  * workqueue lock but needs to move the inheritor from the workloop turnstile
2025  * away from the creator thread, so that this now fulfilled request cannot be
2026  * picked anymore by other threads.
2027  */
2028 static inline bool
2029 filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
2030 {
2031 	return kqr_thread_requested_pending(&kqwl->kqwl_request);
2032 }
2033 
2034 static void
2035 filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
2036     turnstile_update_flags_t flags)
2037 {
2038 	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
2039 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2040 
2041 	/*
2042 	 * binding to the workq should always happen through
2043 	 * workq_kern_threadreq_update_inheritor()
2044 	 */
2045 	assert(!filt_wlturnstile_interlock_is_workq(kqwl));
2046 
2047 	if ((inheritor = kqwl->kqwl_owner)) {
2048 		flags |= TURNSTILE_INHERITOR_THREAD;
2049 	} else if ((inheritor = kqr_thread(kqr))) {
2050 		flags |= TURNSTILE_INHERITOR_THREAD;
2051 	}
2052 
2053 	turnstile_update_inheritor(ts, inheritor, flags);
2054 }
2055 
2056 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
2057 #define FILT_WLATTACH 0
2058 #define FILT_WLTOUCH  1
2059 #define FILT_WLDROP   2
2060 
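/*
 * Sketch of the debounce protocol implemented by filt_wlupdate() below (the
 * ext[] indices are from this file; how userspace fills them is stated here
 * only as an assumption about a libdispatch-style client):
 *
 *	kev.ext[EV_EXTIDX_WL_ADDR]  = address of a 64-bit userspace state word
 *	kev.ext[EV_EXTIDX_WL_MASK]  = the bits of that word the caller cares about
 *	kev.ext[EV_EXTIDX_WL_VALUE] = the value those bits are expected to have
 *
 * The kernel loads the word with copyin_atomic64() and fails the request with
 * ESTALE when (loaded & mask) != (expected & mask), so an update racing with
 * a userspace state change is rejected instead of being applied.
 */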
2061 __result_use_check
2062 static int
2063 filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
2064     struct kevent_qos_s *kev, kq_index_t qos_index, int op)
2065 {
2066 	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
2067 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2068 	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
2069 	kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
2070 	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2071 	int action = KQWL_UTQ_NONE, error = 0;
2072 	bool wl_inheritor_updated = false, needs_wake = false;
2073 	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2074 	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2075 	uint64_t udata = 0;
2076 	struct turnstile *ts = TURNSTILE_NULL;
2077 
2078 	filt_wllock(kqwl);
2079 
2080 again:
2081 	new_owner = cur_owner = kqwl->kqwl_owner;
2082 
2083 	/*
2084 	 * Phase 1:
2085 	 *
2086 	 * If asked, load the uint64 value at the user provided address and compare
2087 	 * it against the passed in mask and expected value.
2088 	 *
2089 	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
2090 	 * a thread reference.
2091 	 *
2092 	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
2093 	 * the current thread, then end ownership.
2094 	 *
2095 	 * Lastly decide whether we need to perform a QoS update.
2096 	 */
2097 	if (uaddr) {
2098 		/*
2099 		 * Until <rdar://problem/24999882> exists,
2100 		 * a copyin done with preemption disabled forces any
2101 		 * vm_fault we encounter to fail.
2102 		 */
2103 		error = copyin_atomic64(uaddr, &udata);
2104 
2105 		/*
2106 		 * If we get EFAULT, drop locks, and retry.
2107 		 * If we still get an error, report it,
2108 		 * else assume the memory has been faulted
2109 		 * and attempt to copyin under lock again.
2110 		 */
2111 		switch (error) {
2112 		case 0:
2113 			break;
2114 		case EFAULT:
2115 			if (efault_retry-- > 0) {
2116 				filt_wlunlock(kqwl);
2117 				error = copyin_atomic64(uaddr, &udata);
2118 				filt_wllock(kqwl);
2119 				if (error == 0) {
2120 					goto again;
2121 				}
2122 			}
2123 			OS_FALLTHROUGH;
2124 		default:
2125 			goto out;
2126 		}
2127 
2128 		/* Update state as copied in.  */
2129 		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2130 
2131 		if ((udata & mask) != (kdata & mask)) {
2132 			error = ESTALE;
2133 		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
2134 			/*
2135 			 * Decipher the owner port name, and translate accordingly.
2136 			 * The low 2 bits were borrowed for other flags, so mask them off.
2137 			 *
2138 			 * Then attempt translation to a thread reference or fail.
2139 			 */
2140 			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
2141 			if (name != MACH_PORT_NULL) {
2142 				name = ipc_entry_name_mask(name);
2143 				extra_thread_ref = port_name_to_thread(name,
2144 				    PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2145 				if (extra_thread_ref == THREAD_NULL) {
2146 					error = EOWNERDEAD;
2147 					goto out;
2148 				}
2149 				new_owner = extra_thread_ref;
2150 			}
2151 		}
2152 	}
2153 
2154 	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
2155 		new_owner = THREAD_NULL;
2156 	}
2157 
2158 	if (error == 0) {
2159 		if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
2160 			action = KQWL_UTQ_SET_QOS_INDEX;
2161 		} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
2162 			action = KQWL_UTQ_SET_QOS_INDEX;
2163 		}
2164 
2165 		if (op == FILT_WLTOUCH) {
2166 			/*
2167 			 * Save off any additional fflags/data we just accepted,
2168 			 * but only keep the last round of "update" bits we acted on, which helps
2169 			 * debugging a lot.
2170 			 */
2171 			kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2172 			kn->kn_sfflags |= kev->fflags;
2173 			if (kev->fflags & NOTE_WL_SYNC_WAKE) {
2174 				needs_wake = (kn->kn_thread != THREAD_NULL);
2175 			}
2176 		} else if (op == FILT_WLDROP) {
2177 			if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
2178 			    NOTE_WL_SYNC_WAIT) {
2179 				/*
2180 				 * When deleting a SYNC_WAIT knote that hasn't been woken up
2181 				 * explicitly, issue a wake up.
2182 				 */
2183 				kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
2184 				needs_wake = (kn->kn_thread != THREAD_NULL);
2185 			}
2186 		}
2187 	}
2188 
2189 	/*
2190 	 * Phase 2:
2191 	 *
2192 	 * Commit ownership and QoS changes if any, possibly wake up waiters
2193 	 */
2194 
2195 	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
2196 		goto out;
2197 	}
2198 
2199 	kqlock(kqwl);
2200 
2201 	/* If already tracked as servicer, don't track as owner */
2202 	if (new_owner == kqr_thread(kqr)) {
2203 		new_owner = THREAD_NULL;
2204 	}
2205 
2206 	if (cur_owner != new_owner) {
2207 		kqwl->kqwl_owner = new_owner;
2208 		if (new_owner == extra_thread_ref) {
2209 			/* we just transferred this ref to kqwl_owner */
2210 			extra_thread_ref = THREAD_NULL;
2211 		}
2212 		cur_override = kqworkloop_override(kqwl);
2213 
2214 		if (new_owner) {
2215 			/* override it before we drop the old */
2216 			if (cur_override != THREAD_QOS_UNSPECIFIED) {
2217 				thread_add_kevent_override(new_owner, cur_override);
2218 			}
2219 			if (kqr_thread_requested_pending(kqr)) {
2220 				if (action == KQWL_UTQ_NONE) {
2221 					action = KQWL_UTQ_REDRIVE_EVENTS;
2222 				}
2223 			}
2224 		} else if (action == KQWL_UTQ_NONE &&
2225 		    !kqr_thread_requested(kqr) &&
2226 		    kqwl->kqwl_wakeup_qos) {
2227 			action = KQWL_UTQ_REDRIVE_EVENTS;
2228 		}
2229 	}
2230 
2231 	if (action != KQWL_UTQ_NONE) {
2232 		kqworkloop_update_threads_qos(kqwl, action, qos_index);
2233 	}
2234 
2235 	ts = kqwl->kqwl_turnstile;
2236 	if (cur_owner != new_owner && ts) {
2237 		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
2238 			/*
2239 			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
2240 			 * the code went through workq_kern_threadreq_initiate()
2241 			 * and the workqueue has set the inheritor already
2242 			 */
2243 			assert(filt_wlturnstile_interlock_is_workq(kqwl));
2244 		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2245 			workq_kern_threadreq_lock(kqwl->kqwl_p);
2246 			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
2247 			    ts, TURNSTILE_IMMEDIATE_UPDATE);
2248 			workq_kern_threadreq_unlock(kqwl->kqwl_p);
2249 			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2250 				/*
2251 				 * If the workq is no longer the interlock, then
2252 				 * workq_kern_threadreq_update_inheritor() has finished a bind
2253 				 * and we need to fallback to the regular path.
2254 				 */
2255 				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2256 			}
2257 			wl_inheritor_updated = true;
2258 		} else {
2259 			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2260 			wl_inheritor_updated = true;
2261 		}
2262 
2263 		/*
2264 		 * We need a turnstile reference because we are dropping the interlock
2265 		 * and the caller has not called turnstile_prepare.
2266 		 */
2267 		if (wl_inheritor_updated) {
2268 			turnstile_reference(ts);
2269 		}
2270 	}
2271 
2272 	if (needs_wake && ts) {
2273 		waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
2274 		    kn->kn_thread, THREAD_AWAKENED);
2275 		if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
2276 			disable_preemption();
2277 			error = EPREEMPTDISABLED;
2278 		}
2279 	}
2280 
2281 	kqunlock(kqwl);
2282 
2283 out:
2284 	/*
2285 	 * Phase 3:
2286 	 *
2287 	 * Unlock and cleanup various lingering references and things.
2288 	 */
2289 	filt_wlunlock(kqwl);
2290 
2291 #if CONFIG_WORKLOOP_DEBUG
2292 	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
2293 		.updater = current_thread(),
2294 		.servicer = kqr_thread(kqr), /* Note: racy */
2295 		.old_owner = cur_owner,
2296 		.new_owner = new_owner,
2297 
2298 		.kev_ident  = kev->ident,
2299 		.error      = (int16_t)error,
2300 		.kev_flags  = kev->flags,
2301 		.kev_fflags = kev->fflags,
2302 
2303 		.kev_mask   = mask,
2304 		.kev_value  = kdata,
2305 		.in_value   = udata,
2306 	});
2307 #endif // CONFIG_WORKLOOP_DEBUG
2308 
2309 	if (wl_inheritor_updated) {
2310 		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
2311 		turnstile_deallocate(ts);
2312 	}
2313 
2314 	if (cur_owner && new_owner != cur_owner) {
2315 		if (cur_override != THREAD_QOS_UNSPECIFIED) {
2316 			thread_drop_kevent_override(cur_owner);
2317 		}
2318 		thread_deallocate_safe(cur_owner);
2319 	}
2320 	if (extra_thread_ref) {
2321 		thread_deallocate_safe(extra_thread_ref);
2322 	}
2323 	return error;
2324 }
2325 
2326 /*
2327  * Remembers the last update that came in from userspace for debugging reasons.
2328  * - fflags is mirrored from the userspace kevent
2329  * - ext[i, i != VALUE] is mirrored from the userspace kevent
2330  * - ext[VALUE] is set to what the kernel loaded atomically
2331  * - data is set to the error if any
2332  */
2333 static inline void
2334 filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
2335     int error)
2336 {
2337 	kn->kn_fflags = kev->fflags;
2338 	kn->kn_sdata = error;
2339 	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2340 }
2341 
2342 static int
2343 filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
2344     struct kevent_qos_s *kev, int op)
2345 {
2346 	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
2347 	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2348 	uint64_t mask  = kev->ext[EV_EXTIDX_WL_MASK];
2349 	uint64_t udata = 0;
2350 	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2351 	int error = 0;
2352 
2353 	if (op == FILT_WLATTACH) {
2354 		(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
2355 	} else if (uaddr == 0) {
2356 		return 0;
2357 	}
2358 
2359 	filt_wllock(kqwl);
2360 
2361 again:
2362 
2363 	/*
2364 	 * Do the debounce thing, the lock serializing the state is the knote lock.
2365 	 * Do the debounce thing; the lock serializing the state is the knote lock.
2366 	if (uaddr) {
2367 		/*
2368 		 * Until <rdar://problem/24999882> exists,
2369 		 * a copyin done with preemption disabled forces any
2370 		 * vm_fault we encounter to fail.
2371 		 */
2372 		error = copyin_atomic64(uaddr, &udata);
2373 
2374 		/*
2375 		 * If we get EFAULT, drop locks, and retry.
2376 		 * If we still get an error report it,
2377 		 * If we still get an error, report it,
2378 		 * and attempt to copyin under lock again.
2379 		 */
2380 		switch (error) {
2381 		case 0:
2382 			break;
2383 		case EFAULT:
2384 			if (efault_retry-- > 0) {
2385 				filt_wlunlock(kqwl);
2386 				error = copyin_atomic64(uaddr, &udata);
2387 				filt_wllock(kqwl);
2388 				if (error == 0) {
2389 					goto again;
2390 				}
2391 			}
2392 			OS_FALLTHROUGH;
2393 		default:
2394 			goto out;
2395 		}
2396 
2397 		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2398 		kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;
2399 
2400 		if ((udata & mask) != (kdata & mask)) {
2401 			error = ESTALE;
2402 			goto out;
2403 		}
2404 	}
2405 
2406 	if (op == FILT_WLATTACH) {
2407 		error = filt_wlattach_sync_ipc(kn);
2408 		if (error == 0) {
2409 			disable_preemption();
2410 			error = EPREEMPTDISABLED;
2411 		}
2412 	}
2413 
2414 out:
2415 	filt_wlunlock(kqwl);
2416 	return error;
2417 }
2418 
2419 static int
2420 filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
2421 {
2422 	struct kqueue *kq = knote_get_kq(kn);
2423 	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2424 	int error = 0, result = 0;
2425 	kq_index_t qos_index = 0;
2426 
2427 	if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
2428 		error = ENOTSUP;
2429 		goto out;
2430 	}
2431 
2432 	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2433 	switch (command) {
2434 	case NOTE_WL_THREAD_REQUEST:
2435 		if (kn->kn_id != kqwl->kqwl_dynamicid) {
2436 			error = EINVAL;
2437 			goto out;
2438 		}
2439 		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
2440 		if (qos_index == THREAD_QOS_UNSPECIFIED) {
2441 			error = ERANGE;
2442 			goto out;
2443 		}
2444 		if (kqwl->kqwl_request.tr_kq_qos_index) {
2445 			/*
2446 			 * There already is a thread request, and well, you're only allowed
2447 			 * one per workloop, so fail the attach.
2448 			 */
2449 			error = EALREADY;
2450 			goto out;
2451 		}
2452 		break;
2453 	case NOTE_WL_SYNC_WAIT:
2454 	case NOTE_WL_SYNC_WAKE:
2455 		if (kn->kn_id == kqwl->kqwl_dynamicid) {
2456 			error = EINVAL;
2457 			goto out;
2458 		}
2459 		if ((kn->kn_flags & EV_DISABLE) == 0) {
2460 			error = EINVAL;
2461 			goto out;
2462 		}
2463 		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2464 			error = EINVAL;
2465 			goto out;
2466 		}
2467 		break;
2468 
2469 	case NOTE_WL_SYNC_IPC:
2470 		if ((kn->kn_flags & EV_DISABLE) == 0) {
2471 			error = EINVAL;
2472 			goto out;
2473 		}
2474 		if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
2475 			error = EINVAL;
2476 			goto out;
2477 		}
2478 		break;
2479 	default:
2480 		error = EINVAL;
2481 		goto out;
2482 	}
2483 
2484 	if (command == NOTE_WL_SYNC_IPC) {
2485 		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
2486 	} else {
2487 		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
2488 	}
2489 
2490 	if (error == EPREEMPTDISABLED) {
2491 		error = 0;
2492 		result = FILTER_THREADREQ_NODEFEER;
2493 	}
2494 out:
2495 	if (error) {
2496 		/* If userland wants ESTALE to be hidden, fail the attach anyway */
2497 		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2498 			error = 0;
2499 		}
2500 		knote_set_error(kn, error);
2501 		return result;
2502 	}
2503 	if (command == NOTE_WL_SYNC_WAIT) {
2504 		return kevent_register_wait_prepare(kn, kev, result);
2505 	}
2506 	/* Just attaching the thread request successfully will fire it */
2507 	if (command == NOTE_WL_THREAD_REQUEST) {
2508 		/*
2509 		 * Thread Request knotes need an explicit touch to be active again,
2510 		 * so delivering an event needs to also consume it.
2511 		 */
2512 		kn->kn_flags |= EV_CLEAR;
2513 		return result | FILTER_ACTIVE;
2514 	}
2515 	return result;
2516 }
2517 
2518 static void __dead2
2519 filt_wlwait_continue(void *parameter, wait_result_t wr)
2520 {
2521 	struct _kevent_register *cont_args = parameter;
2522 	struct kqworkloop *kqwl = cont_args->kqwl;
2523 
2524 	kqlock(kqwl);
2525 	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2526 		workq_kern_threadreq_lock(kqwl->kqwl_p);
2527 		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2528 		workq_kern_threadreq_unlock(kqwl->kqwl_p);
2529 	} else {
2530 		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2531 	}
2532 	kqunlock(kqwl);
2533 
2534 	turnstile_cleanup();
2535 
2536 	if (wr == THREAD_INTERRUPTED) {
2537 		cont_args->kev.flags |= EV_ERROR;
2538 		cont_args->kev.data = EINTR;
2539 	} else if (wr != THREAD_AWAKENED) {
2540 		panic("Unexpected wait result: %d", wr);
2541 	}
2542 
2543 	kevent_register_wait_return(cont_args);
2544 }
2545 
2546 /*
2547  * Called with the workloop mutex held; most of the time it never returns, as it
2548  * calls filt_wlwait_continue through a continuation.
2549  */
2550 static void __dead2
2551 filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
2552     struct _kevent_register *cont_args)
2553 {
2554 	struct kqworkloop *kqwl = cont_args->kqwl;
2555 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2556 	struct turnstile *ts;
2557 	bool workq_locked = false;
2558 
2559 	kqlock_held(kqwl);
2560 
2561 	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2562 		workq_kern_threadreq_lock(kqwl->kqwl_p);
2563 		workq_locked = true;
2564 	}
2565 
2566 	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
2567 	    TURNSTILE_NULL, TURNSTILE_WORKLOOPS);
2568 
2569 	if (workq_locked) {
2570 		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
2571 		    &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
2572 		    TURNSTILE_DELAYED_UPDATE);
2573 		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2574 			/*
2575 			 * if the interlock is no longer the workqueue lock,
2576 			 * then we don't need to hold it anymore.
2577 			 */
2578 			workq_kern_threadreq_unlock(kqwl->kqwl_p);
2579 			workq_locked = false;
2580 		}
2581 	}
2582 	if (!workq_locked) {
2583 		/*
2584 		 * If the interlock is the workloop's, then it's our responsibility to
2585 		 * call update_inheritor, so just do it.
2586 		 */
2587 		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
2588 	}
2589 
2590 	thread_set_pending_block_hint(get_machthread(uth), kThreadWaitWorkloopSyncWait);
2591 	waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
2592 	    THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
2593 
2594 	if (workq_locked) {
2595 		workq_kern_threadreq_unlock(kqwl->kqwl_p);
2596 	}
2597 
2598 	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
2599 	if (thread) {
2600 		thread_reference(thread);
2601 	}
2602 
2603 	kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
2604 }
2605 
2606 /* called in stackshot context to report the thread responsible for blocking this thread */
2607 void
2608 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2609     event64_t event, thread_waitinfo_t *waitinfo)
2610 {
2611 	struct knote *kn = (struct knote *)event;
2612 
2613 	zone_require(knote_zone, kn);
2614 
2615 	assert(kn->kn_thread == thread);
2616 
2617 	struct kqueue *kq = knote_get_kq(kn);
2618 
2619 	zone_require(kqworkloop_zone, kq);
2620 	assert(kq->kq_state & KQ_WORKLOOP);
2621 
2622 	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2623 	workq_threadreq_t kqr = &kqwl->kqwl_request;
2624 
2625 	thread_t kqwl_owner = kqwl->kqwl_owner;
2626 
2627 	if (kqwl_owner != THREAD_NULL) {
2628 		thread_require(kqwl_owner);
2629 		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2630 	} else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
2631 		thread_require(kqr->tr_thread);
2632 		waitinfo->owner = thread_tid(kqr->tr_thread);
2633 	} else if (kqr_thread_requested_pending(kqr)) { /* > idle, < bound */
2634 		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2635 	} else {
2636 		waitinfo->owner = 0;
2637 	}
2638 
2639 	waitinfo->context = kqwl->kqwl_dynamicid;
2640 }
2641 
2642 static void
2643 filt_wldetach(struct knote *kn)
2644 {
2645 	if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2646 		filt_wldetach_sync_ipc(kn);
2647 	} else if (kn->kn_thread) {
2648 		kevent_register_wait_cleanup(kn);
2649 	}
2650 }
2651 
2652 static int
2653 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2654     thread_qos_t *qos_index)
2655 {
2656 	uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2657 	uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2658 
2659 	if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2660 		return EINVAL;
2661 	}
2662 	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2663 		if (kev->flags & EV_DELETE) {
2664 			return EINVAL;
2665 		}
2666 		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2667 			return EINVAL;
2668 		}
2669 		if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2670 			return ERANGE;
2671 		}
2672 	}
2673 
2674 	switch (new_commands) {
2675 	case NOTE_WL_THREAD_REQUEST:
2676 		/* thread requests can only update themselves */
2677 		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2678 			return EINVAL;
2679 		}
2680 		break;
2681 
2682 	case NOTE_WL_SYNC_WAIT:
2683 		if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2684 			return EINVAL;
2685 		}
2686 		goto sync_checks;
2687 
2688 	case NOTE_WL_SYNC_WAKE:
2689 sync_checks:
2690 		if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2691 			return EINVAL;
2692 		}
2693 		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2694 			return EINVAL;
2695 		}
2696 		break;
2697 
2698 	case NOTE_WL_SYNC_IPC:
2699 		if (sav_commands != NOTE_WL_SYNC_IPC) {
2700 			return EINVAL;
2701 		}
2702 		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2703 			return EINVAL;
2704 		}
2705 		break;
2706 
2707 	default:
2708 		return EINVAL;
2709 	}
2710 	return 0;
2711 }
2712 
2713 static int
2714 filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
2715 {
2716 	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2717 	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
2718 	int result = 0;
2719 
2720 	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
2721 	if (error) {
2722 		goto out;
2723 	}
2724 
2725 	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2726 	if (command == NOTE_WL_SYNC_IPC) {
2727 		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
2728 	} else {
2729 		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
2730 		filt_wlremember_last_update(kn, kev, error);
2731 	}
2732 	if (error == EPREEMPTDISABLED) {
2733 		error = 0;
2734 		result = FILTER_THREADREQ_NODEFEER;
2735 	}
2736 
2737 out:
2738 	if (error) {
2739 		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2740 			/* If userland wants ESTALE to be hidden, do not activate */
2741 			return result;
2742 		}
2743 		kev->flags |= EV_ERROR;
2744 		kev->data = error;
2745 		return result;
2746 	}
2747 	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
2748 		return kevent_register_wait_prepare(kn, kev, result);
2749 	}
2750 	/* Just touching the thread request successfully will fire it */
2751 	if (command == NOTE_WL_THREAD_REQUEST) {
2752 		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2753 			result |= FILTER_UPDATE_REQ_QOS;
2754 		}
2755 		result |= FILTER_ACTIVE;
2756 	}
2757 	return result;
2758 }
2759 
2760 static bool
2761 filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
2762 {
2763 	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2764 
2765 	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
2766 	if (error) {
2767 		goto out;
2768 	}
2769 
2770 	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
2771 	if (command == NOTE_WL_SYNC_IPC) {
2772 		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
2773 	} else {
2774 		error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
2775 		filt_wlremember_last_update(kn, kev, error);
2776 	}
2777 	assert(error != EPREEMPTDISABLED);
2778 
2779 out:
2780 	if (error) {
2781 		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2782 			return false;
2783 		}
2784 		kev->flags |= EV_ERROR;
2785 		kev->data = error;
2786 		return false;
2787 	}
2788 	return true;
2789 }
2790 
2791 static int
2792 filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
2793 {
2794 	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2795 	int rc = 0;
2796 
2797 	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2798 
2799 	kqlock(kqwl);
2800 
2801 	if (kqwl->kqwl_owner) {
2802 		/*
2803 		 * <rdar://problem/33584321> userspace can sometimes, due to events being
2804 		 * delivered without triggering a drain session, cause a process
2805 		 * of the thread request knote.
2806 		 *
2807 		 * When that happens, the automatic deactivation due to process
2808 		 * would swallow the event, so we have to activate the knote again.
2809 		 */
2810 		knote_activate(kqwl, kn, FILTER_ACTIVE);
2811 	} else {
2812 #if DEBUG || DEVELOPMENT
2813 		if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
2814 			/*
2815 			 * see src/queue_internal.h in libdispatch
2816 			 */
2817 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
2818 			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2819 			task_t t = current_task();
2820 			uint64_t val;
2821 			if (addr && task_is_active(t) && !task_is_halting(t) &&
2822 			    copyin_atomic64(addr, &val) == 0 &&
2823 			    val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
2824 			    (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
2825 				panic("kevent: workloop %#016llx is not enqueued "
2826 				    "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2827 				    kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2828 			}
2829 		}
2830 #endif
2831 		knote_fill_kevent(kn, kev, 0);
2832 		kev->fflags = kn->kn_sfflags;
2833 		rc |= FILTER_ACTIVE;
2834 	}
2835 
2836 	kqunlock(kqwl);
2837 
2838 	if (rc & FILTER_ACTIVE) {
2839 		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
2840 	}
2841 	return rc;
2842 }
2843 
2844 SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
2845 	.f_extended_codes = true,
2846 	.f_attach  = filt_wlattach,
2847 	.f_detach  = filt_wldetach,
2848 	.f_event   = filt_bad_event,
2849 	.f_touch   = filt_wltouch,
2850 	.f_process = filt_wlprocess,
2851 	.f_allow_drop = filt_wlallow_drop,
2852 	.f_post_register_wait = filt_wlpost_register_wait,
2853 };
2854 
2855 #pragma mark - kqueues allocation and deallocation
2856 
2857 OS_NOINLINE
2858 static void
2859 kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2860 
2861 static inline bool
2862 kqworkloop_try_retain(struct kqworkloop *kqwl)
2863 {
2864 	return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
2865 }
2866 
2867 static inline void
2868 kqworkloop_retain(struct kqworkloop *kqwl)
2869 {
2870 	return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
2871 }
2872 
2873 OS_ALWAYS_INLINE
2874 static inline void
2875 kqueue_retain(kqueue_t kqu)
2876 {
2877 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2878 		kqworkloop_retain(kqu.kqwl);
2879 	}
2880 }
2881 
2882 OS_ALWAYS_INLINE
2883 static inline void
2884 kqworkloop_release_live(struct kqworkloop *kqwl)
2885 {
2886 	os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
2887 }
2888 
2889 OS_ALWAYS_INLINE
2890 static inline void
2891 kqueue_release_live(kqueue_t kqu)
2892 {
2893 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2894 		kqworkloop_release_live(kqu.kqwl);
2895 	}
2896 }
2897 
2898 OS_ALWAYS_INLINE
2899 static inline void
2900 kqworkloop_release(struct kqworkloop *kqwl)
2901 {
2902 	if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == 0) {
2903 		kqworkloop_dealloc(kqwl, true);
2904 	}
2905 }
2906 
2907 OS_ALWAYS_INLINE
2908 static inline void
2909 kqueue_release(kqueue_t kqu)
2910 {
2911 	if (kqu.kq->kq_state & KQ_DYNAMIC) {
2912 		kqworkloop_release(kqu.kqwl);
2913 	}
2914 }
2915 
2916 /*!
2917  * @function kqueue_destroy
2918  *
2919  * @brief
2920  * Common part to all kqueue dealloc functions.
2921  */
2922 OS_NOINLINE
2923 static void
2924 kqueue_destroy(kqueue_t kqu, zone_t zone)
2925 {
2926 	lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);
2927 
2928 	zfree(zone, kqu.kq);
2929 }
2930 
2931 /*!
2932  * @function kqueue_init
2933  *
2934  * @brief
2935  * Common part to all kqueue alloc functions.
2936  */
2937 static kqueue_t
2938 kqueue_init(kqueue_t kqu)
2939 {
2940 	lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
2941 	return kqu;
2942 }
2943 
2944 #pragma mark kqfile allocation and deallocation
2945 
2946 /*!
2947  * @function kqueue_dealloc
2948  *
2949  * @brief
2950  * Detach all knotes from a kqfile and free it.
2951  *
2952  * @discussion
2953  * We walk each list looking for knotes referencing
2954  * this kqueue.  If we find one, we try to drop it.  But
2955  * if we fail to get a drop reference, that will wait
2956  * until it is dropped.  So, we can just restart again
2957  * safe in the assumption that the list will eventually
2958  * not contain any more references to this kqueue (either
2959  * we dropped them all, or someone else did).
2960  *
2961  * Assumes no new events are being added to the kqueue.
2962  * Nothing locked on entry or exit.
2963  */
2964 void
2965 kqueue_dealloc(struct kqueue *kq)
2966 {
2967 	KNOTE_LOCK_CTX(knlc);
2968 	struct proc *p = kq->kq_p;
2969 	struct filedesc *fdp = &p->p_fd;
2970 	struct knote *kn;
2971 
2972 	assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
2973 
2974 	proc_fdlock(p);
2975 	for (int i = 0; i < fdp->fd_knlistsize; i++) {
2976 		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2977 		while (kn != NULL) {
2978 			if (kq == knote_get_kq(kn)) {
2979 				kqlock(kq);
2980 				proc_fdunlock(p);
2981 				if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2982 					knote_drop(kq, kn, &knlc);
2983 				}
2984 				proc_fdlock(p);
2985 				/* start over at beginning of list */
2986 				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2987 				continue;
2988 			}
2989 			kn = SLIST_NEXT(kn, kn_link);
2990 		}
2991 	}
2992 
2993 	knhash_lock(fdp);
2994 	proc_fdunlock(p);
2995 
2996 	if (fdp->fd_knhashmask != 0) {
2997 		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
2998 			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2999 			while (kn != NULL) {
3000 				if (kq == knote_get_kq(kn)) {
3001 					kqlock(kq);
3002 					knhash_unlock(fdp);
3003 					if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
3004 						knote_drop(kq, kn, &knlc);
3005 					}
3006 					knhash_lock(fdp);
3007 					/* start over at beginning of list */
3008 					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3009 					continue;
3010 				}
3011 				kn = SLIST_NEXT(kn, kn_link);
3012 			}
3013 		}
3014 	}
3015 	knhash_unlock(fdp);
3016 
3017 	kqueue_destroy(kq, kqfile_zone);
3018 }
3019 
3020 /*!
3021  * @function kqueue_alloc
3022  *
3023  * @brief
3024  * Allocate a kqfile.
3025  */
3026 struct kqueue *
3027 kqueue_alloc(struct proc *p)
3028 {
3029 	struct kqfile *kqf;
3030 
3031 	/*
3032 	 * kqfiles are created with kqueue() so we need to wait for
3033 	 * the first kevent syscall to know which bit among
3034 	 * KQ_KEV_{32,64,QOS} will be set in kqf_state
3035 	 */
3036 	kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
3037 	kqf->kqf_p = p;
3038 	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
3039 	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
3040 
3041 	return kqueue_init(kqf).kq;
3042 }
3043 
3044 /*!
3045  * @function kqueue_internal
3046  *
3047  * @brief
3048  * Core implementation for kqueue and guarded_kqueue_np()
3049  */
3050 int
3051 kqueue_internal(struct proc *p, fp_initfn_t fp_init, void *initarg, int32_t *retval)
3052 {
3053 	struct kqueue *kq;
3054 	struct fileproc *fp;
3055 	int fd, error;
3056 
3057 	error = falloc_withinit(p, current_cached_proc_cred(p),
3058 	    vfs_context_current(), &fp, &fd, fp_init, initarg);
3059 	if (error) {
3060 		return error;
3061 	}
3062 
3063 	kq = kqueue_alloc(p);
3064 	if (kq == NULL) {
3065 		fp_free(p, fd, fp);
3066 		return ENOMEM;
3067 	}
3068 
3069 	fp->fp_flags |= FP_CLOEXEC | FP_CLOFORK;
3070 	fp->f_flag = FREAD | FWRITE;
3071 	fp->f_ops = &kqueueops;
3072 	fp_set_data(fp, kq);
3073 	fp->f_lflags |= FG_CONFINED;
3074 
3075 	proc_fdlock(p);
3076 	procfdtbl_releasefd(p, fd, NULL);
3077 	fp_drop(p, fd, fp, 1);
3078 	proc_fdunlock(p);
3079 
3080 	*retval = fd;
3081 	return error;
3082 }
3083 
3084 /*!
3085  * @function kqueue
3086  *
3087  * @brief
3088  * The kqueue syscall.
3089  */
3090 int
3091 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3092 {
3093 	return kqueue_internal(p, NULL, NULL, retval);
3094 }
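
/*
 * Illustrative userspace usage (a hedged sketch, not part of this file): the
 * syscall above just hands back a file descriptor that subsequent kevent()
 * calls operate on.
 *
 *	int kq = kqueue();
 *	if (kq < 0) {
 *		err(1, "kqueue");
 *	}
 *	// ... register and collect events with kevent(kq, ...) ...
 *	close(kq);
 */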
3095 
3096 #pragma mark kqworkq allocation and deallocation
3097 
3098 /*!
3099  * @function kqworkq_dealloc
3100  *
3101  * @brief
3102  * Deallocates a workqueue kqueue.
3103  *
3104  * @discussion
3105  * This only happens at process death, or for races with concurrent
3106  * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3107  * this kqueue: either there are none, or someone else took care of them.
3108  */
3109 void
3110 kqworkq_dealloc(struct kqworkq *kqwq)
3111 {
3112 	kqueue_destroy(kqwq, kqworkq_zone);
3113 }
3114 
3115 /*!
3116  * @function kqworkq_alloc
3117  *
3118  * @brief
3119  * Allocates a workqueue kqueue.
3120  *
3121  * @discussion
3122  * This is the slow path of kevent_get_kqwq.
3123  * This takes care of making sure procs have a single workq kqueue.
3124  */
3125 OS_NOINLINE
3126 static struct kqworkq *
3127 kqworkq_alloc(struct proc *p, unsigned int flags)
3128 {
3129 	struct kqworkq *kqwq, *tmp;
3130 
3131 	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);
3132 
3133 	assert((flags & KEVENT_FLAG_LEGACY32) == 0);
3134 	if (flags & KEVENT_FLAG_LEGACY64) {
3135 		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
3136 	} else {
3137 		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
3138 	}
3139 	kqwq->kqwq_p = p;
3140 
3141 	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3142 		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
3143 		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
3144 	}
3145 	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3146 		/*
3147 		 * Because of how the bucketized system works, we mix overcommit
3148 		 * sources with non-overcommit ones: each time we move a knote from
3149 		 * one bucket to the next due to overrides, we'd have to track
3150 		 * overcommitness, and it's really not worth tracking this faithfully
3151 		 * in the workloop-enabled world.
3152 		 *
3153 		 * Incidentally, this behaves like the original manager-based
3154 		 * kqwq where event delivery always happened (hence is
3155 		 * "overcommit")
3156 		 */
3157 		kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
3158 		kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
3159 		if (i != KQWQ_QOS_MANAGER) {
3160 			kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
3161 		}
3162 		kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + 1;
3163 	}
3164 
3165 	kqueue_init(kqwq);
3166 
3167 	if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
3168 		kqworkq_dealloc(kqwq);
3169 		return tmp;
3170 	}
3171 
3172 	return kqwq;
3173 }
3174 
3175 #pragma mark kqworkloop allocation and deallocation
3176 
3177 #define KQ_HASH(val, mask)  (((val) ^ (val >> 8)) & (mask))
3178 #define CONFIG_KQ_HASHSIZE  CONFIG_KN_HASHSIZE
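
/*
 * Worked example (illustrative values): with a dynamic id of 0x1234 and a
 * hash mask of 0xff, KQ_HASH(0x1234, 0xff) == ((0x1234 ^ 0x12) & 0xff)
 * == 0x26, i.e. bucket 0x26. Folding the second byte into the low byte keeps
 * ids that differ only in bits 8-15 from hashing to the same bucket.
 */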
3179 
3180 OS_ALWAYS_INLINE
3181 static inline void
3182 kqhash_lock(struct filedesc *fdp)
3183 {
3184 	lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
3185 }
3186 
3187 OS_ALWAYS_INLINE
3188 static inline void
3189 kqhash_unlock(struct filedesc *fdp)
3190 {
3191 	lck_mtx_unlock(&fdp->fd_kqhashlock);
3192 }
3193 
3194 OS_ALWAYS_INLINE
3195 static inline void
3196 kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3197     struct kqworkloop *kqwl)
3198 {
3199 	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3200 	LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3201 }
3202 
3203 OS_ALWAYS_INLINE
3204 static inline struct kqworkloop *
3205 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3206 {
3207 	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3208 	struct kqworkloop *kqwl;
3209 
3210 	LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3211 		if (kqwl->kqwl_dynamicid == id) {
3212 			return kqwl;
3213 		}
3214 	}
3215 	return NULL;
3216 }
3217 
3218 static struct kqworkloop *
3219 kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
3220 {
3221 	struct kqworkloop *kqwl = NULL;
3222 
3223 	kqhash_lock(fdp);
3224 	if (__probable(fdp->fd_kqhash)) {
3225 		kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
3226 		if (kqwl && !kqworkloop_try_retain(kqwl)) {
3227 			kqwl = NULL;
3228 		}
3229 	}
3230 	kqhash_unlock(fdp);
3231 	return kqwl;
3232 }
3233 
3234 OS_NOINLINE
3235 static void
3236 kqworkloop_hash_init(struct filedesc *fdp)
3237 {
3238 	struct kqwllist *alloc_hash;
3239 	u_long alloc_mask;
3240 
3241 	kqhash_unlock(fdp);
3242 	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3243 	kqhash_lock(fdp);
3244 
3245 	/* See if we won the race */
3246 	if (__probable(fdp->fd_kqhashmask == 0)) {
3247 		fdp->fd_kqhash = alloc_hash;
3248 		fdp->fd_kqhashmask = alloc_mask;
3249 	} else {
3250 		kqhash_unlock(fdp);
3251 		hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
3252 		kqhash_lock(fdp);
3253 	}
3254 }
3255 
3256 /*
3257  * The kqueue iotier override is only supported for a kqueue that has
3258  * a single port as a mach port source. Updating the iotier
3259  * override on the mach port source will update the override
3260  * on the kqueue as well. Since a kqueue with an iotier override
3261  * only has one port attached, there is no saturation logic as with
3262  * qos overrides; the iotier override of the mach port source
3263  * is simply reflected in the kevent iotier override.
3264  */
3265 void
3266 kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3267 {
3268 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3269 		return;
3270 	}
3271 
3272 	struct kqworkloop *kqwl = kqu.kqwl;
3273 	os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3274 }
3275 
3276 uint8_t
3277 kqueue_get_iotier_override(kqueue_t kqu)
3278 {
3279 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3280 		return THROTTLE_LEVEL_END;
3281 	}
3282 
3283 	struct kqworkloop *kqwl = kqu.kqwl;
3284 	return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3285 }
3286 
3287 #if CONFIG_PREADOPT_TG
3288 /*
3289  * This function is called with a borrowed reference on the thread group, without
3290  * the kq lock held but with the mqueue lock held. It may or may not have the knote lock
3291  * (it is called from both fevent and fattach/ftouch). Upon success, an
3292  * additional reference on the TG is taken.
3293  */
3294 void
3295 kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
3296 {
3297 	if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3298 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
3299 		    (uintptr_t)thread_tid(current_thread()), 0, 0, 0);
3300 		return;
3301 	}
3302 
3303 	struct kqworkloop *kqwl = kqu.kqwl;
3304 
3305 	assert(qos < THREAD_QOS_LAST);
3306 
3307 	thread_group_retain(tg);
3308 
3309 	thread_group_qos_t old_tg; thread_group_qos_t new_tg;
3310 	int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
3311 		if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
3312 		        os_atomic_rmw_loop_give_up(break);
3313 		}
3314 
3315 		if (old_tg != KQWL_PREADOPTED_TG_NULL) {
3316 		        /*
3317 		         * Note that old_tg could be a NULL TG pointer but with a QoS
3318 		         * set. See also workq_thread_reset_pri.
3319 		         *
3320 		         * Compare the QoS of existing preadopted tg with new one and
3321 		         * only overwrite the thread group if we have one with a higher
3322 		         * QoS.
3323 		         */
3324 		        thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
3325 		        if (existing_qos >= qos) {
3326 		                os_atomic_rmw_loop_give_up(break);
3327 			}
3328 		}
3329 
3330 		// Transfer the ref taken earlier in the function to the kqwl
3331 		new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
3332 	});
3333 
3334 	if (ret) {
3335 		KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);
3336 
3337 		if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
3338 			thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
3339 		}
3340 
3341 		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
3342 	} else {
3343 		// We failed to write to the kqwl_preadopt_tg, drop the ref we took
3344 		// earlier in the function
3345 		thread_group_deallocate_safe(tg);
3346 	}
3347 }
3348 
3349 /*
3350  * Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
3351  */
3352 bool
3353 kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue *kq, struct thread_group *tg)
3354 {
3355 	bool success = false;
3356 	if (kq->kq_state & KQ_WORKLOOP) {
3357 		struct kqworkloop *kqwl = (struct kqworkloop *) kq;
3358 		thread_group_qos_t old_tg;
3359 		success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
3360 		    KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
3361 		    &old_tg, relaxed);
3362 		if (success) {
3363 			thread_set_preadopt_thread_group(thread, tg);
3364 		} else if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
3365 			/*
3366 			 * Technically the following set_preadopt should be a no-op since this
3367 			 * servicer thread preadopts kqwl's permanent tg at bind time.
3368 			 * See kqueue_threadreq_bind.
3369 			 */
3370 			thread_set_preadopt_thread_group(thread, KQWL_GET_PREADOPTED_TG(old_tg));
3371 		} else {
3372 			assert(old_tg == KQWL_PREADOPTED_TG_PROCESSED ||
3373 			    old_tg == KQWL_PREADOPTED_TG_NEVER);
3374 		}
3375 	}
3376 	return success;
3377 }
3378 #endif
3379 
3380 /*!
3381  * @function kqworkloop_dealloc
3382  *
3383  * @brief
3384  * Deallocates a workloop kqueue.
3385  *
3386  * @discussion
3387  * Knotes hold references on the workloop, so we can't really reach this
3388  * function unless all of these are already gone.
3389  *
3390  * Nothing locked on entry or exit.
3391  *
3392  * @param hash_remove
3393  * Whether to remove the workloop from its hash table.
3394  */
3395 static void
3396 kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
3397 {
3398 	thread_t cur_owner;
3399 
3400 	cur_owner = kqwl->kqwl_owner;
3401 	if (cur_owner) {
3402 		if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
3403 			thread_drop_kevent_override(cur_owner);
3404 		}
3405 		thread_deallocate(cur_owner);
3406 		kqwl->kqwl_owner = THREAD_NULL;
3407 	}
3408 
3409 	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
3410 		struct turnstile *ts;
3411 		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
3412 		    &ts, TURNSTILE_WORKLOOPS);
3413 		turnstile_cleanup();
3414 		turnstile_deallocate(ts);
3415 	}
3416 
3417 	if (hash_remove) {
3418 		struct filedesc *fdp = &kqwl->kqwl_p->p_fd;
3419 
3420 		kqhash_lock(fdp);
3421 		LIST_REMOVE(kqwl, kqwl_hashlink);
3422 #if CONFIG_PROC_RESOURCE_LIMITS
3423 		fdp->num_kqwls--;
3424 #endif
3425 		kqhash_unlock(fdp);
3426 	}
3427 
3428 #if CONFIG_PREADOPT_TG
3429 	thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3430 	if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
3431 		thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
3432 	}
3433 #endif
3434 
3435 	workq_threadreq_t kqr = &kqwl->kqwl_request;
3436 	if ((kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) && kqr->tr_work_interval) {
3437 		kern_work_interval_release(kqr->tr_work_interval);
3438 	}
3439 
3440 	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
3441 	assert(kqwl->kqwl_owner == THREAD_NULL);
3442 	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
3443 
3444 	lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
3445 	kqueue_destroy(kqwl, kqworkloop_zone);
3446 }
3447 
3448 /*!
3449  * @function kqworkloop_init
3450  *
3451  * @brief
3452  * Initializes an allocated kqworkloop.
3453  */
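/*
 * The thread request params are translated into workq_tr_flags as follows:
 *   TRP_PRIORITY     -> WORKQ_TR_FLAG_WL_OUTSIDE_QOS
 *   TRP_BOUND_THREAD -> WORKQ_TR_FLAG_PERMANENT_BIND
 *   any trp flag     -> WORKQ_TR_FLAG_WL_PARAMS
 * in addition to the unconditional WORKQ_TR_FLAG_WORKLOOP.
 */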
3454 static void
3455 kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
3456     kqueue_id_t id, workq_threadreq_param_t *trp,
3457     struct workq_threadreq_extended_param_s *trp_extended)
3458 {
3459 	kqwl->kqwl_state     = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
3460 	os_ref_init_raw(&kqwl->kqwl_retains, NULL);
3461 	kqwl->kqwl_dynamicid = id;
3462 	kqwl->kqwl_p         = p;
3463 	if (trp) {
3464 		kqwl->kqwl_params = trp->trp_value;
3465 	}
3466 
3467 	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
3468 	if (trp) {
3469 		if (trp->trp_flags & TRP_PRIORITY) {
3470 			tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
3471 		}
3472 		if (trp->trp_flags & TRP_BOUND_THREAD) {
3473 			tr_flags |= WORKQ_TR_FLAG_PERMANENT_BIND;
3474 		}
3475 		if (trp->trp_flags) {
3476 			tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
3477 		}
3478 	}
3479 	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
3480 	kqwl->kqwl_request.tr_flags = tr_flags;
3481 	os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
3482 #if CONFIG_PREADOPT_TG
3483 	if (trp_extended && trp_extended->trp_permanent_preadopt_tg) {
3484 		/*
3485 		 * This kqwl is permanently configured with a thread group.
3486 		 * By using THREAD_QOS_LAST, we make sure kqueue_set_preadopted_thread_group
3487 		 * has no effect on kqwl_preadopt_tg. At this point, +1 ref on
3488 		 * trp_extended->trp_permanent_preadopt_tg is transferred to the kqwl.
3489 		 */
3490 		thread_group_qos_t kqwl_preadopt_tg;
3491 		kqwl_preadopt_tg = KQWL_ENCODE_PERMANENT_PREADOPTED_TG(trp_extended->trp_permanent_preadopt_tg);
3492 		os_atomic_store(&kqwl->kqwl_preadopt_tg, kqwl_preadopt_tg, relaxed);
3493 	} else if (task_is_app(current_task())) {
3494 		/*
3495 		 * Not a specially preconfigured kqwl so it is open to participate in sync IPC
3496 		 * thread group preadoption; but, apps will never adopt a thread group that
3497 		 * is not their own. This is a gross hack to simulate the post-process that
3498 		 * is done in the voucher subsystem today for thread groups.
3499 		 */
3500 		os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
3501 	}
3502 #endif
3503 	if (trp_extended) {
3504 		if (trp_extended->trp_work_interval) {
3505 			/*
3506 			 * The +1 ref on the work interval is transferred to the kqwl.
3507 			 */
3508 			assert(tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
3509 			kqwl->kqwl_request.tr_work_interval = trp_extended->trp_work_interval;
3510 		}
3511 	}
3512 	for (int i = 0; i < KQWL_NBUCKETS; i++) {
3513 		TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
3514 	}
3515 	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
3516 
3517 	lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);
3518 
3519 	kqueue_init(kqwl);
3520 }
3521 
3522 #if CONFIG_PROC_RESOURCE_LIMITS
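/*
 * Compares the current number of dynamic kqworkloops against the per-process
 * soft and hard limits and, when a limit is newly exceeded, records that the
 * notification was sent and sets a resource AST on the current thread.
 */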
3523 void
3524 kqworkloop_check_limit_exceeded(struct filedesc *fdp)
3525 {
3526 	int num_kqwls = fdp->num_kqwls;
3527 	if (!kqwl_above_soft_limit_notified(fdp) && fdp->kqwl_dyn_soft_limit > 0 &&
3528 	    num_kqwls > fdp->kqwl_dyn_soft_limit) {
3529 		kqwl_above_soft_limit_send_notification(fdp);
3530 		act_set_astproc_resource(current_thread());
3531 	} else if (!kqwl_above_hard_limit_notified(fdp) && fdp->kqwl_dyn_hard_limit > 0
3532 	    && num_kqwls > fdp->kqwl_dyn_hard_limit) {
3533 		kqwl_above_hard_limit_send_notification(fdp);
3534 		act_set_astproc_resource(current_thread());
3535 	}
3536 }
3537 #endif
3538 
3539 /*!
3540  * @function kqworkloop_get_or_create
3541  *
3542  * @brief
3543  * Wrapper around kqworkloop_init that handles the uniquing of workloops.
3544  *
3545  * @returns
3546  * 0:      success
3547  * EINVAL: invalid parameters
3548  * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3549  * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3550  * ENOMEM: allocation failed
3551  */
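/*
 * The lookup below uses a non-blocking-first allocation scheme: the first
 * pass under the kqhash lock tries zalloc(Z_NOWAIT); if that fails, the lock
 * is dropped, a blocking allocation is performed, and the lookup is retried
 * since another thread may have raced us and inserted the same id.
 */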
3552 static int
3553 kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
3554     workq_threadreq_param_t *trp,
3555     struct workq_threadreq_extended_param_s *trp_extended,
3556     unsigned int flags, struct kqworkloop **kqwlp)
3557 {
3558 	struct filedesc *fdp = &p->p_fd;
3559 	struct kqworkloop *alloc_kqwl = NULL;
3560 	struct kqworkloop *kqwl = NULL;
3561 	int error = 0;
3562 
3563 	assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3564 
3565 	if (id == 0 || id == (kqueue_id_t)-1) {
3566 		return EINVAL;
3567 	}
3568 
3569 	for (;;) {
3570 		kqhash_lock(fdp);
3571 		if (__improbable(fdp->fd_kqhash == NULL)) {
3572 			kqworkloop_hash_init(fdp);
3573 		}
3574 
3575 		kqwl = kqworkloop_hash_lookup_locked(fdp, id);
3576 		if (kqwl) {
3577 			if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3578 				/*
3579 				 * If MUST_NOT_EXIST was passed, even if we would have failed
3580 				 * the try_retain, it could have gone the other way, and
3581 				 * userspace can't tell. Let'em fix their race.
3582 				 */
3583 				error = EEXIST;
3584 				break;
3585 			}
3586 
3587 			if (__probable(kqworkloop_try_retain(kqwl))) {
3588 				/*
3589 				 * This is a valid live workloop!
3590 				 */
3591 				*kqwlp = kqwl;
3592 				error = 0;
3593 				break;
3594 			}
3595 		}
3596 
3597 		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
3598 			error = ENOENT;
3599 			break;
3600 		}
3601 
3602 		/*
3603 		 * We didn't find what we were looking for.
3604 		 *
3605 		 * If this is the second time we reach this point (alloc_kqwl != NULL),
3606 		 * then we're done.
3607 		 *
3608 		 * If this is the first time we reach this point (alloc_kqwl == NULL),
3609 		 * then try to allocate one without blocking.
3610 		 */
3611 		if (__probable(alloc_kqwl == NULL)) {
3612 			alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
3613 		}
3614 		if (__probable(alloc_kqwl)) {
3615 #if CONFIG_PROC_RESOURCE_LIMITS
3616 			fdp->num_kqwls++;
3617 			kqworkloop_check_limit_exceeded(fdp);
3618 #endif
3619 			kqworkloop_init(alloc_kqwl, p, id, trp, trp_extended);
3620 			/*
3621 			 * The newly allocated and initialized kqwl has a retain count of 1.
3622 			 */
3623 			kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
3624 			if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
3625 				/*
3626 				 * If this kqworkloop is configured to be permanently bound to
3627 				 * a thread, we take +1 ref on that thread's behalf before we
3628 				 * unlock the kqhash below. The reason being this new kqwl is
3629 				 * findable in the hash table as soon as we unlock the kqhash
3630 				 * and we want to make sure this kqwl does not get deleted from
3631 				 * under us by the time we create a new thread and bind to it.
3632 				 *
3633 				 * This ref is released when the bound thread unbinds itself
3634 				 * from the kqwl on its way to termination.
3635 				 * See uthread_cleanup -> kqueue_threadreq_unbind.
3636 				 *
3637 				 * The kqwl now has a retain count of 2.
3638 				 */
3639 				kqworkloop_retain(alloc_kqwl);
3640 			}
3641 			kqhash_unlock(fdp);
3642 			/*
3643 			 * We do not want to keep holding kqhash lock when workq is
3644 			 * busy creating and initializing a new thread to bind to this
3645 			 * kqworkloop.
3646 			 */
3647 			if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
3648 				error = workq_kern_threadreq_permanent_bind(p, &alloc_kqwl->kqwl_request);
3649 				if (error != KERN_SUCCESS) {
3650 					/*
3651 					 * The kqwl we just created and initialized has a retain
3652 					 * count of 2 at this point i.e. 1 from kqworkloop_init and
3653 					 * 1 on behalf of the bound thread. We need to release
3654 					 * both the references here to successfully deallocate this
3655 					 * kqwl before we return an error.
3656 					 *
3657 					 * The latter release should take care of deallocating
3658 					 * the kqwl itself and removing it from the kqhash.
3659 					 */
3660 					kqworkloop_release(alloc_kqwl);
3661 					kqworkloop_release(alloc_kqwl);
3662 					alloc_kqwl = NULL;
3663 					if (trp_extended) {
3664 						/*
3665 						 * Since we transferred these refs to kqwl during
3666 						 * kqworkloop_init, the kqwl takes care of releasing them.
3667 						 * We don't have any refs to return to our caller
3668 						 * in this case.
3669 						 */
3670 #if CONFIG_PREADOPT_TG
3671 						if (trp_extended->trp_permanent_preadopt_tg) {
3672 							trp_extended->trp_permanent_preadopt_tg = NULL;
3673 						}
3674 #endif
3675 						if (trp_extended->trp_work_interval) {
3676 							trp_extended->trp_work_interval = NULL;
3677 						}
3678 					}
3679 					return error;
3680 				} else {
3681 					/*
3682 					 * For kqwl configured with a bound thread, KQ_SLEEP is used
3683 					 * to track whether the bound thread needs to be woken up
3684 					 * when such a kqwl is woken up.
3685 					 *
3686 					 * See kqworkloop_bound_thread_wakeup and
3687 					 * kqworkloop_bound_thread_park_prepost.
3688 					 *
3689 					 * Once the kqwl is initialized, this state
3690 					 * should always be manipulated under kqlock.
3691 					 */
3692 					kqlock(alloc_kqwl);
3693 					alloc_kqwl->kqwl_state |= KQ_SLEEP;
3694 					kqunlock(alloc_kqwl);
3695 				}
3696 			}
3697 			*kqwlp = alloc_kqwl;
3698 			return 0;
3699 		}
3700 
3701 		/*
3702 		 * We have to block to allocate a workloop, drop the lock,
3703 		 * allocate one, but then we need to retry lookups as someone
3704 		 * else could race with us.
3705 		 */
3706 		kqhash_unlock(fdp);
3707 
3708 		alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
3709 	}
3710 
3711 	kqhash_unlock(fdp);
3712 
3713 	if (__improbable(alloc_kqwl)) {
3714 		zfree(kqworkloop_zone, alloc_kqwl);
3715 	}
3716 
3717 	return error;
3718 }
3719 
3720 #pragma mark - knotes
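/*
 * Placeholder filter operations: filt_no_attach fails the knote with ENOTSUP,
 * filt_no_detach does nothing, and the filt_bad_* routines panic because the
 * corresponding filter entry points are never expected to be reached.
 */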
3721 
3722 static int
3723 filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
3724 {
3725 	knote_set_error(kn, ENOTSUP);
3726 	return 0;
3727 }
3728 
3729 static void
3730 filt_no_detach(__unused struct knote *kn)
3731 {
3732 }
3733 
3734 static int __dead2
3735 filt_bad_event(struct knote *kn, long hint)
3736 {
3737 	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
3738 }
3739 
3740 static int __dead2
3741 filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
3742 {
3743 	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3744 }
3745 
3746 static int __dead2
3747 filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
3748 {
3749 	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3750 }
3751 
3752 /*
3753  * knotes_dealloc - detach all knotes for the process and drop them
3754  *
3755  *		Process is in such a state that it will not try to allocate
3756  *		any more knotes during this process (stopped for exit or exec).
3757  */
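/*
 * Both loops below use the same lock dance: the fd/knhash lock only protects
 * the lists, so it is dropped around knote_drop() (which requires the kqueue
 * lock and may block) and re-taken before fetching the next list head.
 */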
3758 void
3759 knotes_dealloc(proc_t p)
3760 {
3761 	struct filedesc *fdp = &p->p_fd;
3762 	struct kqueue *kq;
3763 	struct knote *kn;
3764 	struct  klist *kn_hash = NULL;
3765 	u_long kn_hashmask;
3766 	int i;
3767 
3768 	proc_fdlock(p);
3769 
3770 	/* Close all the fd-indexed knotes up front */
3771 	if (fdp->fd_knlistsize > 0) {
3772 		for (i = 0; i < fdp->fd_knlistsize; i++) {
3773 			while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
3774 				kq = knote_get_kq(kn);
3775 				kqlock(kq);
3776 				proc_fdunlock(p);
3777 				knote_drop(kq, kn, NULL);
3778 				proc_fdlock(p);
3779 			}
3780 		}
3781 		/* free the table */
3782 		kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
3783 	}
3784 	fdp->fd_knlistsize = 0;
3785 
3786 	proc_fdunlock(p);
3787 
3788 	knhash_lock(fdp);
3789 
3790 	/* Clean out all the hashed knotes as well */
3791 	if (fdp->fd_knhashmask != 0) {
3792 		for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
3793 			while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
3794 				kq = knote_get_kq(kn);
3795 				kqlock(kq);
3796 				knhash_unlock(fdp);
3797 				knote_drop(kq, kn, NULL);
3798 				knhash_lock(fdp);
3799 			}
3800 		}
3801 		kn_hash = fdp->fd_knhash;
3802 		kn_hashmask = fdp->fd_knhashmask;
3803 		fdp->fd_knhashmask = 0;
3804 		fdp->fd_knhash = NULL;
3805 	}
3806 
3807 	knhash_unlock(fdp);
3808 
3809 	if (kn_hash) {
3810 		hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
3811 	}
3812 }
3813 
3814 /*
3815  * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3816  * scheduling parameters
3817  *
3818  * Process is in such a state that it will not try to allocate
3819  * any more kqs or knotes during this process (stopped for exit or exec).
3820  */
3821 void
3822 kqworkloops_dealloc(proc_t p)
3823 {
3824 	struct filedesc *fdp = &p->p_fd;
3825 	struct kqworkloop *kqwl, *kqwln;
3826 	struct kqwllist tofree;
3827 
3828 	if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
3829 		return;
3830 	}
3831 
3832 	kqhash_lock(fdp);
3833 
3834 	if (fdp->fd_kqhashmask == 0) {
3835 		kqhash_unlock(fdp);
3836 		return;
3837 	}
3838 
3839 	LIST_INIT(&tofree);
3840 
3841 	for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
3842 		LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
3843 #if CONFIG_PREADOPT_TG
3844 			/*
3845 			 * kqworkloops that have scheduling parameters have an
3846 			 * implicit retain from kqueue_workloop_ctl that needs
3847 			 * to be balanced on process exit.
3848 			 */
3849 			__assert_only thread_group_qos_t preadopt_tg;
3850 			preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3851 #endif
3852 			assert(kqwl->kqwl_params
3853 #if CONFIG_PREADOPT_TG
3854 			    || KQWL_HAS_PERMANENT_PREADOPTED_TG(preadopt_tg)
3855 #endif
3856 			    );
3857 
3858 			LIST_REMOVE(kqwl, kqwl_hashlink);
3859 			LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3860 		}
3861 	}
3862 #if CONFIG_PROC_RESOURCE_LIMITS
3863 	fdp->num_kqwls = 0;
3864 #endif
3865 	kqhash_unlock(fdp);
3866 
3867 	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3868 		uint32_t ref = os_ref_get_count_raw(&kqwl->kqwl_retains);
3869 		if (ref != 1) {
3870 			panic("kq(%p) invalid refcount %d", kqwl, ref);
3871 		}
3872 		kqworkloop_dealloc(kqwl, false);
3873 	}
3874 }
3875 
3876 static int
3877 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3878     struct kevent_qos_s *kev)
3879 {
3880 	/* We don't care about the priority of a disabled or deleted knote */
3881 	if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3882 		return 0;
3883 	}
3884 
3885 	if (kq->kq_state & KQ_WORKLOOP) {
3886 		/*
3887 		 * Workloops need valid priorities with a QOS (excluding manager) for
3888 		 * any enabled knote.
3889 		 *
3890 		 * When it is pre-existing, just make sure it has a valid QoS as
3891 		 * kevent_register() will not use the incoming priority (filters that do
3892 		 * have the responsibility to validate it again, see filt_wltouch).
3893 		 *
3894 		 * If the knote is being made, validate the incoming priority.
3895 		 */
3896 		if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3897 			return ERANGE;
3898 		}
3899 	}
3900 
3901 	return 0;
3902 }
3903 
3904 /*
3905  * Prepare a filter for waiting after register.
3906  *
3907  * The f_post_register_wait hook will be called later by kevent_register()
3908  * and should call kevent_register_wait_block()
3909  */
3910 static int
3911 kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
3912 {
3913 	thread_t thread = current_thread();
3914 
3915 	assert(knote_fops(kn)->f_extended_codes);
3916 
3917 	if (kn->kn_thread == NULL) {
3918 		thread_reference(thread);
3919 		kn->kn_thread = thread;
3920 	} else if (kn->kn_thread != thread) {
3921 		/*
3922 		 * kn_thread may be set from a previous aborted wait
3923 		 * However, it has to be from the same thread.
3924 		 */
3925 		kev->flags |= EV_ERROR;
3926 		kev->data = EXDEV;
3927 		return 0;
3928 	}
3929 
3930 	return FILTER_REGISTER_WAIT | rc;
3931 }
3932 
3933 /*
3934  * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3935  * aborted instead of properly woken up with thread_wakeup_thread().
3936  */
3937 static void
3938 kevent_register_wait_cleanup(struct knote *kn)
3939 {
3940 	thread_t thread = kn->kn_thread;
3941 	kn->kn_thread = NULL;
3942 	thread_deallocate(thread);
3943 }
3944 
3945 /*
3946  * Must be called at the end of a f_post_register_wait call from a filter.
3947  */
3948 static void
3949 kevent_register_wait_block(struct turnstile *ts, thread_t thread,
3950     thread_continue_t cont, struct _kevent_register *cont_args)
3951 {
3952 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
3953 	kqunlock(cont_args->kqwl);
3954 	cont_args->handoff_thread = thread;
3955 	thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
3956 }
3957 
3958 /*
3959  * Called by Filters using a f_post_register_wait to return from their wait.
3960  */
3961 static void
3962 kevent_register_wait_return(struct _kevent_register *cont_args)
3963 {
3964 	struct kqworkloop *kqwl = cont_args->kqwl;
3965 	struct kevent_qos_s *kev = &cont_args->kev;
3966 	int error = 0;
3967 
3968 	if (cont_args->handoff_thread) {
3969 		thread_deallocate(cont_args->handoff_thread);
3970 	}
3971 
3972 	if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3973 		if ((kev->flags & EV_ERROR) == 0) {
3974 			kev->flags |= EV_ERROR;
3975 			kev->data = 0;
3976 		}
3977 		error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3978 		if (error == 0) {
3979 			cont_args->eventout++;
3980 		}
3981 	}
3982 
3983 	kqworkloop_release(kqwl);
3984 	if (error == 0) {
3985 		*(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
3986 	}
3987 	unix_syscall_return(error);
3988 }
3989 
3990 /*
3991  * kevent_register - add a new event to a kqueue
3992  *
3993  *	Creates a mapping between the event source and
3994  *	the kqueue via a knote data structure.
3995  *
3996  *	Because many/most of the event sources are file
3997  *	descriptor related, the knote is linked off
3998  *	the file descriptor table for quick access.
3999  *
4000  *	called with nothing locked
4001  *	caller holds a reference on the kqueue
4002  */
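/*
 *	Errors are not returned directly: they are reported by setting EV_ERROR
 *	and the error code in kev->data.  The return value carries filter result
 *	bits; when FILTER_REGISTER_WAIT is set, the kqueue is left locked and
 *	*kn_out is set so the caller can invoke the filter's
 *	f_post_register_wait hook.
 */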
4003 
4004 int
4005 kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
4006     struct knote **kn_out)
4007 {
4008 	struct proc *p = kq->kq_p;
4009 	const struct filterops *fops;
4010 	struct knote *kn = NULL;
4011 	int result = 0, error = 0;
4012 	unsigned short kev_flags = kev->flags;
4013 	KNOTE_LOCK_CTX(knlc);
4014 
4015 	if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
4016 		fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
4017 	} else {
4018 		error = EINVAL;
4019 		goto out;
4020 	}
4021 
4022 	/* restrict EV_VANISHED to adding udata-specific dispatch kevents */
4023 	if (__improbable((kev->flags & EV_VANISHED) &&
4024 	    (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
4025 		error = EINVAL;
4026 		goto out;
4027 	}
4028 
4029 	/* Simplify the flags - delete and disable overrule */
4030 	if (kev->flags & EV_DELETE) {
4031 		kev->flags &= ~EV_ADD;
4032 	}
4033 	if (kev->flags & EV_DISABLE) {
4034 		kev->flags &= ~EV_ENABLE;
4035 	}
4036 
4037 	if (kq->kq_state & KQ_WORKLOOP) {
4038 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
4039 		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
4040 		    kev->udata, kev->flags, kev->filter);
4041 	} else if (kq->kq_state & KQ_WORKQ) {
4042 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
4043 		    0, kev->udata, kev->flags, kev->filter);
4044 	} else {
4045 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
4046 		    VM_KERNEL_UNSLIDE_OR_PERM(kq),
4047 		    kev->udata, kev->flags, kev->filter);
4048 	}
4049 
4050 restart:
4051 	/* find the matching knote from the fd tables/hashes */
4052 	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
4053 	error = kevent_register_validate_priority(kq, kn, kev);
4054 	result = 0;
4055 	if (error) {
4056 		if (kn) {
4057 			kqunlock(kq);
4058 		}
4059 		goto out;
4060 	}
4061 
4062 	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
4063 		/*
4064 		 * No knote found, EV_ADD wasn't specified
4065 		 */
4066 
4067 		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
4068 		    (kq->kq_state & KQ_WORKLOOP)) {
4069 			/*
4070 			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
4071 			 * that doesn't care about ENOENT, so just pretend the deletion
4072 			 * happened.
4073 			 */
4074 		} else {
4075 			error = ENOENT;
4076 		}
4077 		goto out;
4078 	} else if (kn == NULL) {
4079 		/*
4080 		 * No knote found, need to attach a new one (attach)
4081 		 */
4082 
4083 		struct fileproc *knote_fp = NULL;
4084 
4085 		/* grab a file reference for the new knote */
4086 		if (fops->f_isfd) {
4087 			if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
4088 				goto out;
4089 			}
4090 		}
4091 
4092 		kn = knote_alloc();
4093 		kn->kn_fp = knote_fp;
4094 		kn->kn_is_fd = fops->f_isfd;
4095 		kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
4096 		kn->kn_status = 0;
4097 
4098 		/* was vanish support requested */
4099 		if (kev->flags & EV_VANISHED) {
4100 			kev->flags &= ~EV_VANISHED;
4101 			kn->kn_status |= KN_REQVANISH;
4102 		}
4103 
4104 		/* snapshot matching/dispatching protocol flags into knote */
4105 		if (kev->flags & EV_DISABLE) {
4106 			kn->kn_status |= KN_DISABLED;
4107 		}
4108 
4109 		/*
4110 		 * copy the kevent state into knote
4111 		 * protocol is that fflags and data
4112 		 * are saved off, and cleared before
4113 		 * calling the attach routine.
4114 		 *
4115 		 * - kn->kn_sfflags aliases with kev->xflags
4116 		 * - kn->kn_sdata   aliases with kev->data
4117 		 * - kn->kn_filter  is the top 8 bits of kev->filter
4118 		 */
4119 		kn->kn_kevent  = *(struct kevent_internal_s *)kev;
4120 		kn->kn_sfflags = kev->fflags;
4121 		kn->kn_filtid  = (uint8_t)~kev->filter;
4122 		kn->kn_fflags  = 0;
4123 		knote_reset_priority(kq, kn, kev->qos);
4124 
4125 		/* Add the knote for lookup thru the fd table */
4126 		error = kq_add_knote(kq, kn, &knlc, p);
4127 		if (error) {
4128 			knote_free(kn);
4129 			if (knote_fp != NULL) {
4130 				fp_drop(p, (int)kev->ident, knote_fp, 0);
4131 			}
4132 
4133 			if (error == ERESTART) {
4134 				goto restart;
4135 			}
4136 			goto out;
4137 		}
4138 
4139 		/* fp reference count now applies to knote */
4140 
4141 		/*
4142 		 * we can't use filter_call() because f_attach can change the filter ops
4143 		 * for a filter that supports f_extended_codes, so we need to reload
4144 		 * knote_fops() and not use `fops`.
4145 		 */
4146 		result = fops->f_attach(kn, kev);
4147 		if (result && !knote_fops(kn)->f_extended_codes) {
4148 			result = FILTER_ACTIVE;
4149 		}
4150 
4151 		kqlock(kq);
4152 
4153 		if (result & FILTER_THREADREQ_NODEFEER) {
4154 			enable_preemption();
4155 		}
4156 
4157 		if (kn->kn_flags & EV_ERROR) {
4158 			/*
4159 			 * Failed to attach correctly, so drop.
4160 			 */
4161 			kn->kn_filtid = EVFILTID_DETACHED;
4162 			error = (int)kn->kn_sdata;
4163 			knote_drop(kq, kn, &knlc);
4164 			result = 0;
4165 			goto out;
4166 		}
4167 
4168 		/*
4169 		 * end "attaching" phase - now just attached
4170 		 *
4171 		 * Mark the thread request overcommit, if apropos
4172 		 *
4173 		 * If the attach routine indicated that an
4174 		 * event is already fired, activate the knote.
4175 		 */
4176 		if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
4177 		    (kq->kq_state & KQ_WORKLOOP)) {
4178 			kqworkloop_set_overcommit((struct kqworkloop *)kq);
4179 		}
4180 	} else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
4181 		/*
4182 		 * The knote was dropped while we were waiting for the lock,
4183 		 * we need to re-evaluate entirely
4184 		 */
4185 
4186 		goto restart;
4187 	} else if (kev->flags & EV_DELETE) {
4188 		/*
4189 		 * Deletion of a knote (drop)
4190 		 *
4191 		 * If the filter wants to filter drop events, let it do so.
4192 		 *
4193 		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
4194 		 * we must wait for the knote to be re-enabled (unless it is being
4195 		 * re-enabled atomically here).
4196 		 */
4197 
4198 		if (knote_fops(kn)->f_allow_drop) {
4199 			bool drop;
4200 
4201 			kqunlock(kq);
4202 			drop = knote_fops(kn)->f_allow_drop(kn, kev);
4203 			kqlock(kq);
4204 
4205 			if (!drop) {
4206 				goto out_unlock;
4207 			}
4208 		}
4209 
4210 		if ((kev->flags & EV_ENABLE) == 0 &&
4211 		    (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4212 		    (kn->kn_status & KN_DISABLED) != 0) {
4213 			kn->kn_status |= KN_DEFERDELETE;
4214 			error = EINPROGRESS;
4215 			goto out_unlock;
4216 		}
4217 
4218 		knote_drop(kq, kn, &knlc);
4219 		goto out;
4220 	} else {
4221 		/*
4222 		 * Regular update of a knote (touch)
4223 		 *
4224 		 * Call touch routine to notify filter of changes in filter values
4225 		 * (and to re-determine if any events are fired).
4226 		 *
4227 		 * If the knote is in defer-delete, avoid calling the filter touch
4228 		 * routine (it has delivered its last event already).
4229 		 *
4230 		 * If the touch routine had no failure,
4231 		 * apply the requested side effects to the knote.
4232 		 */
4233 
4234 		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4235 			if (kev->flags & EV_ENABLE) {
4236 				result = FILTER_ACTIVE;
4237 			}
4238 		} else {
4239 			kqunlock(kq);
4240 			result = filter_call(knote_fops(kn), f_touch(kn, kev));
4241 			kqlock(kq);
4242 			if (result & FILTER_THREADREQ_NODEFEER) {
4243 				enable_preemption();
4244 			}
4245 		}
4246 
4247 		if (kev->flags & EV_ERROR) {
4248 			result = 0;
4249 			goto out_unlock;
4250 		}
4251 
4252 		if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
4253 		    kn->kn_udata != kev->udata) {
4254 			// this allows klist_copy_udata() not to take locks
4255 			os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
4256 		}
4257 		if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
4258 			kn->kn_status |= KN_DISABLED;
4259 			knote_dequeue(kq, kn);
4260 		}
4261 	}
4262 
4263 	/* accept new kevent state */
4264 	knote_apply_touch(kq, kn, kev, result);
4265 
4266 out_unlock:
4267 	/*
4268 	 * When the filter asked for a post-register wait,
4269 	 * we leave the kqueue locked for kevent_register()
4270 	 * to call the filter's f_post_register_wait hook.
4271 	 */
4272 	if (result & FILTER_REGISTER_WAIT) {
4273 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4274 		*kn_out = kn;
4275 	} else {
4276 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4277 	}
4278 
4279 out:
4280 	/* output local errors through the kevent */
4281 	if (error) {
4282 		kev->flags |= EV_ERROR;
4283 		kev->data = error;
4284 	}
4285 	return result;
4286 }
4287 
4288 /*
4289  * knote_process - process a triggered event
4290  *
4291  *	Validate that it is really still a triggered event
4292  *	by calling the filter routines (if necessary).  Hold
4293  *	a use reference on the knote to avoid it being detached.
4294  *
4295  *	If it is still considered triggered, we will have taken
4296  *	a copy of the state under the filter lock.  We use that
4297  *	snapshot to dispatch the knote for future processing (or
4298  *	not, if this was a lost event).
4299  *
4300  *	Our caller assures us that nobody else can be processing
4301  *	events from this knote during the whole operation. But
4302  *	others can be touching or posting events to the knote
4303  *	interspersed with our processing it.
4304  *
4305  *	caller holds a reference on the kqueue.
4306  *	kqueue locked on entry and exit - but may be dropped
4307  */
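/*
 *	Returns EJUSTRETURN when there was nothing to deliver (the knote was
 *	dropped, dequeued, or no longer active), otherwise the return value of
 *	the callback used to deliver the event.
 */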
4308 static int
4309 knote_process(struct knote *kn, kevent_ctx_t kectx,
4310     kevent_callback_t callback)
4311 {
4312 	struct kevent_qos_s kev;
4313 	struct kqueue *kq = knote_get_kq(kn);
4314 	KNOTE_LOCK_CTX(knlc);
4315 	int result = FILTER_ACTIVE;
4316 	int error = 0;
4317 	bool drop = false;
4318 
4319 	/*
4320 	 * Must be active
4321 	 * Must be queued and not disabled/suppressed or dropping
4322 	 */
4323 	assert(kn->kn_status & KN_QUEUED);
4324 	assert(kn->kn_status & KN_ACTIVE);
4325 	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));
4326 
4327 	if (kq->kq_state & KQ_WORKLOOP) {
4328 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4329 		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
4330 		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4331 		    kn->kn_filtid);
4332 	} else if (kq->kq_state & KQ_WORKQ) {
4333 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4334 		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4335 		    kn->kn_filtid);
4336 	} else {
4337 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4338 		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4339 		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
4340 	}
4341 
4342 	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
4343 		/*
4344 		 * When the knote is dropping or has dropped,
4345 		 * then there's nothing we want to process.
4346 		 */
4347 		return EJUSTRETURN;
4348 	}
4349 
4350 	/*
4351 	 * While waiting for the knote lock, we may have dropped the kq lock,
4352 	 * and a touch may have disabled and dequeued the knote.
4353 	 */
4354 	if (!(kn->kn_status & KN_QUEUED)) {
4355 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4356 		return EJUSTRETURN;
4357 	}
4358 
4359 	/*
4360 	 * For deferred-drop or vanished events, we just create a fake
4361 	 * event to acknowledge end-of-life.  Otherwise, we call the
4362 	 * filter's process routine to snapshot the kevent state under
4363 	 * the filter's locking protocol.
4364 	 *
4365 	 * suppress knotes to avoid returning the same event multiple times in
4366 	 * a single call.
4367 	 */
4368 	knote_suppress(kq, kn);
4369 
4370 	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4371 		uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
4372 		if (kn->kn_status & KN_DEFERDELETE) {
4373 			kev_flags |= EV_DELETE;
4374 		} else {
4375 			kev_flags |= EV_VANISHED;
4376 		}
4377 
4378 		/* create fake event */
4379 		kev = (struct kevent_qos_s){
4380 			.filter = kn->kn_filter,
4381 			.ident  = kn->kn_id,
4382 			.flags  = kev_flags,
4383 			.udata  = kn->kn_udata,
4384 		};
4385 	} else {
4386 		kqunlock(kq);
4387 		kev = (struct kevent_qos_s) { };
4388 		result = filter_call(knote_fops(kn), f_process(kn, &kev));
4389 		kqlock(kq);
4390 	}
4391 
4392 	/*
4393 	 * Determine how to dispatch the knote for future event handling.
4394 	 * not-fired: just return (do not callout, leave deactivated).
4395 	 * One-shot:  If dispatch2, enter deferred-delete mode (unless this
4396 	 *            is the deferred delete event delivery itself).  Otherwise,
4397 	 *            drop it.
4398 	 * Dispatch:  don't clear state, just mark it disabled.
4399 	 * Cleared:   just leave it deactivated.
4400 	 * Others:    re-activate as there may be more events to handle.
4401 	 *            This will not wake up more handlers right now, but
4402 	 *            at the completion of handling events it may trigger
4403 	 *            more handler threads (TODO: optimize based on more than
4404 	 *            just this one event being detected by the filter).
4405 	 */
4406 	if ((result & FILTER_ACTIVE) == 0) {
4407 		if ((kn->kn_status & KN_ACTIVE) == 0) {
4408 			/*
4409 			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
4410 			 * within f_process() but that doesn't necessarily make them
4411 			 * ready to process, so we should leave them be.
4412 			 *
4413 			 * For other knotes, since we will not return an event,
4414 			 * there's no point keeping the knote suppressed.
4415 			 */
4416 			knote_unsuppress(kq, kn);
4417 		}
4418 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4419 		return EJUSTRETURN;
4420 	}
4421 
4422 	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
4423 		knote_adjust_qos(kq, kn, result);
4424 	}
4425 
4426 	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
4427 		kqueue_update_iotier_override(kq);
4428 	}
4429 
4430 	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
4431 
4432 	if (kev.flags & EV_ONESHOT) {
4433 		if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4434 		    (kn->kn_status & KN_DEFERDELETE) == 0) {
4435 			/* defer dropping non-delete oneshot dispatch2 events */
4436 			kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
4437 		} else {
4438 			drop = true;
4439 		}
4440 	} else if (kn->kn_flags & EV_DISPATCH) {
4441 		/* disable all dispatch knotes */
4442 		kn->kn_status |= KN_DISABLED;
4443 	} else if ((kn->kn_flags & EV_CLEAR) == 0) {
4444 		/* re-activate in case there are more events */
4445 		knote_activate(kq, kn, FILTER_ACTIVE);
4446 	}
4447 
4448 	/*
4449 	 * callback to handle each event as we find it.
4450 	 * If we have to detach and drop the knote, do
4451 	 * it while we have the kq unlocked.
4452 	 */
4453 	if (drop) {
4454 		knote_drop(kq, kn, &knlc);
4455 	} else {
4456 		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4457 	}
4458 
4459 	if (kev.flags & EV_VANISHED) {
4460 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
4461 		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4462 		    kn->kn_filtid);
4463 	}
4464 
4465 	error = (callback)(&kev, kectx);
4466 	kqlock(kq);
4467 	return error;
4468 }
4469 
4470 /*
4471  * Returns -1 if the kqueue was unbound and processing should not happen
4472  */
4473 #define KQWQAE_BEGIN_PROCESSING 1
4474 #define KQWQAE_END_PROCESSING   2
4475 #define KQWQAE_UNBIND           3
4476 static int
4477 kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
4478     int kevent_flags, int kqwqae_op)
4479 {
4480 	struct knote *kn;
4481 	int rc = 0;
4482 	bool unbind;
4483 	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - 1];
4484 	struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
4485 
4486 	kqlock_held(&kqwq->kqwq_kqueue);
4487 
4488 	/*
4489 	 * Return suppressed knotes to their original state.
4490 	 * For workq kqueues, suppressed ones that are still
4491 	 * truly active (not just forced into the queue) will
4492 	 * set flags we check below to see if anything got
4493 	 * woken up.
4494 	 */
4495 	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4496 		knote_unsuppress(kqwq, kn);
4497 	}
4498 
4499 	if (kqwqae_op == KQWQAE_UNBIND) {
4500 		unbind = true;
4501 	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
4502 		unbind = false;
4503 	} else {
4504 		unbind = TAILQ_EMPTY(queue);
4505 	}
4506 	if (unbind) {
4507 		thread_t thread = kqr_thread_fast(kqr);
4508 		thread_qos_t old_override;
4509 
4510 #if MACH_ASSERT
4511 		thread_t self = current_thread();
4512 		struct uthread *ut = get_bsdthread_info(self);
4513 
4514 		assert(thread == self);
4515 		assert(ut->uu_kqr_bound == kqr);
4516 #endif // MACH_ASSERT
4517 
4518 		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
4519 		if (!TAILQ_EMPTY(queue)) {
4520 			/*
4521 			 * Request a new thread if we didn't process the whole
4522 			 * queue.
4523 			 */
4524 			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
4525 			    kqr->tr_kq_qos_index, 0);
4526 		}
4527 		if (old_override) {
4528 			thread_drop_kevent_override(thread);
4529 		}
4530 		rc = -1;
4531 	}
4532 
4533 	return rc;
4534 }
4535 
4536 /*
4537  * Return 0 to indicate that processing should proceed,
4538  * -1 if there is nothing to process.
4539  *
4540  * Called with kqueue locked and returns the same way,
4541  * but may drop lock temporarily.
4542  */
4543 static int
4544 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4545     int kevent_flags)
4546 {
4547 	int rc = 0;
4548 
4549 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4550 	    0, kqr->tr_kq_qos_index);
4551 
4552 	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4553 	    KQWQAE_BEGIN_PROCESSING);
4554 
4555 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4556 	    thread_tid(kqr_thread(kqr)),
4557 	    !TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
4558 
4559 	return rc;
4560 }
4561 
4562 static thread_qos_t
4563 kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4564 {
4565 	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4566 	struct knote *kn, *tmp;
4567 
4568 	kqlock_held(kqwl);
4569 
4570 	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
4571 		/*
4572 		 * If a knote that can adjust QoS is disabled because of the automatic
4573 		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
4574 		 * further overrides keep pushing.
4575 		 */
4576 		if (knote_fops(kn)->f_adjusts_qos &&
4577 		    (kn->kn_status & KN_DISABLED) != 0 &&
4578 		    (kn->kn_status & KN_DROPPING) == 0 &&
4579 		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
4580 			qos = MAX(qos, kn->kn_qos_override);
4581 			continue;
4582 		}
4583 		knote_unsuppress(kqwl, kn);
4584 	}
4585 
4586 	return qos;
4587 }
4588 
4589 static int
4590 kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
4591 {
4592 	workq_threadreq_t kqr = &kqwl->kqwl_request;
4593 	struct kqueue *kq = &kqwl->kqwl_kqueue;
4594 	int rc = 0, op = KQWL_UTQ_NONE;
4595 
4596 	kqlock_held(kq);
4597 
4598 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
4599 	    kqwl->kqwl_dynamicid, 0, 0);
4600 
4601 	/* nobody else should still be processing */
4602 	assert((kq->kq_state & KQ_PROCESSING) == 0);
4603 
4604 	kq->kq_state |= KQ_PROCESSING;
4605 
4606 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4607 		/*
4608 		 * When "parking" we want to process events and if no events are found
4609 		 * unbind. (Except for WORKQ_TR_FLAG_PERMANENT_BIND where the soft unbind
4610 		 * and bound thread park happen in the caller.)
4611 		 *
4612 		 * However, non-overcommit threads sometimes park even when they have
4613 		 * more work so that the pool can narrow.  For these, we need to unbind
4614 		 * early, so that calling kqworkloop_update_threads_qos() can ask the
4615 		 * workqueue subsystem whether the thread should park despite having
4616 		 * pending events.
4617 		 *
4618 		 */
4619 		if (kqr->tr_flags & (WORKQ_TR_FLAG_OVERCOMMIT | WORKQ_TR_FLAG_PERMANENT_BIND)) {
4620 			op = KQWL_UTQ_PARKING;
4621 		} else {
4622 			op = KQWL_UTQ_UNBINDING;
4623 		}
4624 	} else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
4625 		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
4626 	}
4627 
4628 	if (op != KQWL_UTQ_NONE) {
4629 		thread_qos_t qos_override;
4630 		thread_t thread = kqr_thread_fast(kqr);
4631 
4632 		qos_override = kqworkloop_acknowledge_events(kqwl);
4633 
4634 		if (op == KQWL_UTQ_UNBINDING) {
4635 			kqworkloop_unbind_locked(kqwl, thread,
4636 			    KQWL_OVERRIDE_DROP_IMMEDIATELY, 0);
4637 			kqworkloop_release_live(kqwl);
4638 		}
4639 		kqworkloop_update_threads_qos(kqwl, op, qos_override);
4640 		if (op == KQWL_UTQ_PARKING &&
4641 		    (!kqwl->kqwl_count || kqwl->kqwl_owner)) {
4642 			if ((kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) &&
4643 			    (!(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND))) {
4644 				kqworkloop_unbind_locked(kqwl, thread,
4645 				    KQWL_OVERRIDE_DROP_DELAYED, 0);
4646 				kqworkloop_release_live(kqwl);
4647 			}
4648 			rc = -1; /* To indicate stop begin processing. */
4649 		} else if (op == KQWL_UTQ_UNBINDING &&
4650 		    kqr_thread(kqr) != thread) {
4651 			rc = -1; /* To indicate stop begin processing. */
4652 		}
4653 
4654 		if (rc == -1) {
4655 			kq->kq_state &= ~KQ_PROCESSING;
4656 			if (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) {
4657 				goto done;
4658 			}
4659 			kqworkloop_unbind_delayed_override_drop(thread);
4660 		}
4661 	}
4662 done:
4663 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
4664 	    kqwl->kqwl_dynamicid, 0, 0);
4665 
4666 	return rc;
4667 }
4668 
4669 /*
4670  * Return 0 to indicate that processing should proceed,
4671  * -1 if there is nothing to process.
4672  * EBADF if the kqueue is draining
4673  *
4674  * Called with kqueue locked and returns the same way,
4675  * but may drop lock temporarily.
4676  * May block.
4677  */
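/*
 * Exclusive-processor protocol: KQ_PROCWAIT is set while waiting to become
 * the processing thread; the matching wakeup (on &kq->kqf_suppressed) is
 * issued by kqfile_end_processing() when it clears KQ_PROCESSING.
 */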
4678 static int
4679 kqfile_begin_processing(struct kqfile *kq)
4680 {
4681 	kqlock_held(kq);
4682 
4683 	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4684 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
4685 	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4686 
4687 	/* wait to become the exclusive processing thread */
4688 	while ((kq->kqf_state & (KQ_PROCESSING | KQ_DRAIN)) == KQ_PROCESSING) {
4689 		kq->kqf_state |= KQ_PROCWAIT;
4690 		lck_spin_sleep(&kq->kqf_lock, LCK_SLEEP_DEFAULT,
4691 		    &kq->kqf_suppressed, THREAD_UNINT | THREAD_WAIT_NOREPORT);
4692 	}
4693 
4694 	if (kq->kqf_state & KQ_DRAIN) {
4695 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4696 		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
4697 		return EBADF;
4698 	}
4699 
4700 	/* Nobody else processing */
4701 
4702 	/* anything left to process? */
4703 	if (kq->kqf_count == 0) {
4704 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4705 		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
4706 		return -1;
4707 	}
4708 
4709 	/* convert to processing mode */
4710 	kq->kqf_state |= KQ_PROCESSING;
4711 
4712 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4713 	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4714 	return 0;
4715 }
4716 
4717 /*
4718  * Try to end the processing, only called when a workq thread is attempting to
4719  * park (KEVENT_FLAG_PARKING is set).
4720  *
4721  * When returning -1, the kqworkq is setup again so that it is ready to be
4722  * processed.
4723  */
4724 static int
4725 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4726     int kevent_flags)
4727 {
4728 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4729 		/*
4730 		 * if acknowledge events "succeeds" it means there are events,
4731 		 * which is a failure condition for end_processing.
4732 		 */
4733 		int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4734 		    KQWQAE_END_PROCESSING);
4735 		if (rc == 0) {
4736 			return -1;
4737 		}
4738 	}
4739 
4740 	return 0;
4741 }
4742 
4743 /*
4744  * Try to end the processing, only called when a workq thread is attempting to
4745  * park (KEVENT_FLAG_PARKING is set).
4746  *
4747  * When returning -1, the kqworkq is setup again so that it is ready to be
4748  * processed (as if kqworkloop_begin_processing had just been called).
4749  *
4750  * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4751  * the kqworkloop is unbound from its servicer as a side effect.
4752  */
4753 static int
4754 kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
4755 {
4756 	struct kqueue *kq = &kqwl->kqwl_kqueue;
4757 	workq_threadreq_t kqr = &kqwl->kqwl_request;
4758 	int rc = 0;
4759 
4760 	kqlock_held(kq);
4761 
4762 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
4763 	    kqwl->kqwl_dynamicid, 0, 0);
4764 
4765 	if (kevent_flags & KEVENT_FLAG_PARKING) {
4766 		thread_t thread = kqr_thread_fast(kqr);
4767 		thread_qos_t qos_override;
4768 
4769 		/*
4770 		 * When KEVENT_FLAG_PARKING is set, we need to attempt
4771 		 * an unbind while still under the lock.
4772 		 *
4773 		 * So we do everything kqworkloop_unbind() would do, but because
4774 		 * we're inside kqueue_process(), if the workloop actually
4775 		 * received events while our locks were dropped, we have
4776 		 * the opportunity to fail the end processing and loop again.
4777 		 *
4778 		 * This avoids going through the process-wide workqueue lock
4779 		 * hence scales better.
4780 		 * and hence scales better.
4781 		assert(flags & KQ_PROCESSING);
4782 		qos_override = kqworkloop_acknowledge_events(kqwl);
4783 		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
4784 
4785 		if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
4786 			rc = -1; /* To indicate we should continue processing. */
4787 		} else {
4788 			if (kqr_thread_permanently_bound(kqr)) {
4789 				/*
4790 				 * For these, the actual soft unbind and bound thread park
4791 				 * happen in the caller.
4792 				 */
4793 				kq->kq_state &= ~flags;
4794 			} else {
4795 				kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
4796 				kqworkloop_release_live(kqwl);
4797 				kq->kq_state &= ~flags;
4798 				kqworkloop_unbind_delayed_override_drop(thread);
4799 			}
4800 		}
4801 	} else {
4802 		kq->kq_state &= ~flags;
4803 		kq->kq_state |= KQ_R2K_ARMED;
4804 		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
4805 	}
4806 
4807 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
4808 	    kqwl->kqwl_dynamicid, 0, 0);
4809 
4810 	return rc;
4811 }
4812 
4813 /*
4814  * Called with kqueue lock held.
4815  *
4816  * 0: no more events
4817  * -1: has more events
4818  * EBADF: kqueue is in draining mode
4819  */
4820 static int
4821 kqfile_end_processing(struct kqfile *kq)
4822 {
4823 	struct knote *kn;
4824 	int procwait;
4825 
4826 	kqlock_held(kq);
4827 
4828 	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4829 
4830 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
4831 	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4832 
4833 	/*
4834 	 * Return suppressed knotes to their original state.
4835 	 */
4836 	while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
4837 		knote_unsuppress(kq, kn);
4838 	}
4839 
4840 	procwait = (kq->kqf_state & KQ_PROCWAIT);
4841 	kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
4842 
4843 	if (procwait) {
4844 		/* first wake up any thread already waiting to process */
4845 		thread_wakeup(&kq->kqf_suppressed);
4846 	}
4847 
4848 	if (kq->kqf_state & KQ_DRAIN) {
4849 		return EBADF;
4850 	}
4851 	return kq->kqf_count != 0 ? -1 : 0;
4852 }
4853 
4854 static int
4855 kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4856     struct kqueue_workloop_params *params, int *retval)
4857 {
4858 	int error = 0;
4859 	struct kqworkloop *kqwl;
4860 	struct filedesc *fdp = &p->p_fd;
4861 	workq_threadreq_param_t trp = { };
4862 	struct workq_threadreq_extended_param_s trp_extended = {0};
4863 	integer_t trp_preadopt_priority = 0;
4864 	integer_t trp_preadopt_policy = 0;
4865 
4866 	switch (cmd) {
4867 	case KQ_WORKLOOP_CREATE:
4868 		if (!params->kqwlp_flags) {
4869 			error = EINVAL;
4870 			break;
4871 		}
4872 
4873 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4874 		    (params->kqwlp_sched_pri < 1 ||
4875 		    params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
4876 			error = EINVAL;
4877 			break;
4878 		}
4879 
4880 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4881 		    invalid_policy(params->kqwlp_sched_pol)) {
4882 			error = EINVAL;
4883 			break;
4884 		}
4885 
4886 		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4887 		    (params->kqwlp_cpu_percent <= 0 ||
4888 		    params->kqwlp_cpu_percent > 100 ||
4889 		    params->kqwlp_cpu_refillms <= 0 ||
4890 		    params->kqwlp_cpu_refillms > 0x00ffffff)) {
4891 			error = EINVAL;
4892 			break;
4893 		}
4894 
4895 		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WITH_BOUND_THREAD) {
4896 			if (!bootarg_thread_bound_kqwl_support_enabled) {
4897 				error = ENOTSUP;
4898 				break;
4899 			}
4900 			trp.trp_flags |= TRP_BOUND_THREAD;
4901 		}
4902 
4903 		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WORK_INTERVAL) {
4904 			/*
4905 			 * This flag serves the purpose of preadopting tg from work interval
4906 			 * on servicer/creator/bound thread at wakeup/creation time in kernel.
4907 			 *
4908 			 * Additionally, it helps the bound thread join the work interval
4909 			 * before it comes out to userspace for the first time.
4910 			 */
4911 			struct work_interval *work_interval = NULL;
4912 			kern_return_t kr;
4913 
4914 			kr = kern_port_name_to_work_interval(params->kqwl_wi_port,
4915 			    &work_interval);
4916 			if (kr != KERN_SUCCESS) {
4917 				error = EINVAL;
4918 				break;
4919 			}
4920 			/* work_interval has a +1 ref */
4921 
4922 			kr = kern_work_interval_get_policy(work_interval,
4923 			    &trp_preadopt_policy,
4924 			    &trp_preadopt_priority);
4925 			if (kr != KERN_SUCCESS) {
4926 				kern_work_interval_release(work_interval);
4927 				error = EINVAL;
4928 				break;
4929 			}
4930 			/* The work interval comes with scheduling policy. */
4931 			if (trp_preadopt_policy) {
4932 				trp.trp_flags |= TRP_POLICY;
4933 				trp.trp_pol = (uint8_t)trp_preadopt_policy;
4934 
4935 				trp.trp_flags |= TRP_PRIORITY;
4936 				trp.trp_pri = (uint8_t)trp_preadopt_priority;
4937 			}
4938 #if CONFIG_PREADOPT_TG
4939 			kr = kern_work_interval_get_thread_group(work_interval,
4940 			    &trp_extended.trp_permanent_preadopt_tg);
4941 			if (kr != KERN_SUCCESS) {
4942 				kern_work_interval_release(work_interval);
4943 				error = EINVAL;
4944 				break;
4945 			}
4946 			/*
4947 			 * In case of KERN_SUCCESS, we take a +1 ref on the thread group
4948 			 * backing this work interval via kern_work_interval_get_thread_group
4949 			 * and pass it on to the kqwl.
4950 			 * If, for whatever reason, kqworkloop_get_or_create fails and we
4951 			 * get back this ref, we release it before returning.
4952 			 */
4953 #endif
4954 			if (trp.trp_flags & TRP_BOUND_THREAD) {
4955 				/*
4956 				 * For TRP_BOUND_THREAD, we pass +1 ref on the work_interval on to
4957 				 * kqwl so the bound thread can join it before coming out to
4958 				 * userspace.
4959 				 * If, for whatever reason, kqworkloop_get_or_create fails and we
4960 				 * get back this ref, we release it before returning.
4961 				 */
4962 				trp_extended.trp_work_interval = work_interval;
4963 			} else {
4964 				kern_work_interval_release(work_interval);
4965 			}
4966 		}
4967 
4968 		if (!(trp.trp_flags & (TRP_POLICY | TRP_PRIORITY))) {
4969 			/*
4970 			 * We always prefer scheduling policy + priority that comes with
4971 			 * a work interval. If it does not exist, we fall back to what the
4972 			 * user has asked for.
4973 			 */
4974 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4975 				trp.trp_flags |= TRP_PRIORITY;
4976 				trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4977 			}
4978 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4979 				trp.trp_flags |= TRP_POLICY;
4980 				trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4981 			}
4982 			if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4983 				trp.trp_flags |= TRP_CPUPERCENT;
4984 				trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4985 				trp.trp_refillms = params->kqwlp_cpu_refillms;
4986 			}
4987 		}
4988 
4989 #if CONFIG_PREADOPT_TG
4990 		if ((trp.trp_flags == 0) &&
4991 		    (trp_extended.trp_permanent_preadopt_tg == NULL)) {
4992 #else
4993 		if (trp.trp_flags == 0) {
4994 #endif
4995 			error = EINVAL;
4996 			break;
4997 		}
4998 
4999 		error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
5000 		    &trp_extended,
5001 		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5002 		    KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
5003 		if (error) {
5004 			/* kqworkloop_get_or_create did not consume these refs. */
5005 #if CONFIG_PREADOPT_TG
5006 			if (trp_extended.trp_permanent_preadopt_tg) {
5007 				thread_group_release(trp_extended.trp_permanent_preadopt_tg);
5008 			}
5009 #endif
5010 			if (trp_extended.trp_work_interval) {
5011 				kern_work_interval_release(trp_extended.trp_work_interval);
5012 			}
5013 			break;
5014 		}
5015 
5016 		if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
5017 			/* FD_WORKLOOP indicates that this process has created a workloop
5018 			 * via this syscall; it is only ever added to a process, never
5019 			 * removed.
5020 			 */
5021 			proc_fdlock(p);
5022 			fdt_flag_set(fdp, FD_WORKLOOP);
5023 			proc_fdunlock(p);
5024 		}
5025 		break;
5026 	case KQ_WORKLOOP_DESTROY:
5027 		error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL, NULL,
5028 		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5029 		    KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
5030 		if (error) {
5031 			break;
5032 		}
5033 		kqlock(kqwl);
5034 		trp.trp_value = kqwl->kqwl_params;
5035 		if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
5036 			trp.trp_flags |= TRP_RELEASED;
5037 			kqwl->kqwl_params = trp.trp_value;
5038 			if (trp.trp_flags & TRP_BOUND_THREAD) {
5039 				kqworkloop_bound_thread_wakeup(kqwl);
5040 			}
5041 			kqworkloop_release_live(kqwl);
5042 		} else {
5043 			error = EINVAL;
5044 		}
5045 		kqunlock(kqwl);
5046 		kqworkloop_release(kqwl);
5047 		break;
5048 	}
5049 	*retval = 0;
5050 	return error;
5051 }
5052 
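/*
 * uap->sz effectively versions the params structure: only
 * MIN(sizeof(params), uap->sz) bytes are copied in, and kqwlp_version must
 * match uap->sz exactly, so mismatched userspace structure layouts are
 * rejected with EINVAL rather than misparsed.
 */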
5053 int
5054 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
5055 {
5056 	struct kqueue_workloop_params params = {
5057 		.kqwlp_id = 0,
5058 	};
5059 	if (uap->sz < sizeof(params.kqwlp_version)) {
5060 		return EINVAL;
5061 	}
5062 
5063 	size_t copyin_sz = MIN(sizeof(params), uap->sz);
5064 	int rv = copyin(uap->addr, &params, copyin_sz);
5065 	if (rv) {
5066 		return rv;
5067 	}
5068 
5069 	if (params.kqwlp_version != (int)uap->sz) {
5070 		return EINVAL;
5071 	}
5072 
5073 	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
5074 	           retval);
5075 }
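
/*
 * Illustrative sketch (not part of the original source): how a userspace
 * caller is expected to fill the versioned parameter structure consumed by
 * kqueue_workloop_ctl() above.  The wrapper name example_kqueue_workloop_ctl
 * and the exact fields filled in are assumptions for illustration only; the
 * point being made is that kqwlp_version must equal the byte count passed
 * as `sz`, or the call fails with EINVAL.
 */
#if 0
static int
example_workloop_destroy(uint64_t id)
{
	struct kqueue_workloop_params params = {
		.kqwlp_version = sizeof(params),        /* must match `sz` below */
		.kqwlp_id = id,                         /* dynamic workloop identifier */
	};

	/* hypothetical userspace wrapper around the kqueue_workloop_ctl syscall */
	return example_kqueue_workloop_ctl(KQ_WORKLOOP_DESTROY, 0 /* options */,
	           &params, sizeof(params));
}
#endif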
5076 
5077 static int
5078 kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
5079 {
5080 	struct kqfile *kq = (struct kqfile *)fp_get_data(fp);
5081 	int retnum = 0;
5082 
5083 	assert((kq->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5084 
5085 	if (which == FREAD) {
5086 		kqlock(kq);
5087 		if (kqfile_begin_processing(kq) == 0) {
5088 			retnum = kq->kqf_count;
5089 			kqfile_end_processing(kq);
5090 		} else if ((kq->kqf_state & KQ_DRAIN) == 0) {
5091 			selrecord(kq->kqf_p, &kq->kqf_sel, wql);
5092 		}
5093 		kqunlock(kq);
5094 	}
5095 	return retnum;
5096 }
5097 
5098 /*
5099  * kqueue_close - tear down a plain kqueue file when it is closed
5100  */
5101 static int
5102 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
5103 {
5104 	struct kqfile *kqf = fg_get_data(fg);
5105 
5106 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5107 	kqlock(kqf);
5108 	selthreadclear(&kqf->kqf_sel);
5109 	kqunlock(kqf);
5110 	kqueue_dealloc(&kqf->kqf_kqueue);
5111 	fg_set_data(fg, NULL);
5112 	return 0;
5113 }
5114 
5115 /*
5116  * Max depth of the nested kq path that can be created.
5117  * Note that this has to be less than the maximum value that fits in
5118  * kq_level to avoid wrapping around and mislabeling the level. We also
5119  * want to be aggressive about this so that we don't overflow the
5120  * kernel stack while posting kevents.
5121  */
5122 #define MAX_NESTED_KQ 10
5123 
5124 /*
5125  * The caller has taken a use-count reference on this kqueue and will donate it
5126  * to the kqueue we are being added to.  This keeps the kqueue from closing until
5127  * that relationship is torn down.
5128  */
5129 static int
5130 kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
5131     __unused struct kevent_qos_s *kev)
5132 {
5133 	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5134 	struct kqueue *kq = &kqf->kqf_kqueue;
5135 	struct kqueue *parentkq = knote_get_kq(kn);
5136 
5137 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5138 
5139 	if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
5140 		knote_set_error(kn, EINVAL);
5141 		return 0;
5142 	}
5143 
5144 	/*
5145 	 * We have to avoid creating a cycle when nesting kqueues
5146 	 * inside another.  Rather than trying to walk the whole
5147 	 * potential DAG of nested kqueues, we just use a simple
5148 	 * ceiling protocol.  When a kqueue is inserted into another,
5149 	 * we check that the (future) parent is not already nested
5150  * into another kqueue at a lower level than the potential
5151 	 * child (because it could indicate a cycle).  If that test
5152 	 * passes, we just mark the nesting levels accordingly.
5153 	 *
5154 	 * Only up to MAX_NESTED_KQ can be nested.
5155 	 *
5156 	 * Note: kqworkq and kqworkloop cannot be nested and have reused their
5157 	 *       kq_level field, so ignore these as parent.
5158 	 */
5159 
5160 	kqlock(parentkq);
5161 
5162 	if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
5163 		if (parentkq->kq_level > 0 &&
5164 		    parentkq->kq_level < kq->kq_level) {
5165 			kqunlock(parentkq);
5166 			knote_set_error(kn, EINVAL);
5167 			return 0;
5168 		}
5169 
5170 		/* set parent level appropriately */
5171 		uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
5172 		if (plevel < kq->kq_level + 1) {
5173 			if (kq->kq_level + 1 > MAX_NESTED_KQ) {
5174 				kqunlock(parentkq);
5175 				knote_set_error(kn, EINVAL);
5176 				return 0;
5177 			}
5178 			plevel = kq->kq_level + 1;
5179 		}
5180 
5181 		parentkq->kq_level = plevel;
5182 	}
5183 
5184 	kqunlock(parentkq);
5185 
5186 	kn->kn_filtid = EVFILTID_KQREAD;
5187 	kqlock(kq);
5188 	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
5189 	/* indicate nesting in child, if needed */
5190 	if (kq->kq_level == 0) {
5191 		kq->kq_level = 1;
5192 	}
5193 
5194 	int count = kq->kq_count;
5195 	kqunlock(kq);
5196 	return count > 0;
5197 }
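
/*
 * Illustrative sketch (not part of the original source): the ceiling
 * protocol applied by kqueue_kqfilter() above, reduced to a pure helper so
 * the level arithmetic is easier to follow.  The helper name is hypothetical.
 */
#if 0
/* Returns the parent's new level, or 0 if the nesting must be rejected. */
static uint16_t
example_nesting_ceiling(uint16_t parent_level, uint16_t child_level)
{
	/* A parent already nested at a lower level than the child could form a cycle. */
	if (parent_level > 0 && parent_level < child_level) {
		return 0;
	}

	/* A fresh parent starts at level 2 (its future child sits at level 1). */
	uint16_t plevel = (parent_level == 0) ? 2 : parent_level;

	/* The parent must always sit at least one level above the child. */
	if (plevel < child_level + 1) {
		if (child_level + 1 > MAX_NESTED_KQ) {
			return 0;       /* too deep: reject rather than risk the kernel stack */
		}
		plevel = child_level + 1;
	}
	return plevel;
}
#endif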
5198 
5199 __attribute__((noinline))
5200 static void
5201 kqfile_wakeup(struct kqfile *kqf, long hint, wait_result_t wr)
5202 {
5203 	/* wakeup a thread waiting on this queue */
5204 	selwakeup(&kqf->kqf_sel);
5205 
5206 	/* wake up threads in kqueue_scan() */
5207 	if (kqf->kqf_state & KQ_SLEEP) {
5208 		kqf->kqf_state &= ~KQ_SLEEP;
5209 		thread_wakeup_with_result(&kqf->kqf_count, wr);
5210 	}
5211 
5212 	if (hint == NOTE_REVOKE) {
5213 		/* wakeup threads waiting their turn to process */
5214 		if (kqf->kqf_state & KQ_PROCWAIT) {
5215 			assert(kqf->kqf_state & KQ_PROCESSING);
5216 			kqf->kqf_state &= ~KQ_PROCWAIT;
5217 			thread_wakeup(&kqf->kqf_suppressed);
5218 		}
5219 
5220 		/* no need to KNOTE: knote_fdclose() takes care of it */
5221 	} else {
5222 		/* wakeup other kqueues/select sets we're inside */
5223 		KNOTE(&kqf->kqf_sel.si_note, hint);
5224 	}
5225 }
5226 
5227 /*
5228  * kqueue_drain - called when kq is closed
5229  */
5230 static int
5231 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
5232 {
5233 	struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5234 
5235 	assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5236 
5237 	kqlock(kqf);
5238 	kqf->kqf_state |= KQ_DRAIN;
5239 	kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
5240 	kqunlock(kqf);
5241 	return 0;
5242 }
5243 
5244 int
5245 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
5246 {
5247 	assert((kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5248 
5249 	kqlock(kq);
5250 	if (isstat64 != 0) {
5251 		struct stat64 *sb64 = (struct stat64 *)ub;
5252 
5253 		bzero((void *)sb64, sizeof(*sb64));
5254 		sb64->st_size = kq->kq_count;
5255 		if (kq->kq_state & KQ_KEV_QOS) {
5256 			sb64->st_blksize = sizeof(struct kevent_qos_s);
5257 		} else if (kq->kq_state & KQ_KEV64) {
5258 			sb64->st_blksize = sizeof(struct kevent64_s);
5259 		} else if (IS_64BIT_PROCESS(p)) {
5260 			sb64->st_blksize = sizeof(struct user64_kevent);
5261 		} else {
5262 			sb64->st_blksize = sizeof(struct user32_kevent);
5263 		}
5264 		sb64->st_mode = S_IFIFO;
5265 	} else {
5266 		struct stat *sb = (struct stat *)ub;
5267 
5268 		bzero((void *)sb, sizeof(*sb));
5269 		sb->st_size = kq->kq_count;
5270 		if (kq->kq_state & KQ_KEV_QOS) {
5271 			sb->st_blksize = sizeof(struct kevent_qos_s);
5272 		} else if (kq->kq_state & KQ_KEV64) {
5273 			sb->st_blksize = sizeof(struct kevent64_s);
5274 		} else if (IS_64BIT_PROCESS(p)) {
5275 			sb->st_blksize = sizeof(struct user64_kevent);
5276 		} else {
5277 			sb->st_blksize = sizeof(struct user32_kevent);
5278 		}
5279 		sb->st_mode = S_IFIFO;
5280 	}
5281 	kqunlock(kq);
5282 	return 0;
5283 }
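
/*
 * Illustrative sketch (not part of the original source): what the stat
 * translation above looks like from userspace.  A plain kqueue reports
 * S_IFIFO, the pending event count in st_size, and the size of the kevent
 * structure flavor in use in st_blksize.
 */
#if 0
#include <sys/event.h>
#include <sys/stat.h>

static void
example_stat_kqueue(void)
{
	int kqfd = kqueue();
	struct stat sb;

	if (kqfd >= 0 && fstat(kqfd, &sb) == 0) {
		/* (sb.st_mode & S_IFMT) == S_IFIFO                       */
		/* sb.st_size    == number of pending events              */
		/* sb.st_blksize == sizeof() of the kevent flavor in use  */
	}
}
#endif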
5284 
5285 static inline bool
5286 kqueue_threadreq_can_use_ast(struct kqueue *kq)
5287 {
5288 	if (current_proc() == kq->kq_p) {
5289 		/*
5290 		 * Setting an AST from a non-BSD syscall is unsafe: mach_msg_trap() can
5291 		 * do combined send/receive and, in the case of self-IPC, the AST may be
5292 		 * set on a thread that will not return to userspace and needs the
5293 		 * thread the AST would create in order to unblock itself.
5294 		 *
5295 		 * At this time, we really want to target:
5296 		 *
5297 		 * - kevent variants that can cause thread creations, and dispatch
5298 		 *   really only uses kevent_qos and kevent_id,
5299 		 *
5300 		 * - workq_kernreturn (directly about thread creations)
5301 		 *
5302 		 * - bsdthread_ctl which is used for qos changes and has direct impact
5303 		 *   on the creator thread scheduling decisions.
5304 		 */
5305 		switch (current_uthread()->syscall_code) {
5306 		case SYS_kevent_qos:
5307 		case SYS_kevent_id:
5308 		case SYS_workq_kernreturn:
5309 		case SYS_bsdthread_ctl:
5310 			return true;
5311 		}
5312 	}
5313 	return false;
5314 }
5315 
5316 /*
5317  * Interact with the pthread kext to request a servicing there at a specific QoS
5318  * level.
5319  *
5320  * - Caller holds the kqlock
5321  *
5322  * - May be called with the kqueue's wait queue set locked,
5323  *   so cannot do anything that could recurse on that.
5324  */
5325 static void
5326 kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
5327     kq_index_t qos, int flags)
5328 {
5329 	assert(kqr_thread(kqr) == THREAD_NULL);
5330 	assert(!kqr_thread_requested(kqr));
5331 	struct turnstile *ts = TURNSTILE_NULL;
5332 
5333 	if (workq_is_exiting(kqu.kq->kq_p)) {
5334 		return;
5335 	}
5336 
5337 	kqlock_held(kqu);
5338 
5339 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5340 		struct kqworkloop *kqwl = kqu.kqwl;
5341 
5342 		assert(kqwl->kqwl_owner == THREAD_NULL);
5343 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5344 		    kqwl->kqwl_dynamicid, 0, qos, kqwl->kqwl_wakeup_qos);
5345 		ts = kqwl->kqwl_turnstile;
5346 		/* Add a thread request reference on the kqueue. */
5347 		kqworkloop_retain(kqwl);
5348 
5349 #if CONFIG_PREADOPT_TG
5350 		thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5351 			&kqwl->kqwl_preadopt_tg, relaxed);
5352 		if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5353 			/*
5354 			 * This kqwl has been permanently configured with a thread group.
5355 			 * See kqworkloops with scheduling parameters.
5356 			 */
5357 			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5358 		} else {
5359 			/*
5360 			 * This thread is the one which is ack-ing the thread group on the kqwl
5361 			 * under the kqlock and will take action accordingly; this pairs with
5362 			 * the release barrier in kqueue_set_preadopted_thread_group.
5363 			 */
5364 			uint16_t tg_acknowledged;
5365 			if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
5366 			    KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
5367 			    &tg_acknowledged, acquire)) {
5368 				flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5369 			}
5370 		}
5371 #endif
5372 	} else {
5373 		assert(kqu.kq->kq_state & KQ_WORKQ);
5374 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -1, 0, qos,
5375 		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5376 	}
5377 
5378 	/*
5379 	 * New-style thread request supported.
5380 	 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
5381 	 * its use until a corresponding kqueue_threadreq_bind callback.
5382 	 */
5383 	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5384 		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5385 	}
5386 	if (qos == KQWQ_QOS_MANAGER) {
5387 		qos = WORKQ_THREAD_QOS_MANAGER;
5388 	}
5389 
5390 	if (!workq_kern_threadreq_initiate(kqu.kq->kq_p, kqr, ts, qos, flags)) {
5391 		/*
5392 		 * Process is shutting down or exec'ing.
5393 		 * All the kqueues are going to be cleaned up
5394 		 * soon. Forget we even asked for a thread -
5395 		 * and make sure we don't ask for more.
5396 		 */
5397 		kqu.kq->kq_state &= ~KQ_R2K_ARMED;
5398 		kqueue_release_live(kqu);
5399 	}
5400 }
5401 
5402 /*
5403  * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5404  *
5405  * This is used when kqueue_threadreq_bind may cause a lock inversion.
5406  */
5407 __attribute__((always_inline))
5408 void
5409 kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
5410     struct uthread *ut)
5411 {
5412 	ut->uu_kqr_bound = kqr;
5413 	kqr->tr_thread = get_machthread(ut);
5414 	kqr->tr_state = WORKQ_TR_STATE_BINDING;
5415 }
5416 
5417 /*
5418  * kqueue_threadreq_bind_commit - commit a bind prepost
5419  *
5420  * The workq code has to commit any binding prepost before the thread has
5421  * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5422  */
5423 void
5424 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5425 {
5426 	struct uthread *ut = get_bsdthread_info(thread);
5427 	workq_threadreq_t kqr = ut->uu_kqr_bound;
5428 	kqueue_t kqu = kqr_kqueue(p, kqr);
5429 
5430 	kqlock(kqu);
5431 	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5432 		kqueue_threadreq_bind(p, kqr, thread, 0);
5433 	}
5434 	kqunlock(kqu);
5435 }
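
/*
 * Illustrative sketch (not part of the original source): the intended
 * prepost/commit sequence as seen from the workq side.  The surrounding
 * locking described in the comments and the wrapper function shown are
 * assumptions for illustration; only the ordering of the two calls defined
 * above is the point being made.
 */
#if 0
static void
example_workq_bind_sequence(struct proc *p, workq_threadreq_t kqr,
    struct uthread *ut)
{
	/*
	 * Phase 1: while taking the kqlock would invert lock ordering,
	 * only record the binding intent (tr_state becomes BINDING).
	 */
	kqueue_threadreq_bind_prepost(p, kqr, ut);

	/* ... drop the conflicting lock, set the thread up to run ... */

	/*
	 * Phase 2: before the thread can reach userspace (or be aborted),
	 * finish the bind under the kqlock.  This is a no-op if the request
	 * already moved past WORKQ_TR_STATE_BINDING.
	 */
	kqueue_threadreq_bind_commit(p, get_machthread(ut));
}
#endif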
5436 
5437 void
5438 kqworkloop_bound_thread_terminate(workq_threadreq_t kqr,
5439     uint16_t *uu_workq_flags_orig)
5440 {
5441 	struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
5442 	struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5443 
5444 	assert(uth == current_uthread());
5445 
5446 	kqlock(kqwl);
5447 
5448 	*uu_workq_flags_orig = uth->uu_workq_flags;
5449 
5450 	uth->uu_workq_flags &= ~UT_WORKQ_NEW;
5451 	uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_JOINED;
5452 	uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_FAILED;
5453 
5454 	workq_kern_bound_thread_reset_pri(NULL, uth);
5455 
5456 	kqunlock(kqwl);
5457 }
5458 
5459 /*
5460  * This is called from kqueue_process with kqlock held.
5461  */
5462 __attribute__((noreturn, noinline))
5463 static void
5464 kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread)
5465 {
5466 	assert(thread == current_thread());
5467 
5468 	kqlock_held(kqwl);
5469 
5470 	assert(!kqwl->kqwl_count);
5471 
5472 	/*
5473 	 * kevent entry points will take a reference on workloops so we need to
5474 	 * undo it before we park for good.
5475 	 */
5476 	kqworkloop_release_live(kqwl);
5477 
5478 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5479 	workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
5480 
5481 	if (trp.trp_flags & TRP_RELEASED) {
5482 		/*
5483 		 * We need this check since the kqlock is dropped and retaken
5484 		 * multiple times during kqueue_process and, because KQ_SLEEP is not
5485 		 * set, kqworkloop_bound_thread_wakeup is going to be a no-op.
5486 		 */
5487 		kqunlock(kqwl);
5488 		workq_kern_bound_thread_terminate(kqr);
5489 	} else {
5490 		kqworkloop_unbind_locked(kqwl,
5491 		    thread, KQWL_OVERRIDE_DROP_DELAYED, KQUEUE_THREADREQ_UNBIND_SOFT);
5492 		workq_kern_bound_thread_park(kqr);
5493 	}
5494 	__builtin_unreachable();
5495 }
5496 
5497 /*
5498  * A helper function for the pthread workqueue subsystem.
5499  *
5500  * This is used to keep the work that the workq code needs to do after
5501  * the bound thread's assert_wait to a minimum.
5502  */
5503 void
5504 kqworkloop_bound_thread_park_prepost(workq_threadreq_t kqr)
5505 {
5506 	assert(current_thread() == kqr->tr_thread);
5507 
5508 	struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5509 
5510 	kqlock_held(kqwl);
5511 
5512 	kqwl->kqwl_state |= KQ_SLEEP;
5513 
5514 	/* uu_kqueue_override is protected under kqlock. */
5515 	kqworkloop_unbind_delayed_override_drop(kqr->tr_thread);
5516 
5517 	kqunlock(kqwl);
5518 }
5519 
5520 /*
5521  * A helper function for the pthread workqueue subsystem.
5522  *
5523  * This is used to keep the work that the workq code needs to do after
5524  * the bound thread's assert_wait to a minimum.
5525  */
5526 void
5527 kqworkloop_bound_thread_park_commit(workq_threadreq_t kqr,
5528     event_t event,
5529     thread_continue_t continuation)
5530 {
5531 	assert(current_thread() == kqr->tr_thread);
5532 
5533 	struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5534 	struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
5535 
5536 	kqlock(kqwl);
5537 	if (!(kqwl->kqwl_state & KQ_SLEEP)) {
5538 		/*
5539 		 * When we dropped the kqlock to unset the voucher, someone came
5540 		 * around and made us runnable.  But because we weren't waiting on the
5541 		 * event, their thread_wakeup() was ineffectual.  To correct for that,
5542 		 * we just run the continuation ourselves.
5543 		 */
5544 		assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)));
5545 		if (uth->uu_workq_flags & UT_WORKQ_DYING) {
5546 			__assert_only workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
5547 			assert(trp.trp_flags & TRP_RELEASED);
5548 		}
5549 		kqunlock(kqwl);
5550 		continuation(NULL, THREAD_AWAKENED);
5551 	} else {
5552 		assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)) == 0);
5553 		thread_set_pending_block_hint(get_machthread(uth),
5554 		    kThreadWaitParkedBoundWorkQueue);
5555 		assert_wait(event, THREAD_INTERRUPTIBLE);
5556 		kqunlock(kqwl);
5557 		thread_block(continuation);
5558 	}
5559 }
5560 
5561 static void
5562 kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
5563     workq_kern_threadreq_flags_t flags)
5564 {
5565 	assert(kqr_thread_requested_pending(kqr));
5566 
5567 	kqlock_held(kqu);
5568 
5569 	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5570 		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5571 	}
5572 
5573 #if CONFIG_PREADOPT_TG
5574 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5575 		struct kqworkloop *kqwl = kqu.kqwl;
5576 		thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5577 			&kqwl->kqwl_preadopt_tg, relaxed);
5578 		if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5579 			/*
5580 			 * This kqwl has been permanently configured with a thread group.
5581 			 * See kqworkloops with scheduling parameters.
5582 			 */
5583 			flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5584 		} else {
5585 			uint16_t tg_ack_status;
5586 			/*
5587 			 * This thread is the one which is ack-ing the thread group on the kqwl
5588 			 * under the kqlock and will take action accordingly; this needs an
5589 			 * acquire barrier.
5590 			 */
5591 			if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
5592 			    KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
5593 				flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5594 			}
5595 		}
5596 	}
5597 #endif
5598 
5599 	workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
5600 }
5601 
5602 /*
5603  * kqueue_threadreq_bind - bind thread to processing kqrequest
5604  *
5605  * The provided thread will be responsible for delivering events
5606  * associated with the given kqrequest.  Bind it and get ready for
5607  * the thread to eventually arrive.
5608  */
5609 void
5610 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5611     unsigned int flags)
5612 {
5613 	kqueue_t kqu = kqr_kqueue(p, kqr);
5614 	struct uthread *ut = get_bsdthread_info(thread);
5615 
5616 	kqlock_held(kqu);
5617 
5618 	assert(ut->uu_kqueue_override == 0);
5619 
5620 	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5621 		assert(ut->uu_kqr_bound == kqr);
5622 		assert(kqr->tr_thread == thread);
5623 	} else if (kqr->tr_state == WORKQ_TR_STATE_BOUND) {
5624 		assert(flags & KQUEUE_THREADREQ_BIND_SOFT);
5625 		assert(kqr_thread_permanently_bound(kqr));
5626 	} else {
5627 		assert(kqr_thread_requested_pending(kqr));
5628 		assert(kqr->tr_thread == THREAD_NULL);
5629 		assert(ut->uu_kqr_bound == NULL);
5630 		ut->uu_kqr_bound = kqr;
5631 		kqr->tr_thread = thread;
5632 	}
5633 
5634 	kqr->tr_state = WORKQ_TR_STATE_BOUND;
5635 
5636 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
5637 		struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5638 
5639 		if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5640 			/*
5641 			 * <rdar://problem/38626999> shows that asserting here is not ok.
5642 			 *
5643 			 * This is not supposed to happen for correct use of the interface,
5644 			 * but it is sadly possible for userspace (with the help of memory
5645 			 * corruption, such as over-release of a dispatch queue) to make
5646 			 * the creator thread the "owner" of a workloop.
5647 			 *
5648 			 * Once that happens, and that creator thread picks up the same
5649 			 * workloop as a servicer, we trip this codepath. We need to fixup
5650 			 * the state to forget about this thread being the owner, as the
5651 			 * entire workloop state machine expects servicers to never be
5652 			 * owners and everything would basically go downhill from here.
5653 			 */
5654 			kqu.kqwl->kqwl_owner = THREAD_NULL;
5655 			if (kqworkloop_override(kqu.kqwl)) {
5656 				thread_drop_kevent_override(thread);
5657 			}
5658 		}
5659 
5660 		if (ts && (flags & KQUEUE_THREADREQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5661 			/*
5662 			 * Past this point, the interlock is the kq req lock again,
5663 			 * so we can fix the inheritor for good.
5664 			 */
5665 			filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5666 			turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5667 		}
5668 
5669 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5670 		    thread_tid(thread), kqr->tr_kq_qos_index,
5671 		    (kqr->tr_kq_override_index << 16) | kqu.kqwl->kqwl_wakeup_qos);
5672 
5673 		ut->uu_kqueue_override = kqr->tr_kq_override_index;
5674 		if (kqr->tr_kq_override_index) {
5675 			thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5676 		}
5677 
5678 #if CONFIG_PREADOPT_TG
5679 		/* Remove reference from kqwl and mark it as bound with the SENTINEL */
5680 		thread_group_qos_t old_tg;
5681 		thread_group_qos_t new_tg;
5682 		int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5683 			if ((old_tg == KQWL_PREADOPTED_TG_NEVER) || KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5684 			        /*
5685 			         * Either an app or a kqwl permanently configured with a thread group.
5686 			         * Nothing to do.
5687 			         */
5688 			        os_atomic_rmw_loop_give_up(break);
5689 			}
5690 			assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5691 			new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5692 		});
5693 
5694 		if (ret) {
5695 			KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5696 
5697 			if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5698 				struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5699 				assert(tg != NULL);
5700 
5701 				thread_set_preadopt_thread_group(thread, tg);
5702 				thread_group_release_live(tg); // The thread has a reference
5703 			} else {
5704 				/*
5705 				 * The thread may already have a preadopt thread group on it -
5706 				 * we need to make sure to clear that.
5707 				 */
5708 				thread_set_preadopt_thread_group(thread, NULL);
5709 			}
5710 
5711 			/* We have taken action on the preadopted thread group set on the
5712 			 * kqwl, clear any redrive requests */
5713 			os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5714 		} else {
5715 			if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5716 				struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5717 				assert(tg != NULL);
5718 				/*
5719 				 * For KQUEUE_THREADREQ_BIND_SOFT, technically the following
5720 				 * set_preadopt should be a no-op since this bound servicer thread
5721 				 * preadopts the kqwl's permanent tg at its initial bind time and
5722 				 * never leaves it until its termination.
5723 				 */
5724 				thread_set_preadopt_thread_group(thread, tg);
5725 				/*
5726 				 * From this point on, kqwl and thread both have +1 ref on this tg.
5727 				 */
5728 			}
5729 		}
5730 #endif
5731 		kqueue_update_iotier_override(kqu);
5732 	} else {
5733 		assert(kqr->tr_kq_override_index == 0);
5734 
5735 #if CONFIG_PREADOPT_TG
5736 		/*
5737 		 * The thread may have a preadopt thread group on it already because it
5738 		 * got tagged with it as a creator thread. So we need to make sure to
5739 		 * clear that since we don't have preadopt thread groups for non-kqwl
5740 		 * cases
5741 		 */
5742 		thread_set_preadopt_thread_group(thread, NULL);
5743 #endif
5744 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5745 		    thread_tid(thread), kqr->tr_kq_qos_index,
5746 		    (kqr->tr_kq_override_index << 16) |
5747 		    !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5748 	}
5749 }
5750 
5751 /*
5752  * kqueue_threadreq_cancel - abort a pending thread request
5753  *
5754  * Called when exiting/exec'ing. Forget our pending request.
5755  */
5756 void
5757 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5758 {
5759 	kqueue_release(kqr_kqueue(p, kqr));
5760 }
5761 
5762 workq_threadreq_param_t
5763 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5764 {
5765 	struct kqworkloop *kqwl;
5766 	workq_threadreq_param_t trp;
5767 
5768 	assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5769 	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5770 	trp.trp_value = kqwl->kqwl_params;
5771 	return trp;
5772 }
5773 
5774 /*
5775  *	kqueue_threadreq_unbind - unbind thread from processing kqueue
5776  *
5777  *	End processing the per-QoS bucket of events and allow other threads
5778  *	to be requested for future servicing.
5779  *
5780  *	caller holds a reference on the kqueue.
5781  */
5782 void
5783 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5784 {
5785 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5786 		kqworkloop_unbind(kqr_kqworkloop(kqr));
5787 	} else {
5788 		kqworkq_unbind(p, kqr);
5789 	}
5790 }
5791 
5792 /*
5793  * If we aren't already busy processing events [for this QoS],
5794  * request workq thread support as appropriate.
5795  *
5796  * TBD - for now, we don't segregate out processing by QoS.
5797  *
5798  * - May be called with the kqueue's wait queue set locked,
5799  *   so cannot do anything that could recurse on that.
5800  */
5801 static void
5802 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5803 {
5804 	workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5805 
5806 	/* convert to thread qos value */
5807 	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5808 
5809 	if (!kqr_thread_requested(kqr)) {
5810 		kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5811 	}
5812 }
5813 
5814 /*
5815  * This represents the asynchronous QoS a given workloop contributes,
5816  * hence is the max of the current active knotes (override index)
5817  * and the workloop max qos (userspace async qos).
5818  */
5819 static kq_index_t
5820 kqworkloop_override(struct kqworkloop *kqwl)
5821 {
5822 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5823 	return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5824 }
5825 
5826 static inline void
5827 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5828 {
5829 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5830 
5831 	kqlock_held(kqwl);
5832 
5833 	if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5834 		kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5835 		act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5836 	}
5837 }
5838 
5839 static void
5840 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
5841 {
5842 	workq_threadreq_t kqr = &kqwl->kqwl_request;
5843 	struct kqueue *kq = &kqwl->kqwl_kqueue;
5844 	kq_index_t old_override = kqworkloop_override(kqwl);
5845 
5846 	kqlock_held(kqwl);
5847 
5848 	switch (op) {
5849 	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
5850 		kqwl->kqwl_wakeup_qos = qos;
5851 		kqworkloop_request_fire_r2k_notification(kqwl);
5852 		goto recompute;
5853 
5854 	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
5855 		kqr->tr_kq_override_index = qos;
5856 		goto recompute;
5857 
5858 	case KQWL_UTQ_PARKING:
5859 	case KQWL_UTQ_UNBINDING:
5860 		kqr->tr_kq_override_index = qos;
5861 		OS_FALLTHROUGH;
5862 
5863 	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
5864 		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
5865 			assert(qos == THREAD_QOS_UNSPECIFIED);
5866 		}
5867 		if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5868 			kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5869 		}
5870 		kqwl->kqwl_wakeup_qos = 0;
5871 		for (kq_index_t i = KQWL_NBUCKETS; i > 0; i--) {
5872 			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - 1])) {
5873 				kqwl->kqwl_wakeup_qos = i;
5874 				kqworkloop_request_fire_r2k_notification(kqwl);
5875 				break;
5876 			}
5877 		}
5878 		OS_FALLTHROUGH;
5879 
5880 	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
5881 recompute:
5882 		/*
5883 		 * When modifying the wakeup QoS or the override QoS, we always need to
5884 		 * maintain our invariant that kqr_override_index is at least as large
5885 		 * as the highest QoS for which an event is fired.
5886 		 *
5887 		 * However this override index can be larger when there is an overridden
5888 		 * suppressed knote pushing on the kqueue.
5889 		 */
5890 		if (qos < kqwl->kqwl_wakeup_qos) {
5891 			qos = kqwl->kqwl_wakeup_qos;
5892 		}
5893 		if (kqr->tr_kq_override_index < qos) {
5894 			kqr->tr_kq_override_index = qos;
5895 		}
5896 		break;
5897 
5898 	case KQWL_UTQ_REDRIVE_EVENTS:
5899 		break;
5900 
5901 	case KQWL_UTQ_SET_QOS_INDEX:
5902 		kqr->tr_kq_qos_index = qos;
5903 		break;
5904 
5905 	default:
5906 		panic("unknown kqwl thread qos update operation: %d", op);
5907 	}
5908 
5909 	thread_t kqwl_owner = kqwl->kqwl_owner;
5910 	thread_t servicer = kqr_thread(kqr);
5911 	boolean_t qos_changed = FALSE;
5912 	kq_index_t new_override = kqworkloop_override(kqwl);
5913 
5914 	/*
5915 	 * Apply the diffs to the owner if applicable
5916 	 */
5917 	if (kqwl_owner) {
5918 #if 0
5919 		/* JMM - need new trace hooks for owner overrides */
5920 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
5921 		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
5922 		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5923 #endif
5924 		if (new_override == old_override) {
5925 			// nothing to do
5926 		} else if (old_override == THREAD_QOS_UNSPECIFIED) {
5927 			thread_add_kevent_override(kqwl_owner, new_override);
5928 		} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5929 			thread_drop_kevent_override(kqwl_owner);
5930 		} else { /*  old_override != new_override */
5931 			thread_update_kevent_override(kqwl_owner, new_override);
5932 		}
5933 	}
5934 
5935 	/*
5936 	 * apply the diffs to the servicer
5937 	 */
5938 
5939 	if (!kqr_thread_requested(kqr)) {
5940 		/*
5941 		 * No servicer, nor thread-request
5942 		 *
5943 		 * Make a new thread request, unless there is an owner (or the workloop
5944 		 * is suspended in userland) or if there is no asynchronous work in the
5945 		 * first place.
5946 		 */
5947 
5948 		if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
5949 			int initiate_flags = 0;
5950 			if (op == KQWL_UTQ_UNBINDING) {
5951 				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
5952 			}
5953 
5954 			/* kqueue_threadreq_initiate handles the acknowledgement of the TG
5955 			 * if needed */
5956 			kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
5957 		}
5958 	} else if (servicer) {
5959 		/*
5960 		 * Servicer in flight
5961 		 *
5962 		 * Just apply the diff to the servicer
5963 		 */
5964 
5965 #if CONFIG_PREADOPT_TG
5966 		/* When there's already a servicer for the kqwl, the servicer will
5967 		 * adopt the thread group in the kqr, so we don't need to poke the
5968 		 * workqueue subsystem into making different decisions due to the
5969 		 * thread group. Consider the current request ack-ed.
5970 		 */
5971 		os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5972 #endif
5973 
5974 		if (kqr_thread_permanently_bound(kqr) && (kqwl->kqwl_state & KQ_SLEEP)) {
5975 			kqr->tr_qos = new_override;
5976 			workq_kern_bound_thread_reset_pri(kqr, get_bsdthread_info(servicer));
5977 		} else {
5978 			struct uthread *ut = get_bsdthread_info(servicer);
5979 			if (ut->uu_kqueue_override != new_override) {
5980 				if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
5981 					thread_add_servicer_override(servicer, new_override);
5982 				} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5983 					thread_drop_servicer_override(servicer);
5984 				} else { /* ut->uu_kqueue_override != new_override */
5985 					thread_update_servicer_override(servicer, new_override);
5986 				}
5987 				ut->uu_kqueue_override = new_override;
5988 				qos_changed = TRUE;
5989 			}
5990 		}
5991 	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
5992 		/*
5993 		 * No events to deliver anymore.
5994 		 *
5995 		 * However canceling with turnstiles is challenging, so the fact that
5996 		 * the request isn't useful will be discovered by the servicer itself
5997 		 * later on.
5998 		 */
5999 	} else if (old_override != new_override) {
6000 		/*
6001 		 * Request is in flight
6002 		 *
6003 		 * Apply the diff to the thread request.
6004 		 */
6005 		kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
6006 		qos_changed = TRUE;
6007 	}
6008 
6009 	if (qos_changed) {
6010 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
6011 		    thread_tid(servicer), kqr->tr_kq_qos_index,
6012 		    (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
6013 	}
6014 }
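
/*
 * Illustrative sketch (not part of the original source): the override "diff"
 * pattern applied to the owner and servicer above, written out as a generic
 * helper.  The helper name is hypothetical; the thread_*_kevent_override
 * calls are the ones used in the owner path of the real code.
 */
#if 0
static void
example_apply_override_diff(thread_t t, kq_index_t old_override,
    kq_index_t new_override)
{
	if (new_override == old_override) {
		/* nothing to do */
	} else if (old_override == THREAD_QOS_UNSPECIFIED) {
		thread_add_kevent_override(t, new_override);    /* first override */
	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
		thread_drop_kevent_override(t);                 /* last override gone */
	} else {
		thread_update_kevent_override(t, new_override); /* change in place */
	}
}
#endif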
6015 
6016 static void
6017 kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
6018 {
6019 	workq_threadreq_t kqr = &kqwl->kqwl_request;
6020 	thread_t servicer = kqr_thread(kqr);
6021 	uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
6022 
6023 	kqlock_held(kqwl);
6024 
6025 	if (servicer) {
6026 		thread_update_servicer_iotier_override(servicer, iotier);
6027 	}
6028 }
6029 
6030 static void
6031 kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl)
6032 {
6033 	workq_threadreq_t kqr = &kqwl->kqwl_request;
6034 
6035 	kqlock_held(kqwl);
6036 
6037 	assert(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
6038 
6039 	__assert_only struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
6040 	assert(workq_thread_is_permanently_bound(uth));
6041 
6042 	/*
6043 	 * The bound thread takes on the responsibility of setting the KQ_SLEEP bit
6044 	 * on its way to parking. See kqworkloop_bound_thread_park_prepost.
6045 	 * This state is always manipulated under kqlock.
6046 	 */
6047 	if (kqwl->kqwl_state & KQ_SLEEP) {
6048 		kqwl->kqwl_state &= ~KQ_SLEEP;
6049 		kqueue_threadreq_bind(current_proc(),
6050 		    kqr, kqr->tr_thread, KQUEUE_THREADREQ_BIND_SOFT);
6051 		workq_kern_bound_thread_wakeup(kqr);
6052 	}
6053 }
6054 
6055 static void
6056 kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
6057 {
6058 	if (qos <= kqwl->kqwl_wakeup_qos) {
6059 		/*
6060 		 * Shortcut wakeups that really do nothing useful
6061 		 */
6062 		return;
6063 	}
6064 
6065 	if ((kqwl->kqwl_state & KQ_PROCESSING) &&
6066 	    kqr_thread(&kqwl->kqwl_request) == current_thread()) {
6067 		/*
6068 		 * kqworkloop_end_processing() will perform the required QoS
6069 		 * computations when it unsets the processing mode.
6070 		 */
6071 		return;
6072 	}
6073 
6074 	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
6075 
6076 	/*
6077 	 * In the case of a thread-bound kqwl, we let kqworkloop_update_threads_qos
6078 	 * take care of overriding the servicer first before waking it up. This
6079 	 * simplifies the soft bind of the parked bound thread later.
6080 	 */
6081 	if (kqr_thread_permanently_bound(&kqwl->kqwl_request)) {
6082 		kqworkloop_bound_thread_wakeup(kqwl);
6083 	}
6084 }
6085 
6086 static struct kqtailq *
6087 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
6088 {
6089 	if (kq.kq->kq_state & KQ_WORKLOOP) {
6090 		return &kq.kqwl->kqwl_suppressed;
6091 	} else if (kq.kq->kq_state & KQ_WORKQ) {
6092 		return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - 1];
6093 	} else {
6094 		return &kq.kqf->kqf_suppressed;
6095 	}
6096 }
6097 
6098 struct turnstile *
6099 kqueue_alloc_turnstile(kqueue_t kqu)
6100 {
6101 	struct kqworkloop *kqwl = kqu.kqwl;
6102 	kq_state_t kq_state;
6103 
6104 	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
6105 	if (kq_state & KQ_HAS_TURNSTILE) {
6106 		/* force a dependency to pair with the atomic or with release below */
6107 		return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
6108 		           (uintptr_t)kq_state);
6109 	}
6110 
6111 	if (!(kq_state & KQ_WORKLOOP)) {
6112 		return TURNSTILE_NULL;
6113 	}
6114 
6115 	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
6116 	bool workq_locked = false;
6117 
6118 	kqlock(kqu);
6119 
6120 	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
6121 		workq_locked = true;
6122 		workq_kern_threadreq_lock(kqwl->kqwl_p);
6123 	}
6124 
6125 	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
6126 		free_ts = ts;
6127 		ts = kqwl->kqwl_turnstile;
6128 	} else {
6129 		ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
6130 		    ts, TURNSTILE_WORKLOOPS);
6131 
6132 		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
6133 		os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);
6134 
6135 		if (filt_wlturnstile_interlock_is_workq(kqwl)) {
6136 			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
6137 			    &kqwl->kqwl_request, kqwl->kqwl_owner,
6138 			    ts, TURNSTILE_IMMEDIATE_UPDATE);
6139 			/*
6140 			 * The workq may no longer be the interlock after this,
6141 			 * in which case the inheritor wasn't updated.
6142 			 */
6143 		}
6144 		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
6145 			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
6146 		}
6147 	}
6148 
6149 	if (workq_locked) {
6150 		workq_kern_threadreq_unlock(kqwl->kqwl_p);
6151 	}
6152 
6153 	kqunlock(kqu);
6154 
6155 	if (free_ts) {
6156 		turnstile_deallocate(free_ts);
6157 	} else {
6158 		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
6159 	}
6160 	return ts;
6161 }
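
/*
 * Illustrative sketch (not part of the original source): the lazy
 * publication pattern used by kqueue_alloc_turnstile() above, reduced to a
 * generic object.  The structure, flag, and helper names are hypothetical;
 * the ordering (store the pointer, then set the flag with release; load the
 * flag, then load the pointer with a dependency) is the part that matters.
 */
#if 0
struct example_lazy {
	uint32_t  el_state;     /* EL_HAS_OBJ published with release */
	void     *el_obj;
};
#define EL_HAS_OBJ 0x1

static void *
example_lazy_get(struct example_lazy *el, void *(*alloc_fn)(void),
    void (*free_fn)(void *))
{
	uint32_t state = os_atomic_load(&el->el_state, dependency);
	if (state & EL_HAS_OBJ) {
		/* dependency-ordered load pairs with the release store below */
		return os_atomic_load_with_dependency_on(&el->el_obj, (uintptr_t)state);
	}

	void *obj = alloc_fn(), *free_obj = NULL;

	/* <take the interlock protecting el_obj here> */
	if (os_atomic_load(&el->el_state, relaxed) & EL_HAS_OBJ) {
		free_obj = obj;                 /* we lost the race */
		obj = el->el_obj;
	} else {
		el->el_obj = obj;               /* publish the pointer first... */
		os_atomic_or(&el->el_state, EL_HAS_OBJ, release); /* ...then the flag */
	}
	/* <drop the interlock> */

	if (free_obj) {
		free_fn(free_obj);
	}
	return obj;
}
#endif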
6162 
6163 __attribute__((always_inline))
6164 struct turnstile *
6165 kqueue_turnstile(kqueue_t kqu)
6166 {
6167 	kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
6168 	if (kq_state & KQ_WORKLOOP) {
6169 		return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
6170 	}
6171 	return TURNSTILE_NULL;
6172 }
6173 
6174 __attribute__((always_inline))
6175 struct turnstile *
6176 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
6177 {
6178 	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
6179 	if (kqwl) {
6180 		return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
6181 	}
6182 	return TURNSTILE_NULL;
6183 }
6184 
6185 static void
6186 kqworkloop_set_overcommit(struct kqworkloop *kqwl)
6187 {
6188 	workq_threadreq_t kqr = &kqwl->kqwl_request;
6189 
6190 	/*
6191 	 * This test is racy, but since we never remove this bit,
6192 	 * it allows us to avoid taking a lock.
6193 	 */
6194 	if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
6195 		return;
6196 	}
6197 
6198 	kqlock_held(kqwl);
6199 
6200 	if (kqr_thread_requested_pending(kqr)) {
6201 		kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
6202 		    WORKQ_THREADREQ_MAKE_OVERCOMMIT);
6203 	} else {
6204 		kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
6205 	}
6206 }
6207 
6208 static void
6209 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
6210     kq_index_t override_index)
6211 {
6212 	workq_threadreq_t kqr;
6213 	kq_index_t old_override_index;
6214 	kq_index_t queue_index = kn->kn_qos_index;
6215 
6216 	if (override_index <= queue_index) {
6217 		return;
6218 	}
6219 
6220 	kqr = kqworkq_get_request(kqwq, queue_index);
6221 
6222 	kqlock_held(kqwq);
6223 
6224 	old_override_index = kqr->tr_kq_override_index;
6225 	if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
6226 		thread_t servicer = kqr_thread(kqr);
6227 		kqr->tr_kq_override_index = override_index;
6228 
6229 		/* apply the override to [incoming?] servicing thread */
6230 		if (servicer) {
6231 			if (old_override_index) {
6232 				thread_update_kevent_override(servicer, override_index);
6233 			} else {
6234 				thread_add_kevent_override(servicer, override_index);
6235 			}
6236 		}
6237 	}
6238 }
6239 
6240 static void
6241 kqueue_update_iotier_override(kqueue_t kqu)
6242 {
6243 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6244 		kqworkloop_update_iotier_override(kqu.kqwl);
6245 	}
6246 }
6247 
6248 static void
6249 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
6250 {
6251 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6252 		kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
6253 		    qos);
6254 	} else {
6255 		kqworkq_update_override(kqu.kqwq, kn, qos);
6256 	}
6257 }
6258 
6259 static void
6260 kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
6261     enum kqwl_unbind_locked_mode how, unsigned int flags)
6262 {
6263 	struct uthread *ut = get_bsdthread_info(thread);
6264 	workq_threadreq_t kqr = &kqwl->kqwl_request;
6265 
6266 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
6267 	    thread_tid(thread), 0, 0);
6268 
6269 	kqlock_held(kqwl);
6270 
6271 	assert(ut->uu_kqr_bound == kqr);
6272 
6273 	if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
6274 		ut->uu_kqr_bound = NULL;
6275 	}
6276 
6277 	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
6278 	    ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
6279 		thread_drop_servicer_override(thread);
6280 		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
6281 	}
6282 
6283 	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
6284 		turnstile_update_inheritor(kqwl->kqwl_turnstile,
6285 		    TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
6286 		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
6287 		    TURNSTILE_INTERLOCK_HELD);
6288 	}
6289 
6290 #if CONFIG_PREADOPT_TG
6291 	/* The kqueue is able to adopt a thread group again */
6292 
6293 	thread_group_qos_t old_tg, new_tg = NULL;
6294 	int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
6295 		new_tg = old_tg;
6296 		if (old_tg == KQWL_PREADOPTED_TG_SENTINEL || old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
6297 		        new_tg = KQWL_PREADOPTED_TG_NULL;
6298 		}
6299 	});
6300 
6301 	if (ret) {
6302 		if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) &&
6303 		    KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
6304 			// The permanently configured bound thread remains a part of the
6305 			// thread group until its termination.
6306 		} else {
6307 			// Servicer can drop any preadopt thread group it has since it has
6308 			// unbound.
6309 			KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);
6310 			thread_set_preadopt_thread_group(thread, NULL);
6311 		}
6312 	}
6313 #endif
6314 	thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);
6315 
6316 	if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
6317 		kqr->tr_thread = THREAD_NULL;
6318 		kqr->tr_state = WORKQ_TR_STATE_IDLE;
6319 	}
6320 	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
6321 }
6322 
6323 static void
6324 kqworkloop_unbind_delayed_override_drop(thread_t thread)
6325 {
6326 	struct uthread *ut = get_bsdthread_info(thread);
6327 	if (!workq_thread_is_permanently_bound(ut)) {
6328 		assert(ut->uu_kqr_bound == NULL);
6329 	}
6330 	if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
6331 		thread_drop_servicer_override(thread);
6332 		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
6333 	}
6334 }
6335 
6336 /*
6337  *	kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
6338  *
6339  *	It will acknowledge events, and possibly request a new thread if:
6340  *	- there were active events left
6341  *	- we pended waitq hook callouts during processing
6342  *	- we pended wakeups while processing (or unsuppressing)
6343  *
6344  *	Called with kqueue lock held.
6345  */
6346 static void
6347 kqworkloop_unbind(struct kqworkloop *kqwl)
6348 {
6349 	struct kqueue *kq = &kqwl->kqwl_kqueue;
6350 	workq_threadreq_t kqr = &kqwl->kqwl_request;
6351 	thread_t thread = kqr_thread_fast(kqr);
6352 	int op = KQWL_UTQ_PARKING;
6353 	kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
6354 
6355 	/*
6356 	 * For kqwl permanently bound to a thread, this path is only
6357 	 * exercised when the thread is on its way to terminate.
6358 	 * We don't care about asking for a new thread in that case.
6359 	 */
6360 	bool kqwl_had_bound_thread = kqr_thread_permanently_bound(kqr);
6361 
6362 	assert(thread == current_thread());
6363 
6364 	kqlock(kqwl);
6365 
6366 	if (!kqwl_had_bound_thread) {
6367 		/*
6368 		 * Forcing the KQ_PROCESSING flag ensures that QoS updates caused by
6369 		 * unsuppressing knotes are not applied until the eventual call to
6370 		 * kqworkloop_update_threads_qos() below.
6371 		 */
6372 		assert((kq->kq_state & KQ_PROCESSING) == 0);
6373 		if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
6374 			kq->kq_state |= KQ_PROCESSING;
6375 			qos_override = kqworkloop_acknowledge_events(kqwl);
6376 			kq->kq_state &= ~KQ_PROCESSING;
6377 		}
6378 	}
6379 
6380 	kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
6381 
6382 	if (!kqwl_had_bound_thread) {
6383 		kqworkloop_update_threads_qos(kqwl, op, qos_override);
6384 	}
6385 
6386 	kqunlock(kqwl);
6387 
6388 	/*
6389 	 * Drop the override on the current thread last, after the call to
6390 	 * kqworkloop_update_threads_qos above.
6391 	 */
6392 	kqworkloop_unbind_delayed_override_drop(thread);
6393 
6394 	/* If last reference, dealloc the workloop kq */
6395 	kqworkloop_release(kqwl);
6396 }
6397 
6398 static thread_qos_t
6399 kqworkq_unbind_locked(struct kqworkq *kqwq,
6400     workq_threadreq_t kqr, thread_t thread)
6401 {
6402 	struct uthread *ut = get_bsdthread_info(thread);
6403 	kq_index_t old_override = kqr->tr_kq_override_index;
6404 
6405 	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
6406 	    thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);
6407 
6408 	kqlock_held(kqwq);
6409 
6410 	assert(ut->uu_kqr_bound == kqr);
6411 	ut->uu_kqr_bound = NULL;
6412 	kqr->tr_thread = THREAD_NULL;
6413 	kqr->tr_state = WORKQ_TR_STATE_IDLE;
6414 	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
6415 	kqwq->kqwq_state &= ~KQ_R2K_ARMED;
6416 
6417 	return old_override;
6418 }
6419 
6420 /*
6421  *	kqworkq_unbind - unbind of a workq kqueue from a thread
6422  *
6423  *	We may have to request new threads.
6424  *	This can happen when there are no waiting processing threads and:
6425  *	- there were active events we never got to (count > 0)
6426  *	- we pended waitq hook callouts during processing
6427  *	- we pended wakeups while processing (or unsuppressing)
6428  */
6429 static void
6430 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
6431 {
6432 	struct kqworkq *kqwq = (struct kqworkq *)p->p_fd.fd_wqkqueue;
6433 	__assert_only int rc;
6434 
6435 	kqlock(kqwq);
6436 	rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
6437 	assert(rc == -1);
6438 	kqunlock(kqwq);
6439 }
6440 
6441 workq_threadreq_t
6442 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
6443 {
6444 	assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
6445 	return &kqwq->kqwq_request[qos_index - 1];
6446 }
6447 
6448 static void
6449 knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
6450 {
6451 	kq_index_t qos = _pthread_priority_thread_qos(pp);
6452 
6453 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
6454 		assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
6455 		pp = _pthread_priority_normalize(pp);
6456 	} else if (kqu.kq->kq_state & KQ_WORKQ) {
6457 		if (qos == THREAD_QOS_UNSPECIFIED) {
6458 			/* On workqueues, outside of QoS means MANAGER */
6459 			qos = KQWQ_QOS_MANAGER;
6460 			pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
6461 		} else {
6462 			pp = _pthread_priority_normalize(pp);
6463 		}
6464 	} else {
6465 		pp = _pthread_unspecified_priority();
6466 		qos = THREAD_QOS_UNSPECIFIED;
6467 	}
6468 
6469 	kn->kn_qos = (int32_t)pp;
6470 
6471 	if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
6472 		/* Never lower QoS when in "Merge" mode */
6473 		kn->kn_qos_override = qos;
6474 	}
6475 
6476 	/* only adjust in-use qos index when not suppressed */
6477 	if (kn->kn_status & KN_SUPPRESSED) {
6478 		kqueue_update_override(kqu, kn, qos);
6479 	} else if (kn->kn_qos_index != qos) {
6480 		knote_dequeue(kqu, kn);
6481 		kn->kn_qos_index = qos;
6482 	}
6483 }
6484 
6485 static void
6486 knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
6487 {
6488 	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
6489 
6490 	kqlock_held(kq);
6491 
6492 	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
6493 	assert(qos_index < THREAD_QOS_LAST);
6494 
6495 	/*
6496 	 * Early exit for knotes that should not change QoS
6497 	 */
6498 	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
6499 		panic("filter %d cannot change QoS", kn->kn_filtid);
6500 	} else if (__improbable(!knote_has_qos(kn))) {
6501 		return;
6502 	}
6503 
6504 	/*
6505 	 * knotes with the FALLBACK flag will only use their registration QoS if the
6506 	 * incoming event has no QoS; otherwise, the registration QoS acts as a floor.
6507 	 */
6508 	thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
6509 	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
6510 		if (qos_index == THREAD_QOS_UNSPECIFIED) {
6511 			qos_index = req_qos;
6512 		}
6513 	} else {
6514 		if (qos_index < req_qos) {
6515 			qos_index = req_qos;
6516 		}
6517 	}
6518 	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
6519 		/* Never lower QoS when in "Merge" mode */
6520 		return;
6521 	}
6522 
6523 	if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
6524 		/*
6525 		 * When we're trying to update the QoS override and both an
6526 		 * f_event() and other f_* calls are running concurrently, any of these
6527 		 * in flight calls may want to perform overrides that aren't properly
6528 		 * serialized with each other.
6529 		 *
6530 		 * The first update that observes this racy situation enters a "Merge"
6531 		 * mode which causes subsequent override requests to saturate the
6532 		 * override instead of replacing its value.
6533 		 *
6534 		 * This mode is left when knote_unlock() or knote_post()
6535 		 * observe that no other f_* routine is in flight.
6536 		 */
6537 		kn->kn_status |= KN_MERGE_QOS;
6538 	}
6539 
6540 	/*
6541 	 * Now apply the override if it changed.
6542 	 */
6543 
6544 	if (kn->kn_qos_override == qos_index) {
6545 		return;
6546 	}
6547 
6548 	kn->kn_qos_override = qos_index;
6549 
6550 	if (kn->kn_status & KN_SUPPRESSED) {
6551 		/*
6552 		 * For suppressed events, the kn_qos_index field cannot be touched as it
6553 		 * allows us to know on which suppress queue the knote is for a kqworkq.
6554 		 *
6555 		 * Also, there's no natural push applied on the kqueues when this field
6556 		 * changes anyway. We hence need to apply manual overrides in this case,
6557 		 * which will be cleared when the events are later acknowledged.
6558 		 */
6559 		kqueue_update_override(kq, kn, qos_index);
6560 	} else if (kn->kn_qos_index != qos_index) {
6561 		knote_dequeue(kq, kn);
6562 		kn->kn_qos_index = qos_index;
6563 	}
6564 }
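
/*
 * Illustrative sketch (not part of the original source): the QoS resolution
 * implemented by knote_adjust_qos() above, extracted into a pure helper.
 * The helper name is hypothetical.
 */
#if 0
static thread_qos_t
example_resolve_event_qos(thread_qos_t event_qos, thread_qos_t req_qos,
    bool fallback, bool merging, thread_qos_t cur_override)
{
	if (fallback) {
		/* registration QoS is only used when the event carries none */
		if (event_qos == THREAD_QOS_UNSPECIFIED) {
			event_qos = req_qos;
		}
	} else {
		/* registration QoS acts as a floor */
		if (event_qos < req_qos) {
			event_qos = req_qos;
		}
	}
	if (merging && event_qos < cur_override) {
		/* "Merge" mode only ever saturates the override upward */
		return cur_override;
	}
	return event_qos;
}
#endif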
6565 
6566 void
6567 klist_init(struct klist *list)
6568 {
6569 	SLIST_INIT(list);
6570 }
6571 
6572 
6573 /*
6574  *	Query/Post each knote in the object's list
6575  *
6576  *	The object lock protects the list. It is assumed that the filter/event
6577  *	routine for the object can determine that the object is already locked (via
6578  *	the hint) and not deadlock itself.
6579  *
6580  *	Autodetach is a specific contract which will detach all knotes from the
6581  *	object prior to posting the final event for that knote. This is done while
6582  *	under the object lock. A breadcrumb is left in the knote's next pointer to
6583  *	indicate to future calls to f_detach routines that they need not reattempt
6584  *	to knote_detach from the object's klist again. This is currently used by
6585  *	EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6586  *
6587  */
6588 void
6589 knote(struct klist *list, long hint, bool autodetach)
6590 {
6591 	struct knote *kn;
6592 	struct knote *tmp_kn;
6593 	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
6594 		/*
6595 		 * We can modify the knote's next pointer since we are holding the
6596 		 * object lock and the list can't be concurrently modified. Anyone
6597 		 * determining auto-detached-ness of a knote should take the primitive lock
6598 		 * to synchronize.
6599 		 *
6600 		 * Note that we do this here instead of the filter's f_event since we may
6601 		 * not even post the event if the knote is being dropped.
6602 		 */
6603 		if (autodetach) {
6604 			kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
6605 		}
6606 		knote_post(kn, hint);
6607 	}
6608 
6609 	/* Blast away the entire klist */
6610 	if (autodetach) {
6611 		klist_init(list);
6612 	}
6613 }
6614 
6615 /*
6616  * attach a knote to the specified list.  Return true if this is the first entry.
6617  * The list is protected by whatever lock the object it is associated with uses.
6618  */
6619 int
6620 knote_attach(struct klist *list, struct knote *kn)
6621 {
6622 	int ret = SLIST_EMPTY(list);
6623 	SLIST_INSERT_HEAD(list, kn, kn_selnext);
6624 	return ret;
6625 }
6626 
6627 /*
6628  * detach a knote from the specified list.  Return true if that was the last
6629  * entry.  The list is protected by whatever lock the object it is associated
6630  * with uses.
6631  */
6632 int
6633 knote_detach(struct klist *list, struct knote *kn)
6634 {
6635 	assert(!KNOTE_IS_AUTODETACHED(kn));
6636 
6637 	SLIST_REMOVE(list, kn, knote, kn_selnext);
6638 	return SLIST_EMPTY(list);
6639 }
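
/*
 * Illustrative sketch (not part of the original source): the typical way an
 * event source uses the klist helpers above from its filter's f_attach /
 * f_detach routines and from its own state changes.  The structure, lock,
 * and function names are hypothetical placeholders for whatever object owns
 * the klist (assumed to be initialized with klist_init() at creation time).
 */
#if 0
struct example_source {
	lck_mtx_t     es_lock;          /* the "object lock" protecting es_klist */
	struct klist  es_klist;         /* knotes interested in this source */
};

static int
example_filt_attach(struct knote *kn, struct example_source *es)
{
	lck_mtx_lock(&es->es_lock);
	int first = knote_attach(&es->es_klist, kn);    /* true if list was empty */
	lck_mtx_unlock(&es->es_lock);
	return first;
}

static void
example_filt_detach(struct knote *kn, struct example_source *es)
{
	lck_mtx_lock(&es->es_lock);
	(void)knote_detach(&es->es_klist, kn);
	lck_mtx_unlock(&es->es_lock);
}

static void
example_source_event(struct example_source *es, long hint)
{
	lck_mtx_lock(&es->es_lock);
	knote(&es->es_klist, hint, false);      /* post to every attached knote */
	lck_mtx_unlock(&es->es_lock);
}
#endif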
6640 
6641 /*
6642  * knote_vanish - Indicate that the source has vanished
6643  *
6644  * Used only for vanishing ports - vanishing fds go
6645  * through knote_fdclose()
6646  *
6647  * If the knote has requested EV_VANISHED delivery,
6648  * arrange for that. Otherwise, deliver a NOTE_REVOKE
6649  * event for backward compatibility.
6650  *
6651  * The knote is marked as having vanished. The source's
6652  * reference to the knote is dropped by the caller, but the knote's
6653  * source reference is only cleaned up later when the knote is dropped.
6654  *
6655  * Our caller already has the object lock held. Calling
6656  * the detach routine would try to take that lock
6657  * recursively - which likely is not supported.
6658  */
6659 void
6660 knote_vanish(struct klist *list, bool make_active)
6661 {
6662 	struct knote *kn;
6663 	struct knote *kn_next;
6664 
6665 	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6666 		struct kqueue *kq = knote_get_kq(kn);
6667 
6668 		kqlock(kq);
6669 		if (__probable(kn->kn_status & KN_REQVANISH)) {
6670 			/*
6671 			 * If EV_VANISH supported - prepare to deliver one
6672 			 */
6673 			kn->kn_status |= KN_VANISHED;
6674 		} else {
6675 			/*
6676 			 * Handle the legacy way to indicate that the port/portset was
6677 			 * deallocated or left the current Mach portspace (modern technique
6678 			 * is with an EV_VANISHED protocol).
6679 			 *
6680 			 * Deliver an EV_EOF event for these changes (hopefully it will get
6681 			 * delivered before the port name recycles to the same generation
6682 			 * count and someone tries to re-register a kevent for it or the
6683 			 * events are udata-specific - avoiding a conflict).
6684 			 */
6685 			kn->kn_flags |= EV_EOF | EV_ONESHOT;
6686 		}
6687 		if (make_active) {
6688 			knote_activate(kq, kn, FILTER_ACTIVE);
6689 		}
6690 		kqunlock(kq);
6691 	}
6692 }
6693 
6694 /*
6695  * remove all knotes referencing a specified fd
6696  *
6697  * Entered with the proc_fd lock already held.
6698  * It returns the same way, but may drop it temporarily.
6699  */
6700 void
6701 knote_fdclose(struct proc *p, int fd)
6702 {
6703 	struct filedesc *fdt = &p->p_fd;
6704 	struct klist *list;
6705 	struct knote *kn;
6706 	KNOTE_LOCK_CTX(knlc);
6707 
6708 restart:
6709 	list = &fdt->fd_knlist[fd];
6710 	SLIST_FOREACH(kn, list, kn_link) {
6711 		struct kqueue *kq = knote_get_kq(kn);
6712 
6713 		kqlock(kq);
6714 
6715 		if (kq->kq_p != p) {
6716 			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6717 			    __func__, kq->kq_p, p);
6718 		}
6719 
6720 		/*
6721 		 * If the knote supports EV_VANISHED delivery,
6722 		 * transition it to vanished mode (or skip over
6723 		 * it if already vanished).
6724 		 */
6725 		if (kn->kn_status & KN_VANISHED) {
6726 			kqunlock(kq);
6727 			continue;
6728 		}
6729 
6730 		proc_fdunlock(p);
6731 		if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
6732 			/* the knote was dropped by someone, nothing to do */
6733 		} else if (kn->kn_status & KN_REQVANISH) {
6734 			/*
6735 			 * Since we have REQVANISH for this knote, we need to notify clients about
6736 			 * the EV_VANISHED.
6737 			 *
6738 			 * But unlike mach ports, we want to do the detach here as well and not
6739 			 * defer it so that we can release the iocount that is on the knote and
6740 			 * close the fp.
6741 			 */
6742 			kn->kn_status |= KN_VANISHED;
6743 
6744 			/*
6745 			 * There may be a concurrent post happening, make sure to wait for it
6746 			 * before we detach. knote_wait_for_post() unlocks the kq on exit.
6747 			 */
6748 			knote_wait_for_post(kq, kn);
6749 
6750 			knote_fops(kn)->f_detach(kn);
6751 			if (kn->kn_is_fd) {
6752 				fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6753 			}
6754 			kn->kn_filtid = EVFILTID_DETACHED;
6755 			kqlock(kq);
6756 
6757 			knote_activate(kq, kn, FILTER_ACTIVE);
6758 			knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
6759 		} else {
6760 			knote_drop(kq, kn, &knlc);
6761 		}
6762 
6763 		proc_fdlock(p);
6764 		goto restart;
6765 	}
6766 }
6767 
6768 /*
6769  * knote_fdfind - lookup a knote in the fd table for process
6770  *
6771  * If the filter is file-based, lookup based on fd index.
6772  * Otherwise use a hash based on the ident.
6773  *
6774  * Matching is based on kq, filter, and ident. Optionally,
6775  * it may also be based on the udata field in the kevent -
6776  * allowing multiple event registration for the file object
6777  * per kqueue.
6778  *
6779  * fd_knhashlock or fdlock held on entry (and exit)
6780  */
6781 static struct knote *
6782 knote_fdfind(struct kqueue *kq,
6783     const struct kevent_internal_s *kev,
6784     bool is_fd,
6785     struct proc *p)
6786 {
6787 	struct filedesc *fdp = &p->p_fd;
6788 	struct klist *list = NULL;
6789 	struct knote *kn = NULL;
6790 
6791 	/*
6792 	 * determine where to look for the knote
6793 	 */
6794 	if (is_fd) {
6795 		/* fd-based knotes are linked off the fd table */
6796 		if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6797 			list = &fdp->fd_knlist[kev->kei_ident];
6798 		}
6799 	} else if (fdp->fd_knhashmask != 0) {
6800 		/* hash non-fd knotes here too */
6801 		list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6802 	}
6803 
6804 	/*
6805 	 * scan the selected list looking for a match
6806 	 */
6807 	if (list != NULL) {
6808 		SLIST_FOREACH(kn, list, kn_link) {
6809 			if (kq == knote_get_kq(kn) &&
6810 			    kev->kei_ident == kn->kn_id &&
6811 			    kev->kei_filter == kn->kn_filter) {
6812 				if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6813 					if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6814 					    kev->kei_udata == kn->kn_udata) {
6815 						break; /* matching udata-specific knote */
6816 					}
6817 				} else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6818 					break; /* matching non-udata-specific knote */
6819 				}
6820 			}
6821 		}
6822 	}
6823 	return kn;
6824 }
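
/*
 * Illustrative sketch (not compiled) of how EV_UDATA_SPECIFIC changes the
 * matching performed by knote_fdfind(). The identifiers (fd, udata_a,
 * udata_b) are made up for the example.
 *
 *	// Without EV_UDATA_SPECIFIC, a second EV_ADD for the same
 *	// (kq, ident, filter) triple finds the existing knote and updates it.
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, udata_a);
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, udata_b);   // updates knote
 *
 *	// With EV_UDATA_SPECIFIC, udata participates in the match, so the same
 *	// triple can carry one knote per distinct udata value.
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC, 0, 0, udata_a);
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC, 0, 0, udata_b); // second knote
 */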
6825 
6826 /*
6827  * kq_add_knote - Add knote to the fd table for process
6828  * while checking for duplicates.
6829  *
6830  * All file-based filters associate a list of knotes by file
6831  * descriptor index. All other filters hash the knote by ident.
6832  *
6833  * May have to grow the table of knote lists to cover the
6834  * file descriptor index presented.
6835  *
6836  * fd_knhashlock and fdlock unheld on entry (and exit).
6837  *
6838  * Takes a rwlock boost if inserting the knote is successful.
6839  */
6840 static int
6841 kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
6842     struct proc *p)
6843 {
6844 	struct filedesc *fdp = &p->p_fd;
6845 	struct klist *list = NULL;
6846 	int ret = 0;
6847 	bool is_fd = kn->kn_is_fd;
6848 
6849 	if (is_fd) {
6850 		proc_fdlock(p);
6851 	} else {
6852 		knhash_lock(fdp);
6853 	}
6854 
6855 	if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
6856 		/* found an existing knote: we can't add this one */
6857 		ret = ERESTART;
6858 		goto out_locked;
6859 	}
6860 
6861 	/* knote was not found: add it now */
6862 	if (!is_fd) {
6863 		if (fdp->fd_knhashmask == 0) {
6864 			u_long size = 0;
6865 
6866 			list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
6867 			if (list == NULL) {
6868 				ret = ENOMEM;
6869 				goto out_locked;
6870 			}
6871 
6872 			fdp->fd_knhash = list;
6873 			fdp->fd_knhashmask = size;
6874 		}
6875 
6876 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6877 		SLIST_INSERT_HEAD(list, kn, kn_link);
6878 		ret = 0;
6879 		goto out_locked;
6880 	} else {
6881 		/* knote is fd based */
6882 
6883 		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6884 			u_int size = 0;
6885 
6886 			/* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
6887 			if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
6888 				ret = EINVAL;
6889 				goto out_locked;
6890 			}
6891 			/* have to grow the fd_knlist */
6892 			size = fdp->fd_knlistsize;
6893 			while (size <= kn->kn_id) {
6894 				size += KQEXTENT;
6895 			}
6896 
6897 			if (size >= (UINT_MAX / sizeof(struct klist))) {
6898 				ret = EINVAL;
6899 				goto out_locked;
6900 			}
6901 
6902 			list = kalloc_type(struct klist, size, Z_WAITOK | Z_ZERO);
6903 			if (list == NULL) {
6904 				ret = ENOMEM;
6905 				goto out_locked;
6906 			}
6907 
6908 			bcopy(fdp->fd_knlist, list,
6909 			    fdp->fd_knlistsize * sizeof(struct klist));
6910 			kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
6911 			fdp->fd_knlist = list;
6912 			fdp->fd_knlistsize = size;
6913 		}
6914 
6915 		list = &fdp->fd_knlist[kn->kn_id];
6916 		SLIST_INSERT_HEAD(list, kn, kn_link);
6917 		ret = 0;
6918 		goto out_locked;
6919 	}
6920 
6921 out_locked:
6922 	if (ret == 0) {
6923 		kqlock(kq);
6924 		assert((kn->kn_status & KN_LOCKED) == 0);
6925 		(void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
6926 		kqueue_retain(kq); /* retain a kq ref */
6927 	}
6928 	if (is_fd) {
6929 		proc_fdunlock(p);
6930 	} else {
6931 		knhash_unlock(fdp);
6932 	}
6933 
6934 	return ret;
6935 }
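
/*
 * Worked example for the fd_knlist growth above (illustrative; the exact
 * value of KQEXTENT is an assumption here, e.g. 256):
 *
 *	fd_knlistsize = 0, first fd-based knote has kn_id = 700
 *	size: 0 -> 256 -> 512 -> 768   (grown in KQEXTENT increments until > 700)
 *
 * The new array of 768 klist heads is allocated, the old contents are copied
 * over, and the old array is freed, all under proc_fdlock().
 */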
6936 
6937 /*
6938  * kq_remove_knote - remove a knote from the fd table for process
6939  *
6940  * If the filter is file-based, remove based on fd index.
6941  * Otherwise remove from the hash based on the ident.
6942  *
6943  * fd_knhashlock and fdlock unheld on entry (and exit).
6944  */
6945 static void
6946 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
6947     struct knote_lock_ctx *knlc)
6948 {
6949 	struct filedesc *fdp = &p->p_fd;
6950 	struct klist *list = NULL;
6951 	uint16_t kq_state;
6952 	bool is_fd = kn->kn_is_fd;
6953 
6954 	if (is_fd) {
6955 		proc_fdlock(p);
6956 	} else {
6957 		knhash_lock(fdp);
6958 	}
6959 
6960 	if (is_fd) {
6961 		assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6962 		list = &fdp->fd_knlist[kn->kn_id];
6963 	} else {
6964 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6965 	}
6966 	SLIST_REMOVE(list, kn, knote, kn_link);
6967 
6968 	kqlock(kq);
6969 
6970 	/* Update the servicer iotier override */
6971 	kqueue_update_iotier_override(kq);
6972 
6973 	kq_state = kq->kq_state;
6974 	if (knlc) {
6975 		knote_unlock_cancel(kq, kn, knlc);
6976 	} else {
6977 		kqunlock(kq);
6978 	}
6979 	if (is_fd) {
6980 		proc_fdunlock(p);
6981 	} else {
6982 		knhash_unlock(fdp);
6983 	}
6984 
6985 	if (kq_state & KQ_DYNAMIC) {
6986 		kqworkloop_release((struct kqworkloop *)kq);
6987 	}
6988 }
6989 
6990 /*
6991  * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6992  * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6993  *
6994  * fd_knhashlock or fdlock unheld on entry (and exit)
6995  */
6996 
6997 static struct knote *
6998 kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
6999     bool is_fd, struct proc *p)
7000 {
7001 	struct filedesc *fdp = &p->p_fd;
7002 	struct knote *kn;
7003 
7004 	if (is_fd) {
7005 		proc_fdlock(p);
7006 	} else {
7007 		knhash_lock(fdp);
7008 	}
7009 
7010 	/*
7011 	 * Temporary horrible hack:
7012 	 * this cast is gross and will go away in a future change.
7013 	 * It is OK to do because we don't look at xflags/s_fflags,
7014 	 * and when we cast down the kev this way,
7015 	 * the truncated filter field still works.
7016 	 */
7017 	kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
7018 
7019 	if (kn) {
7020 		kqlock(kq);
7021 		assert(knote_get_kq(kn) == kq);
7022 	}
7023 
7024 	if (is_fd) {
7025 		proc_fdunlock(p);
7026 	} else {
7027 		knhash_unlock(fdp);
7028 	}
7029 
7030 	return kn;
7031 }
7032 
7033 static struct kqtailq *
7034 knote_get_tailq(kqueue_t kqu, struct knote *kn)
7035 {
7036 	kq_index_t qos_index = kn->kn_qos_index;
7037 
7038 	if (kqu.kq->kq_state & KQ_WORKLOOP) {
7039 		assert(qos_index > 0 && qos_index <= KQWL_NBUCKETS);
7040 		return &kqu.kqwl->kqwl_queue[qos_index - 1];
7041 	} else if (kqu.kq->kq_state & KQ_WORKQ) {
7042 		assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
7043 		return &kqu.kqwq->kqwq_queue[qos_index - 1];
7044 	} else {
7045 		assert(qos_index == QOS_INDEX_KQFILE);
7046 		return &kqu.kqf->kqf_queue;
7047 	}
7048 }
7049 
7050 static void
7051 knote_enqueue(kqueue_t kqu, struct knote *kn)
7052 {
7053 	kqlock_held(kqu);
7054 
7055 	if ((kn->kn_status & KN_ACTIVE) == 0) {
7056 		return;
7057 	}
7058 
7059 	if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING | KN_QUEUED)) {
7060 		return;
7061 	}
7062 
7063 	struct kqtailq *queue = knote_get_tailq(kqu, kn);
7064 	bool wakeup = TAILQ_EMPTY(queue);
7065 
7066 	TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
7067 	kn->kn_status |= KN_QUEUED;
7068 	kqu.kq->kq_count++;
7069 
7070 	if (wakeup) {
7071 		if (kqu.kq->kq_state & KQ_WORKLOOP) {
7072 			kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
7073 		} else if (kqu.kq->kq_state & KQ_WORKQ) {
7074 			kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
7075 		} else {
7076 			kqfile_wakeup(kqu.kqf, 0, THREAD_AWAKENED);
7077 		}
7078 	}
7079 }
7080 
7081 __attribute__((always_inline))
7082 static inline void
7083 knote_dequeue(kqueue_t kqu, struct knote *kn)
7084 {
7085 	if (kn->kn_status & KN_QUEUED) {
7086 		struct kqtailq *queue = knote_get_tailq(kqu, kn);
7087 
7088 		// attaching the knote calls knote_reset_priority() without
7089 		// the kqlock, which is fine, so we can only assert kqlock_held()
7090 		// once we know the knote is queued.
7091 		kqlock_held(kqu);
7092 
7093 		TAILQ_REMOVE(queue, kn, kn_tqe);
7094 		kn->kn_status &= ~KN_QUEUED;
7095 		kqu.kq->kq_count--;
7096 		if ((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
7097 			assert((kqu.kq->kq_count == 0) ==
7098 			    (bool)TAILQ_EMPTY(queue));
7099 		}
7100 	}
7101 }
7102 
7103 /* called with kqueue lock held */
7104 static void
7105 knote_suppress(kqueue_t kqu, struct knote *kn)
7106 {
7107 	struct kqtailq *suppressq;
7108 
7109 	kqlock_held(kqu);
7110 
7111 	assert((kn->kn_status & KN_SUPPRESSED) == 0);
7112 	assert(kn->kn_status & KN_QUEUED);
7113 
7114 	knote_dequeue(kqu, kn);
7115 	/* deactivate - so new activations indicate a wakeup */
7116 	kn->kn_status &= ~KN_ACTIVE;
7117 	kn->kn_status |= KN_SUPPRESSED;
7118 	suppressq = kqueue_get_suppressed_queue(kqu, kn);
7119 	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
7120 }
7121 
7122 __attribute__((always_inline))
7123 static inline void
7124 knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
7125 {
7126 	struct kqtailq *suppressq;
7127 
7128 	kqlock_held(kqu);
7129 
7130 	assert(kn->kn_status & KN_SUPPRESSED);
7131 
7132 	kn->kn_status &= ~KN_SUPPRESSED;
7133 	suppressq = kqueue_get_suppressed_queue(kqu, kn);
7134 	TAILQ_REMOVE(suppressq, kn, kn_tqe);
7135 
7136 	/*
7137 	 * If the knote is no longer active, reset its push,
7138 	 * and resynchronize kn_qos_index with kn_qos_override
7139 	 * for knotes with a real qos.
7140 	 */
7141 	if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
7142 		kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
7143 	}
7144 	kn->kn_qos_index = kn->kn_qos_override;
7145 }
7146 
7147 /* called with kqueue lock held */
7148 static void
7149 knote_unsuppress(kqueue_t kqu, struct knote *kn)
7150 {
7151 	knote_unsuppress_noqueue(kqu, kn);
7152 	knote_enqueue(kqu, kn);
7153 }
7154 
7155 __attribute__((always_inline))
7156 static inline void
7157 knote_mark_active(struct knote *kn)
7158 {
7159 	if ((kn->kn_status & KN_ACTIVE) == 0) {
7160 		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
7161 		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
7162 		    kn->kn_filtid);
7163 	}
7164 
7165 	kn->kn_status |= KN_ACTIVE;
7166 }
7167 
7168 /* called with kqueue lock held */
7169 static void
7170 knote_activate(kqueue_t kqu, struct knote *kn, int result)
7171 {
7172 	assert(result & FILTER_ACTIVE);
7173 	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
7174 		// may dequeue the knote
7175 		knote_adjust_qos(kqu.kq, kn, result);
7176 	}
7177 	knote_mark_active(kn);
7178 	knote_enqueue(kqu, kn);
7179 }
7180 
7181 /*
7182  * This function applies changes requested by f_attach or f_touch for
7183  * a given filter. It proceeds in a carefully chosen order so that
7184  * each transition does the minimal amount of work possible.
7185  */
7186 static void
7187 knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
7188     int result)
7189 {
7190 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
7191 		kn->kn_status &= ~KN_DISABLED;
7192 
7193 		/*
7194 		 * It is possible for userland to have knotes registered for a given
7195 		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
7196 		 *
7197 		 * In that case, rearming happens from the servicer thread of `wl_new`;
7198 		 * if `wl_orig` is no longer being serviced, this knote would stay
7199 		 * suppressed forever if we relied only on `wl_orig` calling
7200 		 * kqworkloop_acknowledge_events().
7201 		 *
7202 		 * However, if we see the KQ_PROCESSING bit set on `wl_orig`, we can't
7203 		 * unsuppress because that would mess with the processing phase of
7204 		 * `wl_orig`, but it also means kqworkloop_acknowledge_events()
7205 		 * will be called.
7206 		 */
7207 		if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
7208 			if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
7209 				knote_unsuppress_noqueue(kqu, kn);
7210 			}
7211 		}
7212 	}
7213 
7214 	if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
7215 		kqueue_update_iotier_override(kqu);
7216 	}
7217 
7218 	if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
7219 		// may dequeue the knote
7220 		knote_reset_priority(kqu, kn, kev->qos);
7221 	}
7222 
7223 	/*
7224 	 * When we unsuppress above, or because of knote_reset_priority(),
7225 	 * the knote may have been dequeued. Now that we're done applying
7226 	 * changes, we need to restore the invariant that the knote is
7227 	 * queued whenever it is active.
7228 	 */
7229 	if (result & FILTER_ACTIVE) {
7230 		knote_activate(kqu, kn, result);
7231 	} else {
7232 		knote_enqueue(kqu, kn);
7233 	}
7234 
7235 	if ((result & FILTER_THREADREQ_NODEFEER) &&
7236 	    act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
7237 		workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
7238 	}
7239 }
7240 
7241 /*
7242  * knote_drop - disconnect and drop the knote
7243  *
7244  * Called with the kqueue locked, returns with the kqueue unlocked.
7245  *
7246  * If a knote locking context is passed, it is canceled.
7247  *
7248  * The knote may have already been detached from
7249  * (or not yet attached to) its source object.
7250  */
7251 static void
7252 knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
7253 {
7254 	struct proc *p = kq->kq_p;
7255 
7256 	kqlock_held(kq);
7257 
7258 	assert((kn->kn_status & KN_DROPPING) == 0);
7259 	if (knlc == NULL) {
7260 		assert((kn->kn_status & KN_LOCKED) == 0);
7261 	}
7262 	kn->kn_status |= KN_DROPPING;
7263 
7264 	if (kn->kn_status & KN_SUPPRESSED) {
7265 		knote_unsuppress_noqueue(kq, kn);
7266 	} else {
7267 		knote_dequeue(kq, kn);
7268 	}
7269 	knote_wait_for_post(kq, kn);
7270 
7271 	/* Even if we are autodetached, the filter may need to clean up any
7272 	 * state stashed on the knote, so always make the call and let each
7273 	 * filter handle the possibility of being autodetached. */
7274 	knote_fops(kn)->f_detach(kn);
7275 
7276 	/* kq may be freed when kq_remove_knote() returns */
7277 	kq_remove_knote(kq, kn, p, knlc);
7278 	if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
7279 		fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
7280 	}
7281 
7282 	knote_free(kn);
7283 }
7284 
7285 void
7286 knote_init(void)
7287 {
7288 #if CONFIG_MEMORYSTATUS
7289 	/* Initialize the memorystatus list lock */
7290 	memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
7291 #endif
7292 }
7293 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
7294 
7295 const struct filterops *
7296 knote_fops(struct knote *kn)
7297 {
7298 	return sysfilt_ops[kn->kn_filtid];
7299 }
7300 
7301 static struct knote *
7302 knote_alloc(void)
7303 {
7304 	return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
7305 }
7306 
7307 static void
7308 knote_free(struct knote *kn)
7309 {
7310 	assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
7311 	zfree(knote_zone, kn);
7312 }
7313 
7314 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
7315 
7316 kevent_ctx_t
7317 kevent_get_context(thread_t thread)
7318 {
7319 	uthread_t ut = get_bsdthread_info(thread);
7320 	return &ut->uu_save.uus_kevent;
7321 }
7322 
7323 static inline bool
7324 kevent_args_requesting_events(unsigned int flags, int nevents)
7325 {
7326 	return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
7327 }
7328 
7329 static inline int
7330 kevent_adjust_flags_for_proc(proc_t p, int flags)
7331 {
7332 	__builtin_assume(p);
7333 	return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
7334 }
7335 
7336 /*!
7337  * @function kevent_get_kqfile
7338  *
7339  * @brief
7340  * Lookup a kqfile by fd.
7341  *
7342  * @discussion
7343  * Callers: kevent, kevent64, kevent_qos
7344  *
7345  * This is not assumed to be a fastpath (kqfile interfaces are legacy)
7346  */
7347 OS_NOINLINE
7348 static int
7349 kevent_get_kqfile(struct proc *p, int fd, int flags,
7350     struct fileproc **fpp, struct kqueue **kqp)
7351 {
7352 	int error = 0;
7353 	struct kqueue *kq;
7354 
7355 	error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
7356 	if (__improbable(error)) {
7357 		return error;
7358 	}
7359 	kq = (struct kqueue *)fp_get_data((*fpp));
7360 
7361 	uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
7362 	if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
7363 		kqlock(kq);
7364 		kq_state = kq->kq_state;
7365 		if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
7366 			if (flags & KEVENT_FLAG_LEGACY32) {
7367 				kq_state |= KQ_KEV32;
7368 			} else if (flags & KEVENT_FLAG_LEGACY64) {
7369 				kq_state |= KQ_KEV64;
7370 			} else {
7371 				kq_state |= KQ_KEV_QOS;
7372 			}
7373 			kq->kq_state = kq_state;
7374 		}
7375 		kqunlock(kq);
7376 	}
7377 
7378 	/*
7379 	 * kqfiles can't be used through the legacy kevent()
7380 	 * and other interfaces at the same time.
7381 	 */
7382 	if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
7383 	    (bool)(kq_state & KQ_KEV32))) {
7384 		fp_drop(p, fd, *fpp, 0);
7385 		return EINVAL;
7386 	}
7387 
7388 	*kqp = kq;
7389 	return 0;
7390 }
7391 
7392 /*!
7393  * @function kevent_get_kqwq
7394  *
7395  * @brief
7396  * Lookup or create the process kqwq (fastpath).
7397  *
7398  * @discussion
7399  * Callers: kevent64, kevent_qos
7400  */
7401 OS_ALWAYS_INLINE
7402 static int
7403 kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
7404 {
7405 	struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;
7406 
7407 	if (__improbable(kevent_args_requesting_events(flags, nevents))) {
7408 		return EINVAL;
7409 	}
7410 	if (__improbable(kqwq == NULL)) {
7411 		kqwq = kqworkq_alloc(p, flags);
7412 		if (__improbable(kqwq == NULL)) {
7413 			return ENOMEM;
7414 		}
7415 	}
7416 
7417 	*kqp = &kqwq->kqwq_kqueue;
7418 	return 0;
7419 }
7420 
7421 #pragma mark kevent copyio
7422 
7423 /*!
7424  * @function kevent_get_data_size
7425  *
7426  * @brief
7427  * Copies in the extra data size from user-space.
7428  */
7429 static int
7430 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
7431     kevent_ctx_t kectx)
7432 {
7433 	if (!data_avail || !data_out) {
7434 		kectx->kec_data_size  = 0;
7435 		kectx->kec_data_resid = 0;
7436 	} else if (flags & KEVENT_FLAG_PROC64) {
7437 		user64_size_t usize = 0;
7438 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7439 		if (__improbable(error)) {
7440 			return error;
7441 		}
7442 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7443 	} else {
7444 		user32_size_t usize = 0;
7445 		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7446 		if (__improbable(error)) {
7447 			return error;
7448 		}
7449 		kectx->kec_data_avail = data_avail;
7450 		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7451 	}
7452 	kectx->kec_data_out   = data_out;
7453 	kectx->kec_data_avail = data_avail;
7454 	return 0;
7455 }
7456 
7457 /*!
7458  * @function kevent_put_data_size
7459  *
7460  * @brief
7461  * Copies out the residual data size to user-space if any has been used.
7462  */
7463 static int
7464 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
7465 {
7466 	if (kectx->kec_data_resid == kectx->kec_data_size) {
7467 		return 0;
7468 	}
7469 	if (flags & KEVENT_FLAG_KERNEL) {
7470 		*(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
7471 		return 0;
7472 	}
7473 	if (flags & KEVENT_FLAG_PROC64) {
7474 		user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
7475 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7476 	} else {
7477 		user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
7478 		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7479 	}
7480 }
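
/*
 * Illustrative note on the data-size round trip above (the userspace
 * contract described here is a hedged reading of this code, not a spec):
 * data_available holds the size of the extra data buffer on input,
 * kevent_get_data_size() copies it in, filters consume it through
 * kec_data_resid, and kevent_put_data_size() writes the residual size back
 * out, so after the call:
 *
 *	used = size_in - *data_available;   // bytes consumed by filters
 *
 * Nothing is written back when no data was consumed (resid == size).
 */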
7481 
7482 /*!
7483  * @function kevent_legacy_copyin
7484  *
7485  * @brief
7486  * Handles the copyin of a kevent/kevent64 event.
7487  */
7488 static int
7489 kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
7490 {
7491 	int error;
7492 
7493 	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7494 
7495 	if (flags & KEVENT_FLAG_LEGACY64) {
7496 		struct kevent64_s kev64;
7497 
7498 		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7499 		if (__improbable(error)) {
7500 			return error;
7501 		}
7502 		*addrp += sizeof(kev64);
7503 		*kevp = (struct kevent_qos_s){
7504 			.ident  = kev64.ident,
7505 			.filter = kev64.filter,
7506 			/* Make sure user doesn't pass in any system flags */
7507 			.flags  = kev64.flags & ~EV_SYSFLAGS,
7508 			.udata  = kev64.udata,
7509 			.fflags = kev64.fflags,
7510 			.data   = kev64.data,
7511 			.ext[0] = kev64.ext[0],
7512 			.ext[1] = kev64.ext[1],
7513 		};
7514 	} else if (flags & KEVENT_FLAG_PROC64) {
7515 		struct user64_kevent kev64;
7516 
7517 		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7518 		if (__improbable(error)) {
7519 			return error;
7520 		}
7521 		*addrp += sizeof(kev64);
7522 		*kevp = (struct kevent_qos_s){
7523 			.ident  = kev64.ident,
7524 			.filter = kev64.filter,
7525 			/* Make sure user doesn't pass in any system flags */
7526 			.flags  = kev64.flags & ~EV_SYSFLAGS,
7527 			.udata  = kev64.udata,
7528 			.fflags = kev64.fflags,
7529 			.data   = kev64.data,
7530 		};
7531 	} else {
7532 		struct user32_kevent kev32;
7533 
7534 		error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
7535 		if (__improbable(error)) {
7536 			return error;
7537 		}
7538 		*addrp += sizeof(kev32);
7539 		*kevp = (struct kevent_qos_s){
7540 			.ident  = (uintptr_t)kev32.ident,
7541 			.filter = kev32.filter,
7542 			/* Make sure user doesn't pass in any system flags */
7543 			.flags  = kev32.flags & ~EV_SYSFLAGS,
7544 			.udata  = CAST_USER_ADDR_T(kev32.udata),
7545 			.fflags = kev32.fflags,
7546 			.data   = (intptr_t)kev32.data,
7547 		};
7548 	}
7549 
7550 	return 0;
7551 }
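
/*
 * Illustrative sketch (not compiled) of the userspace side of the
 * KEVENT_FLAG_LEGACY64 branch above, assuming the kevent64(2)/EV_SET64
 * declarations from <sys/event.h>. The caller fills a struct kevent64_s and
 * the copyin above widens it into the internal struct kevent_qos_s,
 * stripping EV_SYSFLAGS.
 *
 *	struct kevent64_s kev;
 *	EV_SET64(&kev, fd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, 0, 0, 0);
 *	int n = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
 */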
7552 
7553 /*!
7554  * @function kevent_modern_copyin
7555  *
7556  * @brief
7557  * Handles the copyin of a kevent_qos/kevent_id event.
7558  */
7559 static int
7560 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
7561 {
7562 	int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
7563 	if (__probable(!error)) {
7564 		/* Make sure user doesn't pass in any system flags */
7565 		*addrp += sizeof(struct kevent_qos_s);
7566 		kevp->flags &= ~EV_SYSFLAGS;
7567 	}
7568 	return error;
7569 }
7570 
7571 /*!
7572  * @function kevent_legacy_copyout
7573  *
7574  * @brief
7575  * Handles the copyout of a kevent/kevent64 event.
7576  */
7577 static int
7578 kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
7579 {
7580 	int advance;
7581 	int error;
7582 
7583 	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7584 
7585 	/*
7586 	 * fully initialize the different output event structure
7587 	 * types from the internal kevent (and some universal
7588 	 * defaults for fields not represented in the internal
7589 	 * form).
7590 	 *
7591 	 * Note: these structures have no padding hence the C99
7592 	 *       initializers below do not leak kernel info.
7593 	 */
7594 	if (flags & KEVENT_FLAG_LEGACY64) {
7595 		struct kevent64_s kev64 = {
7596 			.ident  = kevp->ident,
7597 			.filter = kevp->filter,
7598 			.flags  = kevp->flags,
7599 			.fflags = kevp->fflags,
7600 			.data   = (int64_t)kevp->data,
7601 			.udata  = kevp->udata,
7602 			.ext[0] = kevp->ext[0],
7603 			.ext[1] = kevp->ext[1],
7604 		};
7605 		advance = sizeof(struct kevent64_s);
7606 		error = copyout((caddr_t)&kev64, *addrp, advance);
7607 	} else if (flags & KEVENT_FLAG_PROC64) {
7608 		/*
7609 		 * deal with the special case of a user-supplied
7610 		 * value of (uintptr_t)-1.
7611 		 */
7612 		uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
7613 		    (uint64_t)-1LL : (uint64_t)kevp->ident;
7614 		struct user64_kevent kev64 = {
7615 			.ident  = ident,
7616 			.filter = kevp->filter,
7617 			.flags  = kevp->flags,
7618 			.fflags = kevp->fflags,
7619 			.data   = (int64_t) kevp->data,
7620 			.udata  = (user_addr_t) kevp->udata,
7621 		};
7622 		advance = sizeof(kev64);
7623 		error = copyout((caddr_t)&kev64, *addrp, advance);
7624 	} else {
7625 		struct user32_kevent kev32 = {
7626 			.ident  = (uint32_t)kevp->ident,
7627 			.filter = kevp->filter,
7628 			.flags  = kevp->flags,
7629 			.fflags = kevp->fflags,
7630 			.data   = (int32_t)kevp->data,
7631 			.udata  = (uint32_t)kevp->udata,
7632 		};
7633 		advance = sizeof(kev32);
7634 		error = copyout((caddr_t)&kev32, *addrp, advance);
7635 	}
7636 	if (__probable(!error)) {
7637 		*addrp += advance;
7638 	}
7639 	return error;
7640 }
7641 
7642 /*!
7643  * @function kevent_modern_copyout
7644  *
7645  * @brief
7646  * Handles the copyout of a kevent_qos/kevent_id event.
7647  */
7648 OS_ALWAYS_INLINE
7649 static inline int
7650 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7651 {
7652 	int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7653 	if (__probable(!error)) {
7654 		*addrp += sizeof(struct kevent_qos_s);
7655 	}
7656 	return error;
7657 }
7658 
7659 #pragma mark kevent core implementation
7660 
7661 /*!
7662  * @function kevent_callback_inline
7663  *
7664  * @brief
7665  * Callback for each individual event
7666  *
7667  * @discussion
7668  * This is meant to be inlined in kevent_modern_callback and
7669  * kevent_legacy_callback.
7670  */
7671 OS_ALWAYS_INLINE
7672 static inline int
7673 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7674 {
7675 	int error;
7676 
7677 	assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7678 
7679 	/*
7680 	 * Copy out the appropriate amount of event data for this user.
7681 	 */
7682 	if (legacy) {
7683 		error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7684 		    kectx->kec_process_flags);
7685 	} else {
7686 		error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7687 	}
7688 
7689 	/*
7690 	 * If there isn't space for additional events, return
7691 	 * a harmless error to stop the processing here
7692 	 */
7693 	if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7694 		error = EWOULDBLOCK;
7695 	}
7696 	return error;
7697 }
7698 
7699 /*!
7700  * @function kevent_modern_callback
7701  *
7702  * @brief
7703  * Callback for each individual modern event.
7704  *
7705  * @discussion
7706  * This callback handles kevent_qos/kevent_id events.
7707  */
7708 static int
7709 kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7710 {
7711 	return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
7712 }
7713 
7714 /*!
7715  * @function kevent_legacy_callback
7716  *
7717  * @brief
7718  * Callback for each individual legacy event.
7719  *
7720  * @discussion
7721  * This callback handles kevent/kevent64 events.
7722  */
7723 static int
7724 kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7725 {
7726 	return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
7727 }
7728 
7729 /*!
7730  * @function kevent_cleanup
7731  *
7732  * @brief
7733  * Handles the cleanup returning from a kevent call.
7734  *
7735  * @discussion
7736  * kevent entry points will take a reference on workloops,
7737  * and a usecount on the fileglob of kqfiles.
7738  *
7739  * This function undoes this on the exit paths of kevents.
7740  *
7741  * @returns
7742  * The error to return to userspace.
7743  */
7744 static int
7745 kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
7746 {
7747 	// poll should not call any codepath leading to this
7748 	assert((flags & KEVENT_FLAG_POLL) == 0);
7749 
7750 	if (flags & KEVENT_FLAG_WORKLOOP) {
7751 		kqworkloop_release(kqu.kqwl);
7752 	} else if (flags & KEVENT_FLAG_WORKQ) {
7753 		/* nothing held */
7754 	} else {
7755 		fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
7756 	}
7757 
7758 	/* don't restart after signals... */
7759 	if (error == ERESTART) {
7760 		error = EINTR;
7761 	} else if (error == 0) {
7762 		/* don't abandon other output just because of residual copyout failures */
7763 		(void)kevent_put_data_size(flags, kectx);
7764 	}
7765 
7766 	if (flags & KEVENT_FLAG_PARKING) {
7767 		thread_t th = current_thread();
7768 		struct uthread *uth = get_bsdthread_info(th);
7769 		workq_threadreq_t kqr = uth->uu_kqr_bound;
7770 		if (kqr && !(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND)) {
7771 			thread_unfreeze_base_pri(th);
7772 		}
7773 	}
7774 	return error;
7775 }
7776 
7777 /*!
7778  * @function kqueue_process
7779  *
7780  * @brief
7781  * Process the triggered events in a kqueue.
7782  *
7783  * @discussion
7784  * Walk the queued knotes and validate that they are really still triggered
7785  * events by calling the filter routines (if necessary).
7786  *
7787  * For each event that is still considered triggered, invoke the callback
7788  * routine provided.
7789  *
7790  * caller holds a reference on the kqueue.
7791  * kqueue locked on entry and exit - but may be dropped
7792  * kqueue list locked (held for duration of call)
7793  *
7794  * This is only called by kqueue_scan() so that the compiler can inline it.
7795  *
7796  * For kqworkloops that are permanently configured with a bound thread, this
7797  * function parks the bound thread (instead of returning) if there are no events
7798  * or errors to be returned and KEVENT_FLAG_PARKING was specified.
7799  *
7800  * @returns
7801  * - 0:            no event was returned, no other error occurred
7802  * - EBADF:        the kqueue is being destroyed (KQ_DRAIN is set)
7803  * - EWOULDBLOCK:  (not an error) events have been found and we should return
7804  * - EFAULT:       copyout failed
7805  * - filter specific errors
7806  */
7807 static int
7808 kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7809     kevent_callback_t callback)
7810 {
7811 	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
7812 	struct knote *kn;
7813 	int error = 0, rc = 0;
7814 	struct kqtailq *base_queue, *queue;
7815 	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
7816 	bool kqwl_permanently_bound = false;
7817 
7818 	if (kq_type & KQ_WORKQ) {
7819 		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
7820 	} else if (kq_type & KQ_WORKLOOP) {
7821 		kqwl_permanently_bound = kqr_thread_permanently_bound(kqr);
7822 		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
7823 	} else {
7824 kqfile_retry:
7825 		rc = kqfile_begin_processing(kqu.kqf);
7826 		if (rc == EBADF) {
7827 			return EBADF;
7828 		}
7829 	}
7830 
7831 	if (rc == -1) {
7832 		/* Nothing to process */
7833 		if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
7834 		    kqwl_permanently_bound) {
7835 			goto kqwl_bound_thread_park;
7836 		}
7837 		return 0;
7838 	}
7839 
7840 	/*
7841 	 * loop through the enqueued knotes associated with this request,
7842 	 * processing each one. Each request may have several queues
7843 	 * of knotes to process (depending on the type of kqueue) so we
7844 	 * have to loop through all the queues as long as we have additional
7845 	 * space.
7846 	 */
7847 
7848 process_again:
7849 	if (kq_type & KQ_WORKQ) {
7850 		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
7851 	} else if (kq_type & KQ_WORKLOOP) {
7852 		base_queue = &kqu.kqwl->kqwl_queue[0];
7853 		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
7854 	} else {
7855 		base_queue = queue = &kqu.kqf->kqf_queue;
7856 	}
7857 
7858 	do {
7859 		while ((kn = TAILQ_FIRST(queue)) != NULL) {
7860 			error = knote_process(kn, kectx, callback);
7861 			if (error == EJUSTRETURN) {
7862 				error = 0;
7863 			} else if (__improbable(error)) {
7864 				/* error is EWOULDBLOCK when the out event array is full */
7865 				goto stop_processing;
7866 			}
7867 		}
7868 	} while (queue-- > base_queue);
7869 
7870 	if (kectx->kec_process_noutputs) {
7871 		/* callers will transform this into no error */
7872 		error = EWOULDBLOCK;
7873 	}
7874 
7875 stop_processing:
7876 	/*
7877 	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
7878 	 * we want to unbind the kqrequest from the thread.
7879 	 *
7880 	 * However, because the kq locks are dropped several times during processing,
7881 	 * new knotes may have fired again, in which case, we want to fail the end
7882 	 * processing and process again, until it converges.
7883 	 *
7884 	 * If we have an error or returned events, end processing never fails.
7885 	 */
7886 	if (error) {
7887 		flags &= ~KEVENT_FLAG_PARKING;
7888 	}
7889 	if (kq_type & KQ_WORKQ) {
7890 		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
7891 	} else if (kq_type & KQ_WORKLOOP) {
7892 		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
7893 	} else {
7894 		rc = kqfile_end_processing(kqu.kqf);
7895 	}
7896 
7897 	if (__probable(error)) {
7898 		return error;
7899 	}
7900 
7901 	if (__probable(rc >= 0)) {
7902 		assert(rc == 0 || rc == EBADF);
7903 		if (rc == 0) {
7904 			if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
7905 			    kqwl_permanently_bound) {
7906 				goto kqwl_bound_thread_park;
7907 			}
7908 		}
7909 		return rc;
7910 	}
7911 
7912 	if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
7913 		assert(flags & KEVENT_FLAG_PARKING);
7914 		goto process_again;
7915 	} else {
7916 		goto kqfile_retry;
7917 	}
7918 
7919 kqwl_bound_thread_park:
7920 #if DEVELOPMENT | DEBUG
7921 	assert(current_thread() == kqr_thread_fast(kqr));
7922 	assert(workq_thread_is_permanently_bound(current_uthread()));
7923 #endif
7924 	kqworkloop_bound_thread_park(kqu.kqwl, kqr_thread_fast(kqr));
7925 	__builtin_unreachable();
7926 }
7927 
7928 /*!
7929  * @function kqueue_scan_continue
7930  *
7931  * @brief
7932  * The continuation used by kqueue_scan for kevent entry points.
7933  *
7934  * @discussion
7935  * Assumes we inherit a use/ref count on the kq or its fileglob.
7936  *
7937  * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7938  * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7939  */
7940 OS_NORETURN OS_NOINLINE
7941 static void
7942 kqueue_scan_continue(void *data, wait_result_t wait_result)
7943 {
7944 	uthread_t ut = current_uthread();
7945 	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
7946 	int error = 0, flags = kectx->kec_process_flags;
7947 	struct kqueue *kq = data;
7948 
7949 	/*
7950 	 * only kevent variants call in here, so we know the callback is
7951 	 * kevent_legacy_callback or kevent_modern_callback.
7952 	 */
7953 	assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);
7954 
7955 	switch (wait_result) {
7956 	case THREAD_AWAKENED:
7957 		if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
7958 			error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
7959 		} else {
7960 			error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
7961 		}
7962 		break;
7963 	case THREAD_TIMED_OUT:
7964 		error = 0;
7965 		break;
7966 	case THREAD_INTERRUPTED:
7967 		error = EINTR;
7968 		break;
7969 	case THREAD_RESTART:
7970 		error = EBADF;
7971 		break;
7972 	default:
7973 		panic("%s: - invalid wait_result (%d)", __func__, wait_result);
7974 	}
7975 
7976 
7977 	error = kevent_cleanup(kq, flags, error, kectx);
7978 	*(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
7979 	unix_syscall_return(error);
7980 }
7981 
7982 /*!
7983  * @function kqueue_scan
7984  *
7985  * @brief
7986  * Scan and wait for events in a kqueue (used by poll & kevent).
7987  *
7988  * @discussion
7989  * Process the triggered events in a kqueue.
7990  *
7991  * If there are no events triggered arrange to wait for them:
7992  * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7993  * - possibly until kectx->kec_deadline expires
7994  *
7995  * When it waits, and neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
7996  * is set, it will wait in the kqueue_scan_continue continuation.
7997  *
7998  * poll() will block in place, and KEVENT_FLAG_KERNEL calls
7999  * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
8000  *
8001  * @param kqu
8002  * The kqueue being scanned.
8003  *
8004  * @param flags
8005  * The KEVENT_FLAG_* flags for this call.
8006  *
8007  * @param kectx
8008  * The context used for this scan.
8009  * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
8010  *
8011  * @param callback
8012  * The callback to be called on events successfully processed.
8013  * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
8014  */
8015 int
8016 kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
8017     kevent_callback_t callback)
8018 {
8019 	int error;
8020 
8021 	for (;;) {
8022 		kqlock(kqu);
8023 		error = kqueue_process(kqu, flags, kectx, callback);
8024 
8025 		/*
8026 		 * If we got an error, events returned (EWOULDBLOCK)
8027 		 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
8028 		 * just return.
8029 		 */
8030 		if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
8031 			kqunlock(kqu);
8032 			return error == EWOULDBLOCK ? 0 : error;
8033 		}
8034 
8035 		assert((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
8036 
8037 		kqu.kqf->kqf_state |= KQ_SLEEP;
8038 		assert_wait_deadline(&kqu.kqf->kqf_count, THREAD_ABORTSAFE,
8039 		    kectx->kec_deadline);
8040 		kqunlock(kqu);
8041 
8042 		if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
8043 			thread_block_parameter(kqueue_scan_continue, kqu.kqf);
8044 			__builtin_unreachable();
8045 		}
8046 
8047 		wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
8048 		switch (wr) {
8049 		case THREAD_AWAKENED:
8050 			break;
8051 		case THREAD_TIMED_OUT:
8052 			return 0;
8053 		case THREAD_INTERRUPTED:
8054 			return EINTR;
8055 		case THREAD_RESTART:
8056 			return EBADF;
8057 		default:
8058 			panic("%s: - bad wait_result (%d)", __func__, wr);
8059 		}
8060 	}
8061 }
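
/*
 * Control-flow summary of kqueue_scan() above (descriptive only):
 *
 *	for (;;) {
 *		kqlock(); kqueue_process();
 *		if (events, error, or KEVENT_FLAG_IMMEDIATE)
 *			return;                        // EWOULDBLOCK maps to 0
 *		mark KQ_SLEEP; assert_wait_deadline(&kqf_count, kec_deadline);
 *		kqunlock();
 *		if (plain kevent variant)
 *			thread_block_parameter(kqueue_scan_continue);  // never returns
 *		else
 *			thread_block();                // poll/kernel callers block in place
 *	}
 */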
8062 
8063 /*!
8064  * @function kevent_internal
8065  *
8066  * @brief
8067  * Common kevent code.
8068  *
8069  * @discussion
8070  * Needs to be inlined to specialize for legacy or modern and
8071  * eliminate dead code.
8072  *
8073  * This is the core logic of kevent entry points, that will:
8074  * - register kevents
8075  * - optionally scan the kqueue for events
8076  *
8077  * The caller is giving kevent_internal a reference on the kqueue
8078  * or its fileproc that needs to be cleaned up by kevent_cleanup().
8079  */
8080 OS_ALWAYS_INLINE
8081 static inline int
8082 kevent_internal(kqueue_t kqu,
8083     user_addr_t changelist, int nchanges,
8084     user_addr_t ueventlist, int nevents,
8085     int flags, kevent_ctx_t kectx, int32_t *retval,
8086     bool legacy)
8087 {
8088 	int error = 0, noutputs = 0, register_rc;
8089 
8090 	/* only bound threads can receive events on workloops */
8091 	if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
8092 #if CONFIG_WORKLOOP_DEBUG
8093 		UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
8094 			.uu_kqid = kqu.kqwl->kqwl_dynamicid,
8095 			.uu_kq = error ? NULL : kqu.kq,
8096 			.uu_error = error,
8097 			.uu_nchanges = nchanges,
8098 			.uu_nevents = nevents,
8099 			.uu_flags = flags,
8100 		});
8101 #endif // CONFIG_WORKLOOP_DEBUG
8102 
8103 		if (flags & KEVENT_FLAG_KERNEL) {
8104 			/* see kevent_workq_internal */
8105 			error = copyout(&kqu.kqwl->kqwl_dynamicid,
8106 			    ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
8107 			kectx->kec_data_resid -= sizeof(kqueue_id_t);
8108 			if (__improbable(error)) {
8109 				goto out;
8110 			}
8111 		}
8112 
8113 		if (kevent_args_requesting_events(flags, nevents)) {
8114 			/*
8115 			 * Disable the R2K notification while doing a register: if the
8116 			 * caller wants events too, we don't want the AST to be set since
8117 			 * we will process these events soon.
8118 			 */
8119 			kqlock(kqu);
8120 			kqu.kq->kq_state &= ~KQ_R2K_ARMED;
8121 			kqunlock(kqu);
8122 			flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
8123 		}
8124 	}
8125 
8126 	/* register all the change requests the user provided... */
8127 	while (nchanges > 0 && error == 0) {
8128 		struct kevent_qos_s kev;
8129 		struct knote *kn = NULL;
8130 
8131 		if (legacy) {
8132 			error = kevent_legacy_copyin(&changelist, &kev, flags);
8133 		} else {
8134 			error = kevent_modern_copyin(&changelist, &kev);
8135 		}
8136 		if (error) {
8137 			break;
8138 		}
8139 
8140 		register_rc = kevent_register(kqu.kq, &kev, &kn);
8141 		if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
8142 			thread_t thread = current_thread();
8143 
8144 			kqlock_held(kqu);
8145 
8146 			if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
8147 				workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
8148 			}
8149 
8150 			// f_post_register_wait is meant to call a continuation and not to
8151 			// return, which is why we don't support FILTER_REGISTER_WAIT if
8152 			// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
8153 			// waits isn't the last.
8154 			//
8155 			// It is implementable, but not used by any userspace code at the
8156 			// moment, so for now return ENOTSUP if someone tries to do it.
8157 			if (nchanges == 1 && noutputs < nevents &&
8158 			    (flags & KEVENT_FLAG_KERNEL) == 0 &&
8159 			    (flags & KEVENT_FLAG_PARKING) == 0 &&
8160 			    (flags & KEVENT_FLAG_ERROR_EVENTS) &&
8161 			    (flags & KEVENT_FLAG_WORKLOOP)) {
8162 				uthread_t ut = get_bsdthread_info(thread);
8163 
8164 				/*
8165 				 * store the continuation/completion data in the uthread
8166 				 *
8167 				 * Note: the kectx aliases with this,
8168 				 * and is destroyed in the process.
8169 				 */
8170 				ut->uu_save.uus_kevent_register = (struct _kevent_register){
8171 					.kev        = kev,
8172 					.kqwl       = kqu.kqwl,
8173 					.eventout   = noutputs,
8174 					.ueventlist = ueventlist,
8175 				};
8176 				knote_fops(kn)->f_post_register_wait(ut, kn,
8177 				    &ut->uu_save.uus_kevent_register);
8178 				__builtin_unreachable();
8179 			}
8180 			kqunlock(kqu);
8181 
8182 			kev.flags |= EV_ERROR;
8183 			kev.data = ENOTSUP;
8184 		} else {
8185 			assert((register_rc & FILTER_REGISTER_WAIT) == 0);
8186 		}
8187 
8188 		// keep in sync with kevent_register_wait_return()
8189 		if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
8190 			if ((kev.flags & EV_ERROR) == 0) {
8191 				kev.flags |= EV_ERROR;
8192 				kev.data = 0;
8193 			}
8194 			if (legacy) {
8195 				error = kevent_legacy_copyout(&kev, &ueventlist, flags);
8196 			} else {
8197 				error = kevent_modern_copyout(&kev, &ueventlist);
8198 			}
8199 			if (error == 0) {
8200 				noutputs++;
8201 			}
8202 		} else if (kev.flags & EV_ERROR) {
8203 			error = (int)kev.data;
8204 		}
8205 		nchanges--;
8206 	}
8207 
8208 	if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
8209 	    nevents > 0 && noutputs == 0 && error == 0) {
8210 		kectx->kec_process_flags = flags;
8211 		kectx->kec_process_nevents = nevents;
8212 		kectx->kec_process_noutputs = 0;
8213 		kectx->kec_process_eventlist = ueventlist;
8214 
8215 		if (legacy) {
8216 			error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
8217 		} else {
8218 			error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
8219 		}
8220 
8221 		noutputs = kectx->kec_process_noutputs;
8222 	} else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
8223 		/*
8224 		 * If we didn't go through kqworkloop_end_processing(),
8225 		 * we need to do it here.
8226 		 *
8227 		 * kqueue_scan will call kqworkloop_end_processing(),
8228 		 * so we only need to do it if we didn't scan.
8229 		 */
8230 		kqlock(kqu);
8231 		kqworkloop_end_processing(kqu.kqwl, 0, 0);
8232 		kqunlock(kqu);
8233 	}
8234 
8235 	*retval = noutputs;
8236 out:
8237 	return kevent_cleanup(kqu.kq, flags, error, kectx);
8238 }
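
/*
 * Illustrative note on the error-reporting path above: with EV_RECEIPT (or
 * KEVENT_FLAG_ERROR_EVENTS) each change is reflected back into the output
 * event list as a kevent carrying EV_ERROR, with data == 0 on success or the
 * errno for that registration. Hedged userspace sketch (not compiled):
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, udata);
 *	int n = kevent(kq, &kev, 1, &out, 1, NULL);
 *	// n == 1; out.flags has EV_ERROR set; out.data is 0 on success or an
 *	// errno such as EBADF if the registration failed.
 */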
8239 
8240 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
8241 
8242 /*!
8243  * @function kevent_modern_internal
8244  *
8245  * @brief
8246  * The backend of the kevent_id and kevent_workq_internal entry points.
8247  *
8248  * @discussion
8249  * Needs to be noinline due to the number of arguments.
8250  */
8251 OS_NOINLINE
8252 static int
8253 kevent_modern_internal(kqueue_t kqu,
8254     user_addr_t changelist, int nchanges,
8255     user_addr_t ueventlist, int nevents,
8256     int flags, kevent_ctx_t kectx, int32_t *retval)
8257 {
8258 	return kevent_internal(kqu.kq, changelist, nchanges,
8259 	           ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
8260 }
8261 
8262 /*!
8263  * @function kevent_id
8264  *
8265  * @brief
8266  * The kevent_id() syscall.
8267  */
8268 int
8269 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
8270 {
8271 	int error, flags = uap->flags & KEVENT_FLAG_USER;
8272 	uthread_t uth = current_uthread();
8273 	workq_threadreq_t kqr = uth->uu_kqr_bound;
8274 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8275 	kqueue_t kqu;
8276 
8277 	flags = kevent_adjust_flags_for_proc(p, flags);
8278 	flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;
8279 
8280 	if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
8281 	    KEVENT_FLAG_WORKLOOP)) {
8282 		return EINVAL;
8283 	}
8284 
8285 	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
8286 	if (__improbable(error)) {
8287 		return error;
8288 	}
8289 
8290 	kectx->kec_deadline = 0;
8291 	kectx->kec_fp       = NULL;
8292 	kectx->kec_fd       = -1;
8293 	/* the kec_process_* fields are filled only if kqueue_scan is called */
8294 
8295 	/*
8296 	 * Get the kq we are going to be working on
8297 	 * As a fastpath, look at the currently bound workloop.
8298 	 */
8299 	kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
8300 	if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
8301 		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
8302 			return EEXIST;
8303 		}
8304 		kqworkloop_retain(kqu.kqwl);
8305 	} else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
8306 		return EXDEV;
8307 	} else {
8308 		error = kqworkloop_get_or_create(p, uap->id, NULL, NULL,
8309 		    flags, &kqu.kqwl);
8310 		if (__improbable(error)) {
8311 			return error;
8312 		}
8313 	}
8314 
8315 	return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
8316 	           uap->eventlist, uap->nevents, flags, kectx, retval);
8317 }
8318 
8319 /*!
8320  * @function kevent_workq_internal
8321  *
8322  * @discussion
8323  * This function is exported for the sake of the workqueue subsystem.
8324  *
8325  * It is called in two ways:
8326  * - when a thread is about to go to userspace to ask for pending events
8327  * - when a thread is returning from userspace with events back
8328  *
8329  * the workqueue subsystem will only use the following flags:
8330  * - KEVENT_FLAG_STACK_DATA (always)
8331  * - KEVENT_FLAG_IMMEDIATE (always)
8332  * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
8333  *   userspace).
8334  *
8335  * It implicitly acts on the bound kqueue, and for the case of workloops
8336  * will copyout the kqueue ID before anything else.
8337  *
8338  *
8339  * Pthread will have set up the various arguments to fit this stack layout:
8340  *
8341  * +-------....----+--------------+-----------+--------------------+
8342  * |  user stack   |  data avail  |  nevents  |   pthread_self()   |
8343  * +-------....----+--------------+-----------+--------------------+
8344  *                 ^              ^
8345  *             data_out       eventlist
8346  *
8347  * When a workloop is used, the workloop ID is copied out right before
8348  * the eventlist and is taken from the data buffer.
8349  *
8350  * @warning
8351  * This function is carefully tailored to not make any call except the final tail
8352  * call into kevent_modern_internal. (LTO inlines current_uthread()).
8353  *
8354  * This function is performance sensitive due to the workq subsystem.
8355  */
8356 int
8357 kevent_workq_internal(struct proc *p,
8358     user_addr_t changelist, int nchanges,
8359     user_addr_t eventlist, int nevents,
8360     user_addr_t data_out, user_size_t *data_available,
8361     unsigned int flags, int32_t *retval)
8362 {
8363 	uthread_t uth = current_uthread();
8364 	workq_threadreq_t kqr = uth->uu_kqr_bound;
8365 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8366 	kqueue_t kqu;
8367 
8368 	assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
8369 	    flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));
8370 
8371 	kectx->kec_data_out   = data_out;
8372 	kectx->kec_data_avail = (uint64_t)data_available;
8373 	kectx->kec_data_size  = *data_available;
8374 	kectx->kec_data_resid = *data_available;
8375 	kectx->kec_deadline   = 0;
8376 	kectx->kec_fp         = NULL;
8377 	kectx->kec_fd         = -1;
8378 	/* the kec_process_* fields are filled only if kqueue_scan is called */
8379 
8380 	flags = kevent_adjust_flags_for_proc(p, flags);
8381 
8382 	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
8383 		kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
8384 		kqworkloop_retain(kqu.kqwl);
8385 
8386 		flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
8387 		    KEVENT_FLAG_KERNEL;
8388 	} else {
8389 		kqu.kqwq = p->p_fd.fd_wqkqueue;
8390 
8391 		flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
8392 	}
8393 
8394 	return kevent_modern_internal(kqu, changelist, nchanges,
8395 	           eventlist, nevents, flags, kectx, retval);
8396 }
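
/*
 * Hedged reading of the stack layout described above (descriptive only, not
 * the pthread implementation): `eventlist` points at the events area and,
 * for workloops, kevent_internal() first copies the 8-byte kqueue_id_t out
 * to (eventlist - sizeof(kqueue_id_t)), i.e. into the tail of the data area.
 * `data_available` is read on entry and updated on exit through
 * kectx->kec_data_avail / kec_data_resid, and `data_out` is where filters
 * append any extra event data.
 */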
8397 
8398 /*!
8399  * @function kevent_qos
8400  *
8401  * @brief
8402  * The kevent_qos() syscall.
8403  */
8404 int
8405 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
8406 {
8407 	uthread_t uth = current_uthread();
8408 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8409 	int error, flags = uap->flags & KEVENT_FLAG_USER;
8410 	struct kqueue *kq;
8411 
8412 	if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
8413 		return EINVAL;
8414 	}
8415 
8416 	flags = kevent_adjust_flags_for_proc(p, flags);
8417 
8418 	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
8419 	if (__improbable(error)) {
8420 		return error;
8421 	}
8422 
8423 	kectx->kec_deadline = 0;
8424 	kectx->kec_fp       = NULL;
8425 	kectx->kec_fd       = uap->fd;
8426 	/* the kec_process_* fields are filled only if kqueue_scan is called */
8427 
8428 	/* get the kq we are going to be working on */
8429 	if (__probable(flags & KEVENT_FLAG_WORKQ)) {
8430 		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8431 	} else {
8432 		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8433 	}
8434 	if (__improbable(error)) {
8435 		return error;
8436 	}
8437 
8438 	return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
8439 	           uap->eventlist, uap->nevents, flags, kectx, retval);
8440 }
8441 
8442 #pragma mark legacy syscalls: kevent, kevent64
8443 
8444 /*!
8445  * @function kevent_legacy_get_deadline
8446  *
8447  * @brief
8448  * Compute the deadline for the legacy kevent syscalls.
8449  *
8450  * @discussion
8451  * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
8452  * as this takes precedence over the deadline.
8453  *
8454  * This function will fail if utimeout is USER_ADDR_NULL
8455  * (the caller should check).
8456  */
8457 static int
8458 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
8459 {
8460 	struct timespec ts;
8461 
8462 	if (flags & KEVENT_FLAG_PROC64) {
8463 		struct user64_timespec ts64;
8464 		int error = copyin(utimeout, &ts64, sizeof(ts64));
8465 		if (__improbable(error)) {
8466 			return error;
8467 		}
8468 		ts.tv_sec = (unsigned long)ts64.tv_sec;
8469 		ts.tv_nsec = (long)ts64.tv_nsec;
8470 	} else {
8471 		struct user32_timespec ts32;
8472 		int error = copyin(utimeout, &ts32, sizeof(ts32));
8473 		if (__improbable(error)) {
8474 			return error;
8475 		}
8476 		ts.tv_sec = ts32.tv_sec;
8477 		ts.tv_nsec = ts32.tv_nsec;
8478 	}
8479 	if (!timespec_is_valid(&ts)) {
8480 		return EINVAL;
8481 	}
8482 
8483 	clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
8484 	return 0;
8485 }
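
/*
 * Worked example for the deadline conversion above: a caller passing
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };
 *
 * to kevent()/kevent64() ends up with kec_deadline set to "now" plus 1.5s
 * worth of absolute-time units (via tstoabstime() and
 * clock_absolutetime_interval_to_deadline()). An invalid timespec
 * (e.g. an out-of-range tv_nsec) is rejected with EINVAL.
 */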
8486 
8487 /*!
8488  * @function kevent_legacy_internal
8489  *
8490  * @brief
8491  * The core implementation for kevent and kevent64
8492  */
8493 OS_NOINLINE
8494 static int
8495 kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
8496     int32_t *retval, int flags)
8497 {
8498 	uthread_t uth = current_uthread();
8499 	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8500 	struct kqueue *kq;
8501 	int error;
8502 
8503 	if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
8504 		return EINVAL;
8505 	}
8506 
8507 	flags = kevent_adjust_flags_for_proc(p, flags);
8508 
8509 	kectx->kec_data_out   = 0;
8510 	kectx->kec_data_avail = 0;
8511 	kectx->kec_data_size  = 0;
8512 	kectx->kec_data_resid = 0;
8513 	kectx->kec_deadline   = 0;
8514 	kectx->kec_fp         = NULL;
8515 	kectx->kec_fd         = uap->fd;
8516 	/* the kec_process_* fields are filled only if kqueue_scan is called */
8517 
8518 	/* convert timeout to absolute - if we have one (and not immediate) */
8519 	if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
8520 		error = kevent_legacy_get_deadline(flags, uap->timeout,
8521 		    &kectx->kec_deadline);
8522 		if (__improbable(error)) {
8523 			return error;
8524 		}
8525 	}
8526 
8527 	/* get the kq we are going to be working on */
8528 	if (flags & KEVENT_FLAG_WORKQ) {
8529 		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8530 	} else {
8531 		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8532 	}
8533 	if (__improbable(error)) {
8534 		return error;
8535 	}
8536 
8537 	return kevent_internal(kq, uap->changelist, uap->nchanges,
8538 	           uap->eventlist, uap->nevents, flags, kectx, retval,
8539 	           /*legacy*/ true);
8540 }
8541 
8542 /*!
8543  * @function kevent
8544  *
8545  * @brief
8546  * The legacy kevent() syscall.
8547  */
8548 int
8549 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
8550 {
8551 	struct kevent64_args args = {
8552 		.fd         = uap->fd,
8553 		.changelist = uap->changelist,
8554 		.nchanges   = uap->nchanges,
8555 		.eventlist  = uap->eventlist,
8556 		.nevents    = uap->nevents,
8557 		.timeout    = uap->timeout,
8558 	};
8559 
8560 	return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
8561 }
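
/*
 * Illustrative userspace sketch (not compiled) of the legacy path that ends
 * up here: kqueue(2) creates the kqfile, and kevent(2) both registers
 * changes and collects events through kevent_legacy_internal() with
 * KEVENT_FLAG_LEGACY32. The descriptor name sock_fd is made up for the
 * example.
 *
 *	int kq = kqueue();
 *	struct kevent kev, out;
 *	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);        // register only
 *	int n = kevent(kq, NULL, 0, &out, 1, NULL);      // block for one event
 */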
8562 
8563 /*!
8564  * @function kevent64
8565  *
8566  * @brief
8567  * The legacy kevent64() syscall.
8568  */
8569 int
8570 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
8571 {
8572 	int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
8573 	return kevent_legacy_internal(p, uap, retval, flags);
8574 }
8575 
8576 #pragma mark - socket interface
8577 
8578 #if SOCKETS
8579 #include <sys/param.h>
8580 #include <sys/socket.h>
8581 #include <sys/protosw.h>
8582 #include <sys/domain.h>
8583 #include <sys/mbuf.h>
8584 #include <sys/kern_event.h>
8585 #include <sys/malloc.h>
8586 #include <sys/sys_domain.h>
8587 #include <sys/syslog.h>
8588 
8589 #ifndef ROUNDUP64
8590 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8591 #endif
8592 
8593 #ifndef ADVANCE64
8594 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8595 #endif
8596 
8597 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8598 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8599 
8600 static int kev_attach(struct socket *so, int proto, struct proc *p);
8601 static int kev_detach(struct socket *so);
8602 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8603     struct ifnet *ifp, struct proc *p);
8604 static lck_mtx_t * event_getlock(struct socket *, int);
8605 static int event_lock(struct socket *, int, void *);
8606 static int event_unlock(struct socket *, int, void *);
8607 
8608 static int event_sofreelastref(struct socket *);
8609 static void kev_delete(struct kern_event_pcb *);
8610 
8611 static struct pr_usrreqs event_usrreqs = {
8612 	.pru_attach =           kev_attach,
8613 	.pru_control =          kev_control,
8614 	.pru_detach =           kev_detach,
8615 	.pru_soreceive =        soreceive,
8616 };
8617 
8618 static struct protosw eventsw[] = {
8619 	{
8620 		.pr_type =              SOCK_RAW,
8621 		.pr_protocol =          SYSPROTO_EVENT,
8622 		.pr_flags =             PR_ATOMIC,
8623 		.pr_usrreqs =           &event_usrreqs,
8624 		.pr_lock =              event_lock,
8625 		.pr_unlock =            event_unlock,
8626 		.pr_getlock =           event_getlock,
8627 	}
8628 };
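
/*
 * Illustrative sketch: kernel event sockets are created from userspace over
 * the system domain and filtered via the SIOC*KEV* ioctls handled by
 * kev_control() below, assuming the constants from <sys/kern_event.h>:
 *
 *	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_VENDOR_APPLE,
 *		.kev_class    = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_ANY_SUBCLASS,
 *	};
 *	ioctl(s, SIOCSKEVFILT, &req);
 *	(subsequent reads on s return struct kern_event_msg records)
 */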
8629 
8630 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8631 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8632 
8633 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8634     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8635 
8636 struct kevtstat kevtstat;
8637 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8638     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8639     kevt_getstat, "S,kevtstat", "");
8640 
8641 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8642     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8643     kevt_pcblist, "S,xkevtpcb", "");
8644 
8645 SYSCTL_UINT(_net_systm_kevt, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
8646     (unsigned int *)&kevtstat.kes_pcbcount, 0, "");
8647 
8648 static lck_mtx_t *
8649 event_getlock(struct socket *so, int flags)
8650 {
8651 #pragma unused(flags)
8652 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8653 
8654 	if (so->so_pcb != NULL) {
8655 		if (so->so_usecount < 0) {
8656 			panic("%s: so=%p usecount=%d lrh= %s", __func__,
8657 			    so, so->so_usecount, solockhistory_nr(so));
8658 			/* NOTREACHED */
8659 		}
8660 	} else {
8661 		panic("%s: so=%p NULL NO so_pcb %s", __func__,
8662 		    so, solockhistory_nr(so));
8663 		/* NOTREACHED */
8664 	}
8665 	return &ev_pcb->evp_mtx;
8666 }
8667 
8668 static int
8669 event_lock(struct socket *so, int refcount, void *lr)
8670 {
8671 	void *lr_saved;
8672 
8673 	if (lr == NULL) {
8674 		lr_saved = __builtin_return_address(0);
8675 	} else {
8676 		lr_saved = lr;
8677 	}
8678 
8679 	if (so->so_pcb != NULL) {
8680 		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8681 	} else {
8682 		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
8683 		    so, lr_saved, solockhistory_nr(so));
8684 		/* NOTREACHED */
8685 	}
8686 
8687 	if (so->so_usecount < 0) {
8688 		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
8689 		    so, so->so_pcb, lr_saved, so->so_usecount,
8690 		    solockhistory_nr(so));
8691 		/* NOTREACHED */
8692 	}
8693 
8694 	if (refcount) {
8695 		so->so_usecount++;
8696 	}
8697 
8698 	so->lock_lr[so->next_lock_lr] = lr_saved;
8699 	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
8700 	return 0;
8701 }
8702 
8703 static int
8704 event_unlock(struct socket *so, int refcount, void *lr)
8705 {
8706 	void *lr_saved;
8707 	lck_mtx_t *mutex_held;
8708 
8709 	if (lr == NULL) {
8710 		lr_saved = __builtin_return_address(0);
8711 	} else {
8712 		lr_saved = lr;
8713 	}
8714 
8715 	if (refcount) {
8716 		so->so_usecount--;
8717 	}
8718 	if (so->so_usecount < 0) {
8719 		panic("%s: so=%p usecount=%d lrh= %s", __func__,
8720 		    so, so->so_usecount, solockhistory_nr(so));
8721 		/* NOTREACHED */
8722 	}
8723 	if (so->so_pcb == NULL) {
8724 		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
8725 		    so, so->so_usecount, (void *)lr_saved,
8726 		    solockhistory_nr(so));
8727 		/* NOTREACHED */
8728 	}
8729 	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8730 
8731 	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8732 	so->unlock_lr[so->next_unlock_lr] = lr_saved;
8733 	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
8734 
8735 	if (so->so_usecount == 0) {
8736 		VERIFY(so->so_flags & SOF_PCBCLEARING);
8737 		event_sofreelastref(so);
8738 	} else {
8739 		lck_mtx_unlock(mutex_held);
8740 	}
8741 
8742 	return 0;
8743 }
8744 
8745 static int
8746 event_sofreelastref(struct socket *so)
8747 {
8748 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8749 
8750 	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8751 
8752 	so->so_pcb = NULL;
8753 
8754 	/*
8755 	 * Disable upcall in the event another thread is in kev_post_msg()
8756 	 * appending record to the receive socket buffer, since sbwakeup()
8757 	 * may release the socket lock otherwise.
8758 	 */
8759 	so->so_rcv.sb_flags &= ~SB_UPCALL;
8760 	so->so_snd.sb_flags &= ~SB_UPCALL;
8761 	so->so_event = sonullevent;
8762 	lck_mtx_unlock(&(ev_pcb->evp_mtx));
8763 
8764 	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8765 	lck_rw_lock_exclusive(&kev_rwlock);
8766 	LIST_REMOVE(ev_pcb, evp_link);
8767 	kevtstat.kes_pcbcount--;
8768 	kevtstat.kes_gencnt++;
8769 	lck_rw_done(&kev_rwlock);
8770 	kev_delete(ev_pcb);
8771 
8772 	sofreelastref(so, 1);
8773 	return 0;
8774 }
8775 
8776 static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));
8777 
8778 static struct kern_event_head kern_event_head;
8780 
8781 static u_int32_t static_event_id = 0;
8782 
8783 static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8784 
8785 /*
8786  * Install the protosw entries for the NKE manager, invoked at system domain init time.
8787  */
8788 void
8789 kern_event_init(struct domain *dp)
8790 {
8791 	struct protosw *pr;
8792 	int i;
8793 
8794 	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8795 	VERIFY(dp == systemdomain);
8796 
8797 	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8798 		net_add_proto(pr, dp, 1);
8799 	}
8800 }
8801 
8802 static int
8803 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8804 {
8805 	int error = 0;
8806 	struct kern_event_pcb *ev_pcb;
8807 
8808 	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8809 	if (error != 0) {
8810 		return error;
8811 	}
8812 
8813 	ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
8814 	lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);
8815 
8816 	ev_pcb->evp_socket = so;
8817 	ev_pcb->evp_vendor_code_filter = 0xffffffff;
8818 
8819 	so->so_pcb = (caddr_t) ev_pcb;
8820 	lck_rw_lock_exclusive(&kev_rwlock);
8821 	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8822 	kevtstat.kes_pcbcount++;
8823 	kevtstat.kes_gencnt++;
8824 	lck_rw_done(&kev_rwlock);
8825 
8826 	return error;
8827 }
8828 
8829 static void
8830 kev_delete(struct kern_event_pcb *ev_pcb)
8831 {
8832 	VERIFY(ev_pcb != NULL);
8833 	lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
8834 	zfree(ev_pcb_zone, ev_pcb);
8835 }
8836 
8837 static int
8838 kev_detach(struct socket *so)
8839 {
8840 	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8841 
8842 	if (ev_pcb != NULL) {
8843 		soisdisconnected(so);
8844 		so->so_flags |= SOF_PCBCLEARING;
8845 	}
8846 
8847 	return 0;
8848 }
8849 
8850 /*
8851  * For now, kev_vendor_code and mbuf_tags use the same
8852  * mechanism.
8853  */
8854 errno_t
8855 kev_vendor_code_find(
8856 	const char      *string,
8857 	u_int32_t       *out_vendor_code)
8858 {
8859 	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8860 		return EINVAL;
8861 	}
8862 	return net_str_id_find_internal(string, out_vendor_code,
8863 	           NSI_VENDOR_CODE, 1);
8864 }
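
/*
 * Illustrative sketch: a kext typically resolves its vendor code once before
 * posting events, assuming the kev_vendor_code_find() KPI above (the
 * reverse-DNS string is hypothetical):
 *
 *	u_int32_t vendor_code;
 *	errno_t err = kev_vendor_code_find("com.example.driver", &vendor_code);
 */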
8865 
8866 errno_t
8867 kev_msg_post(struct kev_msg *event_msg)
8868 {
8869 	mbuf_tag_id_t min_vendor, max_vendor;
8870 
8871 	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8872 
8873 	if (event_msg == NULL) {
8874 		return EINVAL;
8875 	}
8876 
8877 	/*
8878 	 * Limit third parties to posting events for registered vendor codes
8879 	 * only
8880 	 */
8881 	if (event_msg->vendor_code < min_vendor ||
8882 	    event_msg->vendor_code > max_vendor) {
8883 		os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8884 		return EINVAL;
8885 	}
8886 	return kev_post_msg(event_msg);
8887 }
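
/*
 * Illustrative sketch: building a struct kev_msg and handing it to
 * kev_msg_post() above, assuming a vendor code from kev_vendor_code_find()
 * and hypothetical MY_* class/subclass/event constants chosen by the caller:
 *
 *	struct kev_msg msg = {
 *		.vendor_code  = vendor_code,
 *		.kev_class    = MY_EVENT_CLASS,
 *		.kev_subclass = MY_EVENT_SUBCLASS,
 *		.event_code   = MY_EVENT_CODE,
 *	};
 *	msg.dv[0].data_ptr    = &payload;
 *	msg.dv[0].data_length = sizeof(payload);
 *	errno_t err = kev_msg_post(&msg);
 */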
8888 
8889 static int
8890 kev_post_msg_internal(struct kev_msg *event_msg, int wait)
8891 {
8892 	struct mbuf *m, *m2;
8893 	struct kern_event_pcb *ev_pcb;
8894 	struct kern_event_msg *ev;
8895 	char *tmp;
8896 	u_int32_t total_size;
8897 	int i;
8898 
8899 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
8900 	/*
8901 	 * Special hook for ALF state updates
8902 	 */
8903 	if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
8904 	    event_msg->kev_class == KEV_NKE_CLASS &&
8905 	    event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
8906 	    event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
8907 #if MACH_ASSERT
8908 		os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
8909 #endif /* MACH_ASSERT */
8910 		net_filter_event_mark(NET_FILTER_EVENT_ALF,
8911 		    net_check_compatible_alf());
8912 	}
8913 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
8914 
8915 	/* Verify the message is small enough to fit in one mbuf w/o cluster */
8916 	total_size = KEV_MSG_HEADER_SIZE;
8917 
8918 	for (i = 0; i < 5; i++) {
8919 		if (event_msg->dv[i].data_length == 0) {
8920 			break;
8921 		}
8922 		total_size += event_msg->dv[i].data_length;
8923 	}
8924 
8925 	if (total_size > MLEN) {
8926 		os_atomic_inc(&kevtstat.kes_toobig, relaxed);
8927 		return EMSGSIZE;
8928 	}
8929 
8930 	m = m_get(wait, MT_DATA);
8931 	if (m == NULL) {
8932 		os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8933 		return ENOMEM;
8934 	}
8935 	ev = mtod(m, struct kern_event_msg *);
8936 	total_size = KEV_MSG_HEADER_SIZE;
8937 
8938 	tmp = (char *) &ev->event_data[0];
8939 	for (i = 0; i < 5; i++) {
8940 		if (event_msg->dv[i].data_length == 0) {
8941 			break;
8942 		}
8943 
8944 		total_size += event_msg->dv[i].data_length;
8945 		bcopy(event_msg->dv[i].data_ptr, tmp,
8946 		    event_msg->dv[i].data_length);
8947 		tmp += event_msg->dv[i].data_length;
8948 	}
8949 
8950 	ev->id = ++static_event_id;
8951 	ev->total_size   = total_size;
8952 	ev->vendor_code  = event_msg->vendor_code;
8953 	ev->kev_class    = event_msg->kev_class;
8954 	ev->kev_subclass = event_msg->kev_subclass;
8955 	ev->event_code   = event_msg->event_code;
8956 
8957 	m->m_len = total_size;
8958 	lck_rw_lock_shared(&kev_rwlock);
8959 	for (ev_pcb = LIST_FIRST(&kern_event_head);
8960 	    ev_pcb;
8961 	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8962 		lck_mtx_lock(&ev_pcb->evp_mtx);
8963 		if (ev_pcb->evp_socket->so_pcb == NULL) {
8964 			lck_mtx_unlock(&ev_pcb->evp_mtx);
8965 			continue;
8966 		}
8967 		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8968 			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8969 				lck_mtx_unlock(&ev_pcb->evp_mtx);
8970 				continue;
8971 			}
8972 
8973 			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
8974 				if (ev_pcb->evp_class_filter != ev->kev_class) {
8975 					lck_mtx_unlock(&ev_pcb->evp_mtx);
8976 					continue;
8977 				}
8978 
8979 				if ((ev_pcb->evp_subclass_filter !=
8980 				    KEV_ANY_SUBCLASS) &&
8981 				    (ev_pcb->evp_subclass_filter !=
8982 				    ev->kev_subclass)) {
8983 					lck_mtx_unlock(&ev_pcb->evp_mtx);
8984 					continue;
8985 				}
8986 			}
8987 		}
8988 
8989 		m2 = m_copym(m, 0, m->m_len, wait);
8990 		if (m2 == NULL) {
8991 			os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8992 			m_free(m);
8993 			lck_mtx_unlock(&ev_pcb->evp_mtx);
8994 			lck_rw_done(&kev_rwlock);
8995 			return ENOMEM;
8996 		}
8997 		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
8998 			/*
8999 			 * Use "m" for the socket stats: once appended, "m2"
9000 			 * belongs to the receive buffer and is unsafe to use.
9001 			 */
9002 			so_inc_recv_data_stat(ev_pcb->evp_socket,
9003 			    1, m->m_len);
9004 
9005 			sorwakeup(ev_pcb->evp_socket);
9006 			os_atomic_inc(&kevtstat.kes_posted, relaxed);
9007 		} else {
9008 			os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
9009 		}
9010 		lck_mtx_unlock(&ev_pcb->evp_mtx);
9011 	}
9012 	m_free(m);
9013 	lck_rw_done(&kev_rwlock);
9014 
9015 	return 0;
9016 }
9017 
9018 int
9019 kev_post_msg(struct kev_msg *event_msg)
9020 {
9021 	return kev_post_msg_internal(event_msg, M_WAIT);
9022 }
9023 
9024 int
9025 kev_post_msg_nowait(struct kev_msg *event_msg)
9026 {
9027 	return kev_post_msg_internal(event_msg, M_NOWAIT);
9028 }
9029 
9030 static int
9031 kev_control(struct socket *so,
9032     u_long cmd,
9033     caddr_t data,
9034     __unused struct ifnet *ifp,
9035     __unused struct proc *p)
9036 {
9037 	struct kev_request *kev_req = (struct kev_request *) data;
9038 	struct kern_event_pcb  *ev_pcb;
9039 	struct kev_vendor_code *kev_vendor;
9040 	u_int32_t  *id_value = (u_int32_t *) data;
9041 
9042 	switch (cmd) {
9043 	case SIOCGKEVID:
9044 		*id_value = static_event_id;
9045 		break;
9046 	case SIOCSKEVFILT:
9047 		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9048 		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
9049 		ev_pcb->evp_class_filter = kev_req->kev_class;
9050 		ev_pcb->evp_subclass_filter  = kev_req->kev_subclass;
9051 		break;
9052 	case SIOCGKEVFILT:
9053 		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9054 		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
9055 		kev_req->kev_class   = ev_pcb->evp_class_filter;
9056 		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
9057 		break;
9058 	case SIOCGKEVVENDOR:
9059 		kev_vendor = (struct kev_vendor_code *)data;
9060 		/* Make sure string is NULL terminated */
9061 		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
9062 		return net_str_id_find_internal(kev_vendor->vendor_string,
9063 		           &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
9064 	default:
9065 		return ENOTSUP;
9066 	}
9067 
9068 	return 0;
9069 }
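
/*
 * Illustrative sketch: userspace can translate a vendor string into its code
 * over the same PF_SYSTEM socket via the SIOCGKEVVENDOR ioctl handled above,
 * assuming <sys/kern_event.h> (the vendor string is hypothetical):
 *
 *	struct kev_vendor_code vc = { 0 };
 *	strlcpy(vc.vendor_string, "com.example.driver", sizeof(vc.vendor_string));
 *	if (ioctl(s, SIOCGKEVVENDOR, &vc) == 0) {
 *		(vc.vendor_code now holds the registered code)
 *	}
 */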
9070 
9071 int
9072 kevt_getstat SYSCTL_HANDLER_ARGS
9073 {
9074 #pragma unused(oidp, arg1, arg2)
9075 	int error = 0;
9076 
9077 	lck_rw_lock_shared(&kev_rwlock);
9078 
9079 	if (req->newptr != USER_ADDR_NULL) {
9080 		error = EPERM;
9081 		goto done;
9082 	}
9083 	if (req->oldptr == USER_ADDR_NULL) {
9084 		req->oldidx = sizeof(struct kevtstat);
9085 		goto done;
9086 	}
9087 
9088 	error = SYSCTL_OUT(req, &kevtstat,
9089 	    MIN(sizeof(struct kevtstat), req->oldlen));
9090 done:
9091 	lck_rw_done(&kev_rwlock);
9092 
9093 	return error;
9094 }
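
/*
 * Illustrative sketch: reading the statistics exported above from userspace,
 * assuming the "net.systm.kevt.stats" sysctl registered earlier and the
 * struct kevtstat layout from the kernel headers:
 *
 *	struct kevtstat st;
 *	size_t len = sizeof(st);
 *	sysctlbyname("net.systm.kevt.stats", &st, &len, NULL, 0);
 */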
9095 
9096 __private_extern__ int
9097 kevt_pcblist SYSCTL_HANDLER_ARGS
9098 {
9099 #pragma unused(oidp, arg1, arg2)
9100 	int error = 0;
9101 	uint64_t n, i;
9102 	struct xsystmgen xsg;
9103 	void *buf = NULL;
9104 	size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
9105 	    ROUNDUP64(sizeof(struct xsocket_n)) +
9106 	    2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
9107 	    ROUNDUP64(sizeof(struct xsockstat_n));
9108 	struct kern_event_pcb  *ev_pcb;
9109 
9110 	buf = kalloc_data(item_size, Z_WAITOK_ZERO_NOFAIL);
9111 
9112 	lck_rw_lock_shared(&kev_rwlock);
9113 
9114 	n = kevtstat.kes_pcbcount;
9115 
9116 	if (req->oldptr == USER_ADDR_NULL) {
9117 		req->oldidx = (size_t) ((n + n / 8) * item_size);
9118 		goto done;
9119 	}
9120 	if (req->newptr != USER_ADDR_NULL) {
9121 		error = EPERM;
9122 		goto done;
9123 	}
9124 	bzero(&xsg, sizeof(xsg));
9125 	xsg.xg_len = sizeof(xsg);
9126 	xsg.xg_count = n;
9127 	xsg.xg_gen = kevtstat.kes_gencnt;
9128 	xsg.xg_sogen = so_gencnt;
9129 	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
9130 	if (error) {
9131 		goto done;
9132 	}
9133 	/*
9134 	 * We are done if there is no pcb
9135 	 */
9136 	if (n == 0) {
9137 		goto done;
9138 	}
9139 
9141 	for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
9142 	    i < n && ev_pcb != NULL;
9143 	    i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
9144 		struct xkevtpcb *xk = (struct xkevtpcb *)buf;
9145 		struct xsocket_n *xso = (struct xsocket_n *)
9146 		    ADVANCE64(xk, sizeof(*xk));
9147 		struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
9148 		    ADVANCE64(xso, sizeof(*xso));
9149 		struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
9150 		    ADVANCE64(xsbrcv, sizeof(*xsbrcv));
9151 		struct xsockstat_n *xsostats = (struct xsockstat_n *)
9152 		    ADVANCE64(xsbsnd, sizeof(*xsbsnd));
9153 
9154 		bzero(buf, item_size);
9155 
9156 		lck_mtx_lock(&ev_pcb->evp_mtx);
9157 
9158 		xk->kep_len = sizeof(struct xkevtpcb);
9159 		xk->kep_kind = XSO_EVT;
9160 		xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRHASH(ev_pcb);
9161 		xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
9162 		xk->kep_class_filter = ev_pcb->evp_class_filter;
9163 		xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
9164 
9165 		sotoxsocket_n(ev_pcb->evp_socket, xso);
9166 		sbtoxsockbuf_n(ev_pcb->evp_socket ?
9167 		    &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
9168 		sbtoxsockbuf_n(ev_pcb->evp_socket ?
9169 		    &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
9170 		sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
9171 
9172 		lck_mtx_unlock(&ev_pcb->evp_mtx);
9173 
9174 		error = SYSCTL_OUT(req, buf, item_size);
9175 	}
9176 
9177 	if (error == 0) {
9178 		/*
9179 		 * Give the user an updated idea of our state.
9180 		 * If the generation differs from what we told
9181 		 * her before, she knows that something happened
9182 		 * while we were processing this request, and it
9183 		 * might be necessary to retry.
9184 		 */
9185 		bzero(&xsg, sizeof(xsg));
9186 		xsg.xg_len = sizeof(xsg);
9187 		xsg.xg_count = n;
9188 		xsg.xg_gen = kevtstat.kes_gencnt;
9189 		xsg.xg_sogen = so_gencnt;
9190 		error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
9191 		if (error) {
9192 			goto done;
9193 		}
9194 	}
9195 
9196 done:
9197 	lck_rw_done(&kev_rwlock);
9198 
9199 	kfree_data(buf, item_size);
9200 	return error;
9201 }
9202 
9203 #endif /* SOCKETS */
9204 
9205 
9206 int
9207 fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
9208 {
9209 	struct vinfo_stat * st;
9210 
9211 	st = &kinfo->kq_stat;
9212 
9213 	st->vst_size = kqu.kq->kq_count;
9214 	if (kqu.kq->kq_state & KQ_KEV_QOS) {
9215 		st->vst_blksize = sizeof(struct kevent_qos_s);
9216 	} else if (kqu.kq->kq_state & KQ_KEV64) {
9217 		st->vst_blksize = sizeof(struct kevent64_s);
9218 	} else {
9219 		st->vst_blksize = sizeof(struct kevent);
9220 	}
9221 	st->vst_mode = S_IFIFO;
9222 	st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
9223 	    kqu.kqwl->kqwl_dynamicid : 0;
9224 
9225 	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
9226 #define PROC_KQUEUE_MASK (KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
9227 	static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
9228 	static_assert(PROC_KQUEUE_32 == KQ_KEV32);
9229 	static_assert(PROC_KQUEUE_64 == KQ_KEV64);
9230 	static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
9231 	static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
9232 	static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
9233 	kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
9234 	if ((kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0) {
9235 		if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
9236 			kinfo->kq_state |= PROC_KQUEUE_SELECT;
9237 		}
9238 	}
9239 
9240 	return 0;
9241 }
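
/*
 * Illustrative sketch: the kqueue_info filled in above reaches userspace via
 * libproc, assuming the PROC_PIDFDKQUEUEINFO flavor and struct kqueue_fdinfo
 * from <sys/proc_info.h> (pid and kqfd are hypothetical):
 *
 *	struct kqueue_fdinfo kqi;
 *	proc_pidfdinfo(pid, kqfd, PROC_PIDFDKQUEUEINFO, &kqi, sizeof(kqi));
 */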
9242 
9243 static int
9244 fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
9245 {
9246 	workq_threadreq_t kqr = &kqwl->kqwl_request;
9247 	workq_threadreq_param_t trp = {};
9248 	int err;
9249 
9250 	if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
9251 		return EINVAL;
9252 	}
9253 
9254 	if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
9255 		return err;
9256 	}
9257 
9258 	kqlock(kqwl);
9259 
9260 	kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
9261 	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
9262 	kqdi->kqdi_request_state = kqr->tr_state;
9263 	kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
9264 	kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
9265 	kqdi->kqdi_sync_waiters = 0;
9266 	kqdi->kqdi_sync_waiter_qos = 0;
9267 
9268 	trp.trp_value = kqwl->kqwl_params;
9269 	if (trp.trp_flags & TRP_PRIORITY) {
9270 		kqdi->kqdi_pri = trp.trp_pri;
9271 	} else {
9272 		kqdi->kqdi_pri = 0;
9273 	}
9274 
9275 	if (trp.trp_flags & TRP_POLICY) {
9276 		kqdi->kqdi_pol = trp.trp_pol;
9277 	} else {
9278 		kqdi->kqdi_pol = 0;
9279 	}
9280 
9281 	if (trp.trp_flags & TRP_CPUPERCENT) {
9282 		kqdi->kqdi_cpupercent = trp.trp_cpupercent;
9283 	} else {
9284 		kqdi->kqdi_cpupercent = 0;
9285 	}
9286 
9287 	kqunlock(kqwl);
9288 
9289 	return 0;
9290 }
9291 
9292 
9293 static unsigned long
9294 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
9295     unsigned long buflen, unsigned long nknotes)
9296 {
9297 	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
9298 		if (kq == knote_get_kq(kn)) {
9299 			if (nknotes < buflen) {
9300 				struct kevent_extinfo *info = &buf[nknotes];
9301 
9302 				kqlock(kq);
9303 
9304 				if (knote_fops(kn)->f_sanitized_copyout) {
9305 					knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
9306 				} else {
9307 					info->kqext_kev         = *(struct kevent_qos_s *)&kn->kn_kevent;
9308 				}
9309 
9310 				if (knote_has_qos(kn)) {
9311 					info->kqext_kev.qos =
9312 					    _pthread_priority_thread_qos_fast(kn->kn_qos);
9313 				} else {
9314 					info->kqext_kev.qos = kn->kn_qos_override;
9315 				}
9316 				info->kqext_kev.filter |= 0xff00; /* sign extend filter */
9317 				info->kqext_kev.xflags  = 0; /* this is where sfflags lives */
9318 				info->kqext_kev.data    = 0; /* this is where sdata lives */
9319 				info->kqext_sdata       = kn->kn_sdata;
9320 				info->kqext_status      = kn->kn_status;
9321 				info->kqext_sfflags     = kn->kn_sfflags;
9322 
9323 				kqunlock(kq);
9324 			}
9325 
9326 			/* we return the total number of knotes, which may be more than requested */
9327 			nknotes++;
9328 		}
9329 	}
9330 
9331 	return nknotes;
9332 }
9333 
9334 int
9335 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
9336     int32_t *nkqueues_out)
9337 {
9338 	proc_t p = (proc_t)proc;
9339 	struct filedesc *fdp = &p->p_fd;
9340 	unsigned int nkqueues = 0;
9341 	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
9342 	size_t buflen, bufsize;
9343 	kqueue_id_t *kq_ids = NULL;
9344 	int err = 0;
9345 
9346 	assert(p != NULL);
9347 
9348 	if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
9349 		err = EINVAL;
9350 		goto out;
9351 	}
9352 
9353 	buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
9354 
9355 	if (ubuflen != 0) {
9356 		if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
9357 			err = ERANGE;
9358 			goto out;
9359 		}
9360 		kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
9361 		if (!kq_ids) {
9362 			err = ENOMEM;
9363 			goto out;
9364 		}
9365 	}
9366 
9367 	kqhash_lock(fdp);
9368 
9369 	u_long kqhashmask = fdp->fd_kqhashmask;
9370 	if (kqhashmask > 0) {
9371 		for (uint32_t i = 0; i < kqhashmask + 1; i++) {
9372 			struct kqworkloop *kqwl;
9373 
9374 			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9375 				/* report the number of kqueues, even if they don't all fit */
9376 				if (nkqueues < buflen) {
9377 					kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
9378 				}
9379 				nkqueues++;
9380 			}
9381 
9382 			/*
9383 			 * Drop the kqhash lock and take it again to give some breathing room
9384 			 */
9385 			kqhash_unlock(fdp);
9386 			kqhash_lock(fdp);
9387 
9388 			/*
9389 			 * Reevaluate to see if we have raced with someone who changed this -
9390 			 * if we have, we should bail out with the set of info captured so far
9391 			 */
9392 			if (fdp->fd_kqhashmask != kqhashmask) {
9393 				break;
9394 			}
9395 		}
9396 	}
9397 
9398 	kqhash_unlock(fdp);
9399 
9400 	if (kq_ids) {
9401 		size_t copysize;
9402 		if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
9403 			err = ERANGE;
9404 			goto out;
9405 		}
9406 
9407 		assert(ubufsize >= copysize);
9408 		err = copyout(kq_ids, ubuf, copysize);
9409 	}
9410 
9411 out:
9412 	if (kq_ids) {
9413 		kfree_data(kq_ids, bufsize);
9414 	}
9415 
9416 	if (!err) {
9417 		*nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
9418 	}
9419 	return err;
9420 }
9421 
9422 int
9423 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9424     uint32_t ubufsize, int32_t *size_out)
9425 {
9426 	proc_t p = (proc_t)proc;
9427 	struct kqworkloop *kqwl;
9428 	int err = 0;
9429 	struct kqueue_dyninfo kqdi = { };
9430 
9431 	assert(p != NULL);
9432 
9433 	if (ubufsize < sizeof(struct kqueue_info)) {
9434 		return ENOBUFS;
9435 	}
9436 
9437 	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9438 	if (!kqwl) {
9439 		return ESRCH;
9440 	}
9441 
9442 	/*
9443 	 * backward compatibility: allow the argument to this call to only be
9444 	 * a struct kqueue_info
9445 	 */
9446 	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
9447 		ubufsize = sizeof(struct kqueue_dyninfo);
9448 		err = fill_kqueue_dyninfo(kqwl, &kqdi);
9449 	} else {
9450 		ubufsize = sizeof(struct kqueue_info);
9451 		err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
9452 	}
9453 	if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
9454 		*size_out = ubufsize;
9455 	}
9456 	kqworkloop_release(kqwl);
9457 	return err;
9458 }
9459 
9460 int
9461 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9462     uint32_t ubufsize, int32_t *nknotes_out)
9463 {
9464 	proc_t p = (proc_t)proc;
9465 	struct kqworkloop *kqwl;
9466 	int err;
9467 
9468 	kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9469 	if (!kqwl) {
9470 		return ESRCH;
9471 	}
9472 
9473 	err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
9474 	kqworkloop_release(kqwl);
9475 	return err;
9476 }
9477 
9478 int
9479 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
9480     uint32_t bufsize, int32_t *retval)
9481 {
9482 	struct knote *kn;
9483 	int i;
9484 	int err = 0;
9485 	struct filedesc *fdp = &p->p_fd;
9486 	unsigned long nknotes = 0;
9487 	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
9488 	struct kevent_extinfo *kqext = NULL;
9489 
9490 	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
9491 	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
9492 
9493 	kqext = (struct kevent_extinfo *)kalloc_data(buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
9494 	if (kqext == NULL) {
9495 		err = ENOMEM;
9496 		goto out;
9497 	}
9498 
9499 	proc_fdlock(p);
9500 	u_long fd_knlistsize = fdp->fd_knlistsize;
9501 	struct klist *fd_knlist = fdp->fd_knlist;
9502 
9503 	for (i = 0; i < fd_knlistsize; i++) {
9504 		kn = SLIST_FIRST(&fd_knlist[i]);
9505 		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9506 
9507 		proc_fdunlock(p);
9508 		proc_fdlock(p);
9509 		/*
9510 		 * Reevaluate to see if we have raced with someone who changed
9511 		 * fd_knlistsize: if so, keep reporting against the size we saw
9512 		 * at the start, unless knotes_dealloc dropped it to zero, in
9513 		 * which case we bail out early with the info captured so far.
9514 		 */
9515 		if (fd_knlistsize != fdp->fd_knlistsize) {
9516 			if (fdp->fd_knlistsize) {
9517 				/* kq_add_knote might grow fdp->fd_knlist. */
9518 				fd_knlist = fdp->fd_knlist;
9519 			} else {
9520 				break;
9521 			}
9522 		}
9523 	}
9524 	proc_fdunlock(p);
9525 
9526 	knhash_lock(fdp);
9527 	u_long knhashmask = fdp->fd_knhashmask;
9528 
9529 	if (knhashmask != 0) {
9530 		for (i = 0; i < (int)knhashmask + 1; i++) {
9531 			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
9532 			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9533 
9534 			knhash_unlock(fdp);
9535 			knhash_lock(fdp);
9536 
9537 			/*
9538 			 * Reevaluate to see if we have raced with someone who changed this -
9539 			 * if we have, we should bail out with the set of info captured so far
9540 			 */
9541 			if (fdp->fd_knhashmask != knhashmask) {
9542 				break;
9543 			}
9544 		}
9545 	}
9546 	knhash_unlock(fdp);
9547 
9548 	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9549 	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9550 
9551 out:
9552 	kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));
9553 
9554 	if (!err) {
9555 		*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
9556 	}
9557 	return err;
9558 }
9559 
9560 static unsigned int
9561 klist_copy_udata(struct klist *list, uint64_t *buf,
9562     unsigned int buflen, unsigned int nknotes)
9563 {
9564 	struct knote *kn;
9565 	SLIST_FOREACH(kn, list, kn_link) {
9566 		if (nknotes < buflen) {
9567 			/*
9568 			 * kevent_register will always set kn_udata atomically
9569 			 * so that we don't have to take any kqlock here.
9570 			 */
9571 			buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9572 		}
9573 		/* we return the total number of knotes, which may be more than requested */
9574 		nknotes++;
9575 	}
9576 
9577 	return nknotes;
9578 }
9579 
9580 int
9581 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
9582 {
9583 	proc_t p = (proc_t)proc;
9584 	struct filedesc *fdp = &p->p_fd;
9585 	unsigned int nuptrs = 0;
9586 	unsigned int buflen = bufsize / sizeof(uint64_t);
9587 	struct kqworkloop *kqwl;
9588 	u_long size = 0;
9589 	struct klist *fd_knlist = NULL;
9590 
9591 	if (buflen > 0) {
9592 		assert(buf != NULL);
9593 	}
9594 
9595 	/*
9596 	 * Copy out as many uptrs as possible, but drop and retake the respective
9597 	 * locks periodically so that we don't blow through preemption-disabled
9598 	 * timeouts. Always reevaluate to see if we have raced with someone who
9599 	 * changed the size of the hash: if we have, we keep reporting against the
9600 	 * size we knew in the beginning, unless it drops to 0, in which case we
9601 	 * bail out with the set of info captured so far.
9602 	 */
9603 	proc_fdlock(p);
9604 	size = fdp->fd_knlistsize;
9605 	fd_knlist = fdp->fd_knlist;
9606 
9607 	for (int i = 0; i < size; i++) {
9608 		nuptrs = klist_copy_udata(&fd_knlist[i], buf, buflen, nuptrs);
9609 
9610 		proc_fdunlock(p);
9611 		proc_fdlock(p);
9612 		if (size != fdp->fd_knlistsize) {
9613 			if (fdp->fd_knlistsize) {
9614 				/* kq_add_knote might grow fdp->fd_knlist. */
9615 				fd_knlist = fdp->fd_knlist;
9616 			} else {
9617 				break;
9618 			}
9619 		}
9620 	}
9621 	proc_fdunlock(p);
9622 
9623 	knhash_lock(fdp);
9624 	size = fdp->fd_knhashmask;
9625 
9626 	if (size != 0) {
9627 		for (size_t i = 0; i < size + 1; i++) {
9628 			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
9629 
9630 			knhash_unlock(fdp);
9631 			knhash_lock(fdp);
9632 			/* The only path that can interleave with us today is knotes_dealloc. */
9633 			if (size != fdp->fd_knhashmask) {
9634 				break;
9635 			}
9636 		}
9637 	}
9638 	knhash_unlock(fdp);
9639 
9640 	kqhash_lock(fdp);
9641 	size = fdp->fd_kqhashmask;
9642 
9643 	if (size != 0) {
9644 		for (size_t i = 0; i < size + 1; i++) {
9645 			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9646 				if (nuptrs < buflen) {
9647 					buf[nuptrs] = kqwl->kqwl_dynamicid;
9648 				}
9649 				nuptrs++;
9650 			}
9651 
9652 			kqhash_unlock(fdp);
9653 			kqhash_lock(fdp);
9654 			if (size != fdp->fd_kqhashmask) {
9655 				break;
9656 			}
9657 		}
9658 	}
9659 	kqhash_unlock(fdp);
9660 
9661 	return (int)nuptrs;
9662 }
9663 
9664 static void
9665 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9666 {
9667 	uint64_t ast_addr;
9668 	bool proc_is_64bit = !!(p->p_flag & P_LP64);
9669 	size_t user_addr_size = proc_is_64bit ? 8 : 4;
9670 	uint32_t ast_flags32 = 0;
9671 	uint64_t ast_flags64 = 0;
9672 	struct uthread *ut = get_bsdthread_info(thread);
9673 
9674 	if (ut->uu_kqr_bound != NULL) {
9675 		ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9676 	}
9677 
9678 	if (ast_flags64 == 0) {
9679 		return;
9680 	}
9681 
9682 	if (!(p->p_flag & P_LP64)) {
9683 		ast_flags32 = (uint32_t)ast_flags64;
9684 		assert(ast_flags64 < 0x100000000ull);
9685 	}
9686 
9687 	ast_addr = thread_rettokern_addr(thread);
9688 	if (ast_addr == 0) {
9689 		return;
9690 	}
9691 
9692 	if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9693 	    (user_addr_t)ast_addr,
9694 	    user_addr_size) != 0) {
9695 		printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9696 		    "ast_addr = %llu\n", proc_getpid(p), thread_tid(current_thread()), ast_addr);
9697 	}
9698 }
9699 
9700 /*
9701  * Semantics of writing to the TSD value:
9702  *
9703  * 1. It is written to by the kernel and cleared by userspace.
9704  * 2. When the userspace code clears the TSD field, it takes responsibility
9705  * for acting on the quantum expiry conveyed by the kernel.
9706  * 3. The TSD value is always cleared upon entry into userspace and upon exit
9707  * of userspace back to the kernel, to make sure that it is never leaked
9708  * across thread requests.
9709  */
9710 void
9711 kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
9712     uint64_t flags)
9713 {
9714 	uint64_t ast_addr;
9715 	bool proc_is_64bit = !!(p->p_flag & P_LP64);
9716 	uint32_t ast_flags32 = 0;
9717 	uint64_t ast_flags64 = flags;
9718 
9719 	if (ast_flags64 == 0) {
9720 		return;
9721 	}
9722 
9723 	if (!(p->p_flag & P_LP64)) {
9724 		ast_flags32 = (uint32_t)ast_flags64;
9725 		assert(ast_flags64 < 0x100000000ull);
9726 	}
9727 
9728 	ast_addr = thread_wqquantum_addr(thread);
9729 	assert(ast_addr != 0);
9730 
9731 	if (proc_is_64bit) {
9732 		if (copyout_atomic64(ast_flags64, (user_addr_t) ast_addr)) {
9733 #if DEBUG || DEVELOPMENT
9734 			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9735 			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9736 #endif
9737 		}
9738 	} else {
9739 		if (copyout_atomic32(ast_flags32, (user_addr_t) ast_addr)) {
9740 #if DEBUG || DEVELOPMENT
9741 			printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9742 			    "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9743 #endif
9744 		}
9745 	}
9746 }
9747 
9748 void
9749 kevent_ast(thread_t thread, uint16_t bits)
9750 {
9751 	proc_t p = current_proc();
9752 
9754 	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9755 		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
9756 	}
9757 	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9758 		kevent_set_return_to_kernel_user_tsd(p, thread);
9759 	}
9760 
9761 	if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
9762 		workq_kern_quantum_expiry_reevaluate(p, thread);
9763 	}
9764 }
9765 
9766 #if DEVELOPMENT || DEBUG
9767 
9768 #define KEVENT_SYSCTL_BOUND_ID 1
9769 
9770 static int
9771 kevent_sysctl SYSCTL_HANDLER_ARGS
9772 {
9773 #pragma unused(oidp, arg2)
9774 	uintptr_t type = (uintptr_t)arg1;
9775 	uint64_t bound_id = 0;
9776 
9777 	if (type != KEVENT_SYSCTL_BOUND_ID) {
9778 		return EINVAL;
9779 	}
9780 
9781 	if (req->newptr) {
9782 		return EINVAL;
9783 	}
9784 
9785 	struct uthread *ut = current_uthread();
9786 	if (!ut) {
9787 		return EFAULT;
9788 	}
9789 
9790 	workq_threadreq_t kqr = ut->uu_kqr_bound;
9791 	if (kqr) {
9792 		if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9793 			bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9794 		} else {
9795 			bound_id = -1;
9796 		}
9797 	}
9798 
9799 	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9800 }
9801 
9802 SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
9803     "kevent information");
9804 
9805 SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9806     CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
9807     (void *)KEVENT_SYSCTL_BOUND_ID,
9808     sizeof(kqueue_id_t), kevent_sysctl, "Q",
9809     "get the ID of the bound kqueue");
9810 
9811 #endif /* DEVELOPMENT || DEBUG */
9812