1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91
92 #include <kern/thread_group.h>
93 #include <kern/locks.h>
94 #include <kern/clock.h>
95 #include <kern/cpu_data.h>
96 #include <kern/policy_internal.h>
97 #include <kern/thread_call.h>
98 #include <kern/sched_prim.h>
99 #include <kern/waitq.h>
100 #include <kern/zalloc.h>
101 #include <kern/kalloc.h>
102 #include <kern/assert.h>
103 #include <kern/ast.h>
104 #include <kern/thread.h>
105 #include <kern/kcdata.h>
106 #include <kern/work_interval.h>
107
108 #include <pthread/priority_private.h>
109 #include <pthread/workqueue_syscalls.h>
110 #include <pthread/workqueue_internal.h>
111 #include <libkern/libkern.h>
112
113 #include <os/log.h>
114
115 #include "mach/kern_return.h"
116 #include "net/net_str_id.h"
117
118 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
119 #include <skywalk/lib/net_filter_event.h>
120
121 extern bool net_check_compatible_alf(void);
122 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
123
124 #include <mach/task.h>
125 #include <libkern/section_keywords.h>
126
127 #if CONFIG_MEMORYSTATUS
128 #include <sys/kern_memorystatus.h>
129 #endif
130
131 #if DEVELOPMENT || DEBUG
132 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK (1U << 0)
133 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS (1U << 1)
134 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
135 #endif
136
137 /* Enable bound thread support for kqworkloop. */
138 static TUNABLE(int, bootarg_thread_bound_kqwl_support_enabled,
139 "enable_thread_bound_kqwl_support", 0);
140 SYSCTL_NODE(_kern, OID_AUTO, kern_event, CTLFLAG_RD | CTLFLAG_LOCKED, 0, NULL);
141 SYSCTL_INT(_kern_kern_event, OID_AUTO, thread_bound_kqwl_support_enabled,
142 CTLFLAG_RD | CTLFLAG_LOCKED,
143 &bootarg_thread_bound_kqwl_support_enabled, 0,
144 "Whether thread bound kqwl support is enabled");
145
146 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
147 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
148 VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
149
150 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
151 extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
152
153 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
154
155 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
156 vfs_context_t ctx);
157 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
158 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
159 struct kevent_qos_s *kev);
160 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
161
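/*
 * fileops vector for kqueue file descriptors: read/write/ioctl are rejected,
 * while select, close, drain and kqfilter are backed by the kqueue_* functions
 * declared above.
 */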
162 static const struct fileops kqueueops = {
163 .fo_type = DTYPE_KQUEUE,
164 .fo_read = fo_no_read,
165 .fo_write = fo_no_write,
166 .fo_ioctl = fo_no_ioctl,
167 .fo_select = kqueue_select,
168 .fo_close = kqueue_close,
169 .fo_drain = kqueue_drain,
170 .fo_kqfilter = kqueue_kqfilter,
171 };
172
173 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
174 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
175 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
176 thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
177 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
178 static void kevent_register_wait_cleanup(struct knote *kn);
179
180 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
181 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
182
183 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
184 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
185 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
186 static void kqueue_update_iotier_override(kqueue_t kqu);
187
188 static void kqworkloop_unbind(struct kqworkloop *kqwl);
189
190 enum kqwl_unbind_locked_mode {
191 KQWL_OVERRIDE_DROP_IMMEDIATELY,
192 KQWL_OVERRIDE_DROP_DELAYED,
193 };
194 // The soft unbinding of kqworkloop only applies to kqwls configured
195 // with a permanently bound thread.
196 #define KQUEUE_THREADREQ_UNBIND_SOFT 0x1
197 static void kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
198 enum kqwl_unbind_locked_mode how, unsigned int flags);
199 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
200 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
201 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
202 static void kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread);
203 static void kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl);
204
205 enum {
206 KQWL_UTQ_NONE,
207 /*
208 * The wakeup qos is the qos of QUEUED knotes.
209 *
210 * This QoS is accounted for with the events override in the
211 * kqr_override_index field. It is raised each time a new knote is queued at
 * a given QoS. The kqwl_wakeup_qos field is a superset of the non-empty
213 * knote buckets and is recomputed after each event delivery.
214 */
215 KQWL_UTQ_UPDATE_WAKEUP_QOS,
216 KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
217 KQWL_UTQ_UNBINDING, /* attempt to rebind */
218 KQWL_UTQ_PARKING,
219 /*
220 * The wakeup override is for suppressed knotes that have fired again at
221 * a higher QoS than the one for which they are suppressed already.
222 * This override is cleared when the knote suppressed list becomes empty.
223 */
224 KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
225 KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
226 /*
227 * The QoS is the maximum QoS of an event enqueued on this workloop in
228 * userland. It is copied from the only EVFILT_WORKLOOP knote with
229 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
230 * such knote, this QoS is 0.
231 */
232 KQWL_UTQ_SET_QOS_INDEX,
233 KQWL_UTQ_REDRIVE_EVENTS,
234 };
235 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
236 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
237
238 static struct knote *knote_alloc(void);
239 static void knote_free(struct knote *kn);
240 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
241 struct knote_lock_ctx *knlc, struct proc *p);
242 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
243 struct kevent_qos_s *kev, bool is_fd, struct proc *p);
244
245 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
246 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
247
248 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
249 struct kevent_qos_s *kev, int result);
250 static void knote_suppress(kqueue_t kqu, struct knote *kn);
251 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
252 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
253
254 // both these functions may dequeue the knote and it is up to the caller
255 // to enqueue the knote back
256 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
257 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
258
259 static ZONE_DEFINE(knote_zone, "knote zone",
260 sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
261 static ZONE_DEFINE(kqfile_zone, "kqueue file zone",
262 sizeof(struct kqfile), ZC_ZFREE_CLEARMEM | ZC_NO_TBI_TAG);
263 static ZONE_DEFINE(kqworkq_zone, "kqueue workq zone",
264 sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM | ZC_NO_TBI_TAG);
265 static ZONE_DEFINE(kqworkloop_zone, "kqueue workloop zone",
266 sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM | ZC_NO_TBI_TAG);
267
268 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
269
270 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
271 static void filt_no_detach(struct knote *kn);
272 static int filt_bad_event(struct knote *kn, long hint);
273 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
274 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
275
276 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
277 .f_attach = filt_no_attach,
278 .f_detach = filt_no_detach,
279 .f_event = filt_bad_event,
280 .f_touch = filt_bad_touch,
281 .f_process = filt_bad_process,
282 };
283
284 #if CONFIG_MEMORYSTATUS
285 extern const struct filterops memorystatus_filtops;
286 #endif /* CONFIG_MEMORYSTATUS */
287 extern const struct filterops fs_filtops;
288 extern const struct filterops sig_filtops;
289 extern const struct filterops machport_attach_filtops;
290 extern const struct filterops mach_port_filtops;
291 extern const struct filterops mach_port_set_filtops;
292 extern const struct filterops pipe_nfiltops;
293 extern const struct filterops pipe_rfiltops;
294 extern const struct filterops pipe_wfiltops;
295 extern const struct filterops ptsd_kqops;
296 extern const struct filterops ptmx_kqops;
297 extern const struct filterops soread_filtops;
298 extern const struct filterops sowrite_filtops;
299 extern const struct filterops sock_filtops;
300 extern const struct filterops soexcept_filtops;
301 extern const struct filterops spec_filtops;
302 extern const struct filterops bpfread_filtops;
303 extern const struct filterops necp_fd_rfiltops;
304 #if SKYWALK
305 extern const struct filterops skywalk_channel_rfiltops;
306 extern const struct filterops skywalk_channel_wfiltops;
307 extern const struct filterops skywalk_channel_efiltops;
308 #endif /* SKYWALK */
309 extern const struct filterops fsevent_filtops;
310 extern const struct filterops vnode_filtops;
311 extern const struct filterops tty_filtops;
312
313 const static struct filterops file_filtops;
314 const static struct filterops kqread_filtops;
315 const static struct filterops proc_filtops;
316 const static struct filterops timer_filtops;
317 const static struct filterops user_filtops;
318 const static struct filterops workloop_filtops;
319 #if CONFIG_EXCLAVES
320 extern const struct filterops exclaves_notification_filtops;
321 #endif /* CONFIG_EXCLAVES */
322
323 /*
324 *
325 * Rules for adding new filters to the system:
326 * Public filters:
327 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
328 * in the exported section of the header
329 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
330 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
331 * of the Public Filters section in the array.
332 * Private filters:
333 * - Add a new "EVFILT_" value to bsd/sys/event_private.h (typically a positive value)
334 * - Update the EVFILTID_MAX value to reflect the new addition
335 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
336 * the Private filters section of the array.
337 */
338 static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
339 static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
340 /* Public Filters */
341 [~EVFILT_READ] = &file_filtops,
342 [~EVFILT_WRITE] = &file_filtops,
343 [~EVFILT_AIO] = &bad_filtops,
344 [~EVFILT_VNODE] = &file_filtops,
345 [~EVFILT_PROC] = &proc_filtops,
346 [~EVFILT_SIGNAL] = &sig_filtops,
347 [~EVFILT_TIMER] = &timer_filtops,
348 [~EVFILT_MACHPORT] = &machport_attach_filtops,
349 [~EVFILT_FS] = &fs_filtops,
350 [~EVFILT_USER] = &user_filtops,
351 [~EVFILT_UNUSED_11] = &bad_filtops,
352 [~EVFILT_VM] = &bad_filtops,
353 [~EVFILT_SOCK] = &file_filtops,
354 #if CONFIG_MEMORYSTATUS
355 [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
356 #else
357 [~EVFILT_MEMORYSTATUS] = &bad_filtops,
358 #endif
359 [~EVFILT_EXCEPT] = &file_filtops,
360 #if SKYWALK
361 [~EVFILT_NW_CHANNEL] = &file_filtops,
362 #else /* !SKYWALK */
363 [~EVFILT_NW_CHANNEL] = &bad_filtops,
364 #endif /* !SKYWALK */
365 [~EVFILT_WORKLOOP] = &workloop_filtops,
366 #if CONFIG_EXCLAVES
367 [~EVFILT_EXCLAVES_NOTIFICATION] = &exclaves_notification_filtops,
368 #else /* !CONFIG_EXCLAVES */
369 [~EVFILT_EXCLAVES_NOTIFICATION] = &bad_filtops,
370 #endif /* CONFIG_EXCLAVES*/
371
372 /* Private filters */
373 [EVFILTID_KQREAD] = &kqread_filtops,
374 [EVFILTID_PIPE_N] = &pipe_nfiltops,
375 [EVFILTID_PIPE_R] = &pipe_rfiltops,
376 [EVFILTID_PIPE_W] = &pipe_wfiltops,
377 [EVFILTID_PTSD] = &ptsd_kqops,
378 [EVFILTID_SOREAD] = &soread_filtops,
379 [EVFILTID_SOWRITE] = &sowrite_filtops,
380 [EVFILTID_SCK] = &sock_filtops,
381 [EVFILTID_SOEXCEPT] = &soexcept_filtops,
382 [EVFILTID_SPEC] = &spec_filtops,
383 [EVFILTID_BPFREAD] = &bpfread_filtops,
384 [EVFILTID_NECP_FD] = &necp_fd_rfiltops,
385 #if SKYWALK
386 [EVFILTID_SKYWALK_CHANNEL_W] = &skywalk_channel_wfiltops,
387 [EVFILTID_SKYWALK_CHANNEL_R] = &skywalk_channel_rfiltops,
388 [EVFILTID_SKYWALK_CHANNEL_E] = &skywalk_channel_efiltops,
389 #else /* !SKYWALK */
390 [EVFILTID_SKYWALK_CHANNEL_W] = &bad_filtops,
391 [EVFILTID_SKYWALK_CHANNEL_R] = &bad_filtops,
392 [EVFILTID_SKYWALK_CHANNEL_E] = &bad_filtops,
393 #endif /* !SKYWALK */
394 [EVFILTID_FSEVENT] = &fsevent_filtops,
395 [EVFILTID_VN] = &vnode_filtops,
396 [EVFILTID_TTY] = &tty_filtops,
397 [EVFILTID_PTMX] = &ptmx_kqops,
398 [EVFILTID_MACH_PORT] = &mach_port_filtops,
399 [EVFILTID_MACH_PORT_SET] = &mach_port_set_filtops,
400
401 /* fake filter for detached knotes, keep last */
402 [EVFILTID_DETACHED] = &bad_filtops,
403 };
404
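/*
 * Accessors for the workq thread request (kqr) embedded in a kqworkq or
 * kqworkloop: they report whether a servicer thread has been requested or
 * bound, and map a request back to its owning kqueue.
 */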
405 static inline bool
kqr_thread_bound(workq_threadreq_t kqr)
407 {
408 return kqr->tr_state == WORKQ_TR_STATE_BOUND;
409 }
410
411 static inline bool
kqr_thread_permanently_bound(workq_threadreq_t kqr)
413 {
414 return kqr_thread_bound(kqr) && (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
415 }
416
417 static inline bool
kqr_thread_requested_pending(workq_threadreq_t kqr)
419 {
420 workq_tr_state_t tr_state = kqr->tr_state;
421 return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
422 }
423
424 static inline bool
kqr_thread_requested(workq_threadreq_t kqr)
426 {
427 return kqr->tr_state != WORKQ_TR_STATE_IDLE;
428 }
429
430 static inline thread_t
kqr_thread_fast(workq_threadreq_t kqr)
432 {
433 assert(kqr_thread_bound(kqr));
434 return kqr->tr_thread;
435 }
436
437 static inline thread_t
kqr_thread(workq_threadreq_t kqr)
439 {
440 return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
441 }
442
443 static inline struct kqworkloop *
kqr_kqworkloop(workq_threadreq_t kqr)
445 {
446 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
447 return __container_of(kqr, struct kqworkloop, kqwl_request);
448 }
449 return NULL;
450 }
451
452 static inline kqueue_t
kqr_kqueue(proc_t p, workq_threadreq_t kqr)
454 {
455 kqueue_t kqu;
456 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
457 kqu.kqwl = kqr_kqworkloop(kqr);
458 } else {
459 kqu.kqwq = p->p_fd.fd_wqkqueue;
460 assert(kqr >= kqu.kqwq->kqwq_request &&
461 kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
462 }
463 return kqu;
464 }
465
466 #if CONFIG_PREADOPT_TG
467 /* There are no guarantees about which locks are held when this is called */
468 inline thread_group_qos_t
kqr_preadopt_thread_group(workq_threadreq_t req)
470 {
471 struct kqworkloop *kqwl = kqr_kqworkloop(req);
472 return kqwl ? os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed) : NULL;
473 }
474
475 /* There are no guarantees about which locks are held when this is called */
inline _Atomic(thread_group_qos_t) *
477 kqr_preadopt_thread_group_addr(workq_threadreq_t req)
478 {
479 struct kqworkloop *kqwl = kqr_kqworkloop(req);
480 return kqwl ? (&kqwl->kqwl_preadopt_tg) : NULL;
481 }
482 #endif
483
484 /*
485 * kqueue/note lock implementations
486 *
487 * The kqueue lock guards the kq state, the state of its queues,
488 * and the kqueue-aware status and locks of individual knotes.
489 *
490 * The kqueue workq lock is used to protect state guarding the
491 * interaction of the kqueue with the workq. This state cannot
492 * be guarded by the kq lock - as it needs to be taken when we
493 * already have the waitq set lock held (during the waitq hook
494 * callback). It might be better to use the waitq lock itself
 * for this, but the IRQ requirements make that difficult.
496 *
497 * Knote flags, filter flags, and associated data are protected
498 * by the underlying object lock - and are only ever looked at
499 * by calling the filter to get a [consistent] snapshot of that
500 * data.
501 */
502
503 static inline void
kqlock(kqueue_t kqu)
505 {
506 lck_spin_lock(&kqu.kq->kq_lock);
507 }
508
509 static inline void
kqlock_held(__assert_only kqueue_t kqu)
511 {
512 LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
513 }
514
515 static inline void
kqunlock(kqueue_t kqu)
517 {
518 lck_spin_unlock(&kqu.kq->kq_lock);
519 }
520
521 static inline void
knhash_lock(struct filedesc *fdp)
523 {
524 lck_mtx_lock(&fdp->fd_knhashlock);
525 }
526
527 static inline void
knhash_unlock(struct filedesc *fdp)
529 {
530 lck_mtx_unlock(&fdp->fd_knhashlock);
531 }
532
533 /* wait event for knote locks */
534 static inline event_t
knote_lock_wev(struct knote *kn)
536 {
537 return (event_t)(&kn->kn_hook);
538 }
539
540 /* wait event for kevent_register_wait_* */
541 static inline event64_t
knote_filt_wev64(struct knote *kn)
543 {
544 /* kdp_workloop_sync_wait_find_owner knows about this */
545 return CAST_EVENT64_T(kn);
546 }
547
548 /* wait event for knote_post/knote_drop */
549 static inline event_t
knote_post_wev(struct knote *kn)
551 {
552 return &kn->kn_kevent;
553 }
554
555 /*!
556 * @function knote_has_qos
557 *
558 * @brief
559 * Whether the knote has a regular QoS.
560 *
561 * @discussion
562 * kn_qos_override is:
563 * - 0 on kqfiles
564 * - THREAD_QOS_LAST for special buckets (manager)
565 *
 * Other values mean the knote participates in QoS propagation.
567 */
568 static inline bool
knote_has_qos(struct knote *kn)
570 {
571 return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
572 }
573
574 #pragma mark knote locks
575
576 /*
577 * Enum used by the knote_lock_* functions.
578 *
579 * KNOTE_KQ_LOCK_ALWAYS
580 * The function will always return with the kq lock held.
581 *
582 * KNOTE_KQ_LOCK_ON_SUCCESS
583 * The function will return with the kq lock held if it was successful
584 * (knote_lock() is the only function that can fail).
585 *
586 * KNOTE_KQ_LOCK_ON_FAILURE
587 * The function will return with the kq lock held if it was unsuccessful
588 * (knote_lock() is the only function that can fail).
589 *
590 * KNOTE_KQ_UNLOCK:
591 * The function returns with the kq unlocked.
592 */
593 enum kqlocking {
594 KNOTE_KQ_LOCK_ALWAYS,
595 KNOTE_KQ_LOCK_ON_SUCCESS,
596 KNOTE_KQ_LOCK_ON_FAILURE,
597 KNOTE_KQ_UNLOCK,
598 };
599
600 static struct knote_lock_ctx *
knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
602 {
603 struct knote_lock_ctx *ctx;
604 LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
605 if (ctx->knlc_knote == kn) {
606 return ctx;
607 }
608 }
609 panic("knote lock context not found: %p", kn);
610 __builtin_trap();
611 }
612
613 /* slowpath of knote_lock() */
614 __attribute__((noinline))
615 static bool __result_use_check
knote_lock_slow(kqueue_t kqu, struct knote *kn,
617 struct knote_lock_ctx *knlc, int kqlocking)
618 {
619 struct knote_lock_ctx *owner_lc;
620 struct uthread *uth = current_uthread();
621 wait_result_t wr;
622
623 kqlock_held(kqu);
624
625 owner_lc = knote_lock_ctx_find(kqu, kn);
626 #if MACH_ASSERT
627 knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
628 #endif
629 owner_lc->knlc_waiters++;
630
631 /*
632 * Make our lock context visible to knote_unlock()
633 */
634 uth->uu_knlock = knlc;
635
636 wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
637 knote_lock_wev(kn), owner_lc->knlc_thread,
638 THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
639
640 if (wr == THREAD_RESTART) {
641 /*
 * We haven't been woken up by knote_unlock() but by knote_unlock_cancel().
643 * We need to cleanup the state since no one did.
644 */
645 uth->uu_knlock = NULL;
646 #if MACH_ASSERT
647 assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
648 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
649 #endif
650
651 if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
652 kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
653 kqlock(kqu);
654 }
655 return false;
656 } else {
657 if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
658 kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
659 kqlock(kqu);
660 /*
661 * This state is set under the lock so we can't
662 * really assert this unless we hold the lock.
663 */
664 assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
665 }
666 return true;
667 }
668 }
669
670 /*
671 * Attempts to take the "knote" lock.
672 *
673 * Called with the kqueue lock held.
674 *
675 * Returns true if the knote lock is acquired, false if it has been dropped
676 */
677 static bool __result_use_check
knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
679 enum kqlocking kqlocking)
680 {
681 kqlock_held(kqu);
682
683 #if MACH_ASSERT
684 assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
685 #endif
686 knlc->knlc_knote = kn;
687 knlc->knlc_thread = current_thread();
688 knlc->knlc_waiters = 0;
689
690 if (__improbable(kn->kn_status & KN_LOCKED)) {
691 return knote_lock_slow(kqu, kn, knlc, kqlocking);
692 }
693
694 /*
695 * When the knote will be dropped, the knote lock is taken before
696 * KN_DROPPING is set, and then the knote will be removed from any
697 * hash table that references it before the lock is canceled.
698 */
699 assert((kn->kn_status & KN_DROPPING) == 0);
700 LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
701 kn->kn_status |= KN_LOCKED;
702 #if MACH_ASSERT
703 knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
704 #endif
705
706 if (kqlocking == KNOTE_KQ_UNLOCK ||
707 kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
708 kqunlock(kqu);
709 }
710 return true;
711 }
712
713 /*
714 * Unlocks a knote successfully locked with knote_lock().
715 *
716 * Called with the kqueue lock held.
717 *
718 * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
719 */
720 static void
knote_unlock(kqueue_t kqu, struct knote *kn,
722 struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
723 {
724 kqlock_held(kqu);
725
726 assert(knlc->knlc_knote == kn);
727 assert(kn->kn_status & KN_LOCKED);
728 assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
729
730 LIST_REMOVE(knlc, knlc_link);
731
732 if (knlc->knlc_waiters) {
733 thread_t thread = THREAD_NULL;
734
735 wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
736 LCK_WAKE_DEFAULT, &thread);
737
738 /*
739 * knote_lock_slow() publishes the lock context of waiters
740 * in uthread::uu_knlock.
741 *
742 * Reach out and make this context the new owner.
743 */
744 struct uthread *ut = get_bsdthread_info(thread);
745 struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;
746
747 assert(next_owner_lc->knlc_knote == kn);
748 next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
749 LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
750 #if MACH_ASSERT
751 next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
752 #endif
753 ut->uu_knlock = NULL;
754 thread_deallocate_safe(thread);
755 } else {
756 kn->kn_status &= ~KN_LOCKED;
757 }
758
759 if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
760 /*
761 * No f_event() in flight anymore, we can leave QoS "Merge" mode
762 *
763 * See knote_adjust_qos()
764 */
765 kn->kn_status &= ~KN_MERGE_QOS;
766 }
767 if (kqlocking == KNOTE_KQ_UNLOCK) {
768 kqunlock(kqu);
769 }
770 #if MACH_ASSERT
771 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
772 #endif
773 }
774
775 /*
776 * Aborts all waiters for a knote lock, and unlock the knote.
777 *
778 * Called with the kqueue lock held.
779 *
780 * Returns with the kqueue unlocked.
781 */
782 static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
784 struct knote_lock_ctx *knlc)
785 {
786 kqlock_held(kq);
787
788 assert(knlc->knlc_knote == kn);
789 assert(kn->kn_status & KN_LOCKED);
790 assert(kn->kn_status & KN_DROPPING);
791
792 LIST_REMOVE(knlc, knlc_link);
793 kn->kn_status &= ~KN_LOCKED;
794 kqunlock(kq);
795
796 if (knlc->knlc_waiters) {
797 wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
798 }
799 #if MACH_ASSERT
800 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
801 #endif
802 }
803
804 /*
805 * Call the f_event hook of a given filter.
806 *
807 * Takes a use count to protect against concurrent drops.
808 * Called with the object lock held.
809 */
810 static void
knote_post(struct knote *kn, long hint)
812 {
813 struct kqueue *kq = knote_get_kq(kn);
814 int dropping, result;
815
816 kqlock(kq);
817
818 if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
819 return kqunlock(kq);
820 }
821
822 if (__improbable(kn->kn_status & KN_POSTING)) {
823 panic("KNOTE() called concurrently on knote %p", kn);
824 }
825
826 kn->kn_status |= KN_POSTING;
827
828 kqunlock(kq);
829 result = filter_call(knote_fops(kn), f_event(kn, hint));
830 kqlock(kq);
831
832 /* Someone dropped the knote/the monitored object vanished while we
833 * were in f_event, swallow the side effects of the post.
834 */
835 dropping = (kn->kn_status & (KN_DROPPING | KN_VANISHED));
836
837 if (!dropping && (result & FILTER_ADJUST_EVENT_IOTIER_BIT)) {
838 kqueue_update_iotier_override(kq);
839 }
840
841 if (!dropping && (result & FILTER_ACTIVE)) {
842 knote_activate(kq, kn, result);
843 }
844
845 if ((kn->kn_status & KN_LOCKED) == 0) {
846 /*
847 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
848 *
849 * See knote_adjust_qos()
850 */
851 kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
852 } else {
853 kn->kn_status &= ~KN_POSTING;
854 }
855
856 if (__improbable(dropping)) {
857 thread_wakeup(knote_post_wev(kn));
858 }
859
860 kqunlock(kq);
861 }
862
863 /*
864 * Called by knote_drop() and knote_fdclose() to wait for the last f_event()
865 * caller to be done.
866 *
867 * - kq locked at entry
868 * - kq unlocked at exit
869 */
870 static void
knote_wait_for_post(struct kqueue *kq, struct knote *kn)
872 {
873 kqlock_held(kq);
874
875 assert(kn->kn_status & (KN_DROPPING | KN_VANISHED));
876
877 if (kn->kn_status & KN_POSTING) {
878 lck_spin_sleep(&kq->kq_lock, LCK_SLEEP_UNLOCK, knote_post_wev(kn),
879 THREAD_UNINT | THREAD_WAIT_NOREPORT);
880 } else {
881 kqunlock(kq);
882 }
883 }
884
885 #pragma mark knote helpers for filters
886
887 OS_ALWAYS_INLINE
888 void *
knote_kn_hook_get_raw(struct knote *kn)
890 {
891 uintptr_t *addr = &kn->kn_hook;
892
893 void *hook = (void *) *addr;
894 #if __has_feature(ptrauth_calls)
895 if (hook) {
896 uint16_t blend = kn->kn_filter;
897 blend |= (kn->kn_filtid << 8);
898 blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
899
900 hook = ptrauth_auth_data(hook, ptrauth_key_process_independent_data,
901 ptrauth_blend_discriminator(addr, blend));
902 }
903 #endif
904
905 return hook;
906 }
907
908 OS_ALWAYS_INLINE void
knote_kn_hook_set_raw(struct knote *kn, void *kn_hook)
910 {
911 uintptr_t *addr = &kn->kn_hook;
912 #if __has_feature(ptrauth_calls)
913 if (kn_hook) {
914 uint16_t blend = kn->kn_filter;
915 blend |= (kn->kn_filtid << 8);
916 blend ^= OS_PTRAUTH_DISCRIMINATOR("kn.kn_hook");
917
918 kn_hook = ptrauth_sign_unauthenticated(kn_hook,
919 ptrauth_key_process_independent_data,
920 ptrauth_blend_discriminator(addr, blend));
921 }
922 #endif
923 *addr = (uintptr_t) kn_hook;
924 }
925
926 OS_ALWAYS_INLINE
927 void
knote_set_error(struct knote *kn, int error)
929 {
930 kn->kn_flags |= EV_ERROR;
931 kn->kn_sdata = error;
932 }
933
934 OS_ALWAYS_INLINE
935 int64_t
knote_low_watermark(const struct knote *kn)
937 {
938 return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
939 }
940
941 /*!
942 * @function knote_fill_kevent_with_sdata
943 *
944 * @brief
945 * Fills in a kevent from the current content of a knote.
946 *
947 * @discussion
948 * This is meant to be called from filter's f_process hooks.
949 * The kevent data is filled with kn->kn_sdata.
950 *
951 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
952 *
953 * Using knote_fill_kevent is typically preferred.
954 */
955 OS_ALWAYS_INLINE
956 void
knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
958 {
959 #define knote_assert_aliases(name1, offs1, name2) \
960 static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
961 offsetof(struct kevent_internal_s, name2), \
962 "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
963 /*
964 * All the code makes assumptions on these aliasing,
965 * so make sure we fail the build if we ever ever ever break them.
966 */
967 knote_assert_aliases(ident, 0, kei_ident);
968 #ifdef __LITTLE_ENDIAN__
969 knote_assert_aliases(filter, 0, kei_filter); // non trivial overlap
970 knote_assert_aliases(filter, 1, kei_filtid); // non trivial overlap
971 #else
972 knote_assert_aliases(filter, 0, kei_filtid); // non trivial overlap
973 knote_assert_aliases(filter, 1, kei_filter); // non trivial overlap
974 #endif
975 knote_assert_aliases(flags, 0, kei_flags);
976 knote_assert_aliases(qos, 0, kei_qos);
977 knote_assert_aliases(udata, 0, kei_udata);
978 knote_assert_aliases(fflags, 0, kei_fflags);
979 knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
980 knote_assert_aliases(data, 0, kei_sdata); // non trivial overlap
981 knote_assert_aliases(ext, 0, kei_ext);
982 #undef knote_assert_aliases
983
984 /*
985 * Fix the differences between kevent_qos_s and kevent_internal_s:
986 * - xflags is where kn_sfflags lives, we need to zero it
987 * - fixup the high bits of `filter` where kn_filtid lives
988 */
989 *kev = *(struct kevent_qos_s *)&kn->kn_kevent;
990 kev->xflags = 0;
991 kev->filter |= 0xff00;
992 if (kn->kn_flags & EV_CLEAR) {
993 kn->kn_fflags = 0;
994 }
995 }
996
997 /*!
998 * @function knote_fill_kevent
999 *
1000 * @brief
1001 * Fills in a kevent from the current content of a knote.
1002 *
1003 * @discussion
1004 * This is meant to be called from filter's f_process hooks.
1005 * The kevent data is filled with the passed in data.
1006 *
1007 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
1008 */
1009 OS_ALWAYS_INLINE
1010 void
knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
1012 {
1013 knote_fill_kevent_with_sdata(kn, kev);
1014 kev->filter = kn->kn_filter;
1015 kev->data = data;
1016 }
1017
1018
1019 #pragma mark file_filtops
1020
1021 static int
filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
1023 {
1024 return fo_kqfilter(kn->kn_fp, kn, kev);
1025 }
1026
1027 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
1028 .f_isfd = 1,
1029 .f_attach = filt_fileattach,
1030 };
1031
1032 #pragma mark kqread_filtops
1033
1034 #define f_flag fp_glob->fg_flag
1035 #define f_ops fp_glob->fg_ops
1036 #define f_lflags fp_glob->fg_lflags
1037
1038 static void
filt_kqdetach(struct knote *kn)
1040 {
1041 struct kqfile *kqf = (struct kqfile *)fp_get_data(kn->kn_fp);
1042 struct kqueue *kq = &kqf->kqf_kqueue;
1043
1044 kqlock(kq);
1045 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
1046 kqunlock(kq);
1047 }
1048
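/* f_event hook: the watched kqueue is considered readable while it has queued events. */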
1049 static int
filt_kqueue(struct knote *kn, __unused long hint)
1051 {
1052 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1053
1054 return kq->kq_count > 0;
1055 }
1056
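/* f_touch hook: re-check, under the kq lock, whether the watched kqueue has queued events. */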
1057 static int
filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
1059 {
1060 #pragma unused(kev)
1061 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1062 int res;
1063
1064 kqlock(kq);
1065 res = (kq->kq_count > 0);
1066 kqunlock(kq);
1067
1068 return res;
1069 }
1070
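/* f_process hook: deliver the event, reporting the number of queued events as the data. */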
1071 static int
filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
1073 {
1074 struct kqueue *kq = (struct kqueue *)fp_get_data(kn->kn_fp);
1075 int res = 0;
1076
1077 kqlock(kq);
1078 if (kq->kq_count) {
1079 knote_fill_kevent(kn, kev, kq->kq_count);
1080 res = 1;
1081 }
1082 kqunlock(kq);
1083
1084 return res;
1085 }
1086
1087 SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
1088 .f_isfd = 1,
1089 .f_detach = filt_kqdetach,
1090 .f_event = filt_kqueue,
1091 .f_touch = filt_kqtouch,
1092 .f_process = filt_kqprocess,
1093 };
1094
1095 #pragma mark proc_filtops
1096
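/*
 * Attach an EVFILT_PROC knote to the target process' klist.
 * NOTE_EXITSTATUS is restricted to the parent (or the tracing parent, or a
 * process allowed to send SIGKILL to the target).
 */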
1097 static int
filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1099 {
1100 struct proc *p;
1101
1102 assert(PID_MAX < NOTE_PDATAMASK);
1103
1104 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
1105 knote_set_error(kn, ENOTSUP);
1106 return 0;
1107 }
1108
1109 p = proc_find((int)kn->kn_id);
1110 if (p == NULL) {
1111 knote_set_error(kn, ESRCH);
1112 return 0;
1113 }
1114
1115 const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
1116
1117 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
1118 do {
1119 pid_t selfpid = proc_selfpid();
1120
1121 if (p->p_ppid == selfpid) {
1122 break; /* parent => ok */
1123 }
1124 if ((p->p_lflag & P_LTRACED) != 0 &&
1125 (p->p_oppid == selfpid)) {
1126 break; /* parent-in-waiting => ok */
1127 }
1128 if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
1129 break; /* allowed to signal => ok */
1130 }
1131 proc_rele(p);
1132 knote_set_error(kn, EACCES);
1133 return 0;
1134 } while (0);
1135 }
1136
1137 kn->kn_proc = p;
1138 kn->kn_flags |= EV_CLEAR; /* automatically set */
1139 kn->kn_sdata = 0; /* incoming data is ignored */
1140
1141 proc_klist_lock();
1142
1143 KNOTE_ATTACH(&p->p_klist, kn);
1144
1145 proc_klist_unlock();
1146
1147 proc_rele(p);
1148
1149 /*
1150 * only captures edge-triggered events after this point
1151 * so it can't already be fired.
1152 */
1153 return 0;
1154 }
1155
1156
1157 /*
1158 * The knote may be attached to a different process, which may exit,
1159 * leaving nothing for the knote to be attached to. In that case,
1160 * the pointer to the process will have already been nulled out.
1161 */
1162 static void
filt_procdetach(struct knote *kn)
1164 {
1165 struct proc *p;
1166
1167 proc_klist_lock();
1168
1169 p = kn->kn_proc;
1170 if (p != PROC_NULL) {
1171 kn->kn_proc = PROC_NULL;
1172 KNOTE_DETACH(&p->p_klist, kn);
1173 }
1174
1175 proc_klist_unlock();
1176 }
1177
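/*
 * f_event hook for EVFILT_PROC: translate the NOTE_* hint into fflags,
 * stashing exit status / exit detail bits in kn_hook32 for later delivery.
 */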
1178 static int
filt_procevent(struct knote *kn, long hint)
1180 {
1181 u_int event;
1182
1183 /* ALWAYS CALLED WITH proc_klist_lock */
1184
1185 /*
1186 * Note: a lot of bits in hint may be obtained from the knote
1187 * To free some of those bits, see <rdar://problem/12592988> Freeing up
1188 * bits in hint for filt_procevent
1189 *
1190 * mask off extra data
1191 */
1192 event = (u_int)hint & NOTE_PCTRLMASK;
1193
1194 /*
1195 * termination lifecycle events can happen while a debugger
1196 * has reparented a process, in which case notifications
1197 * should be quashed except to the tracing parent. When
1198 * the debugger reaps the child (either via wait4(2) or
1199 * process exit), the child will be reparented to the original
1200 * parent and these knotes re-fired.
1201 */
1202 if (event & NOTE_EXIT) {
1203 if ((kn->kn_proc->p_oppid != 0)
1204 && (proc_getpid(knote_get_kq(kn)->kq_p) != kn->kn_proc->p_ppid)) {
1205 /*
1206 * This knote is not for the current ptrace(2) parent, ignore.
1207 */
1208 return 0;
1209 }
1210 }
1211
1212 /*
1213 * if the user is interested in this event, record it.
1214 */
1215 if (kn->kn_sfflags & event) {
1216 kn->kn_fflags |= event;
1217 }
1218
1219 #pragma clang diagnostic push
1220 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
1221 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1222 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1223 }
1224 #pragma clang diagnostic pop
1225
1226
1227 /*
1228 * The kernel has a wrapper in place that returns the same data
1229 * as is collected here, in kn_hook32. Any changes to how
1230 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1231 * should also be reflected in the proc_pidnoteexit() wrapper.
1232 */
1233 if (event == NOTE_EXIT) {
1234 kn->kn_hook32 = 0;
1235 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
1236 kn->kn_fflags |= NOTE_EXITSTATUS;
1237 kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
1238 }
1239 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
1240 kn->kn_fflags |= NOTE_EXIT_DETAIL;
1241 if ((kn->kn_proc->p_lflag &
1242 P_LTERM_DECRYPTFAIL) != 0) {
1243 kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
1244 }
1245 if ((kn->kn_proc->p_lflag &
1246 P_LTERM_JETSAM) != 0) {
1247 kn->kn_hook32 |= NOTE_EXIT_MEMORY;
1248 switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
1249 case P_JETSAM_VMPAGESHORTAGE:
1250 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1251 break;
1252 case P_JETSAM_VMTHRASHING:
1253 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
1254 break;
1255 case P_JETSAM_FCTHRASHING:
1256 kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
1257 break;
1258 case P_JETSAM_VNODE:
1259 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
1260 break;
1261 case P_JETSAM_HIWAT:
1262 kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
1263 break;
1264 case P_JETSAM_PID:
1265 kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
1266 break;
1267 case P_JETSAM_IDLEEXIT:
1268 kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
1269 break;
1270 }
1271 }
1272 if ((proc_getcsflags(kn->kn_proc) &
1273 CS_KILLED) != 0) {
1274 kn->kn_hook32 |= NOTE_EXIT_CSERROR;
1275 }
1276 }
1277 }
1278
1279 /* if we have any matching state, activate the knote */
1280 return kn->kn_fflags != 0;
1281 }
1282
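/* f_touch hook: accept updated interest flags from userspace and report whether events are already pending. */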
1283 static int
filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
1285 {
1286 int res;
1287
1288 proc_klist_lock();
1289
/* accept new filter flags and mask off output events no longer interesting */
1291 kn->kn_sfflags = kev->fflags;
1292
1293 /* restrict the current results to the (smaller?) set of new interest */
1294 /*
1295 * For compatibility with previous implementations, we leave kn_fflags
1296 * as they were before.
1297 */
1298 //kn->kn_fflags &= kn->kn_sfflags;
1299
1300 res = (kn->kn_fflags != 0);
1301
1302 proc_klist_unlock();
1303
1304 return res;
1305 }
1306
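/* f_process hook: deliver the accumulated fflags, passing the detail bits saved in kn_hook32 as the event data, then reset them. */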
1307 static int
filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1309 {
1310 int res = 0;
1311
1312 proc_klist_lock();
1313 if (kn->kn_fflags) {
1314 knote_fill_kevent(kn, kev, kn->kn_hook32);
1315 kn->kn_hook32 = 0;
1316 res = 1;
1317 }
1318 proc_klist_unlock();
1319 return res;
1320 }
1321
1322 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
1323 .f_attach = filt_procattach,
1324 .f_detach = filt_procdetach,
1325 .f_event = filt_procevent,
1326 .f_touch = filt_proctouch,
1327 .f_process = filt_procprocess,
1328 };
1329
1330 #pragma mark timer_filtops
1331
1332 struct filt_timer_params {
1333 uint64_t deadline; /* deadline in abs/cont time
1334 * (or 0 if NOTE_ABSOLUTE and deadline is in past) */
1335 uint64_t leeway; /* leeway in abstime, or 0 if none */
1336 uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
1337 };
1338
1339 /*
1340 * Values stored in the knote at rest (using Mach absolute time units)
1341 *
1342 * kn->kn_thcall where the thread_call object is stored
1343 * kn->kn_ext[0] next deadline or 0 if immediate expiration
1344 * kn->kn_ext[1] leeway value
1345 * kn->kn_sdata interval timer: the interval
1346 * absolute/deadline timer: 0
1347 * kn->kn_hook32 timer state (with gencount)
1348 *
1349 * TIMER_IDLE:
1350 * The timer has either never been scheduled or been cancelled.
1351 * It is safe to schedule a new one in this state.
1352 *
1353 * TIMER_ARMED:
1354 * The timer has been scheduled
1355 *
1356 * TIMER_FIRED
1357 * The timer has fired and an event needs to be delivered.
1358 * When in this state, the callout may still be running.
1359 *
1360 * TIMER_IMMEDIATE
1361 * The timer has fired at registration time, and the callout was never
1362 * dispatched.
1363 */
1364 #define TIMER_IDLE 0x0
1365 #define TIMER_ARMED 0x1
1366 #define TIMER_FIRED 0x2
1367 #define TIMER_IMMEDIATE 0x3
1368 #define TIMER_STATE_MASK 0x3
1369 #define TIMER_GEN_INC 0x4
1370
1371 static void
filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1373 {
1374 kn->kn_ext[0] = params->deadline;
1375 kn->kn_ext[1] = params->leeway;
1376 kn->kn_sdata = params->interval;
1377 }
1378
1379 /*
1380 * filt_timervalidate - process data from user
1381 *
1382 * Sets up the deadline, interval, and leeway from the provided user data
1383 *
1384 * Input:
1385 * kn_sdata timer deadline or interval time
1386 * kn_sfflags style of timer, unit of measurement
1387 *
1388 * Output:
1389 * struct filter_timer_params to apply to the filter with
 * filt_timer_set_params when changes are ready to be committed.
1391 *
1392 * Returns:
1393 * EINVAL Invalid user data parameters
1394 * ERANGE Various overflows with the parameters
1395 *
1396 * Called with timer filter lock held.
1397 */
1398 static int
filt_timervalidate(const struct kevent_qos_s *kev,
1400 struct filt_timer_params *params)
1401 {
1402 /*
1403 * There are 5 knobs that need to be chosen for a timer registration:
1404 *
1405 * A) Units of time (what is the time duration of the specified number)
1406 * Absolute and interval take:
1407 * NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1408 * Defaults to milliseconds if not specified
1409 *
1410 * B) Clock epoch (what is the zero point of the specified number)
1411 * For interval, there is none
1412 * For absolute, defaults to the gettimeofday/calendar epoch
1413 * With NOTE_MACHTIME, uses mach_absolute_time()
1414 * With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1415 *
1416 * C) The knote's behavior on delivery
1417 * Interval timer causes the knote to arm for the next interval unless one-shot is set
1418 * Absolute is a forced one-shot timer which deletes on delivery
1419 * TODO: Add a way for absolute to be not forced one-shot
1420 *
1421 * D) Whether the time duration is relative to now or absolute
1422 * Interval fires at now + duration when it is set up
1423 * Absolute fires at now + difference between now walltime and passed in walltime
1424 * With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1425 *
1426 * E) Whether the timer continues to tick across sleep
1427 * By default all three do not.
1428 * For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1429 * With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
1430 * expires when mach_continuous_time() is > the passed in value.
1431 */
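	/*
	 * Illustrative userspace registration (not taken from this file): a
	 * repeating one-second timer in the default millisecond unit would be
	 * requested with EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 1000, NULL).
	 */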
1432
1433 uint64_t multiplier;
1434
1435 boolean_t use_abstime = FALSE;
1436
1437 switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
1438 case NOTE_SECONDS:
1439 multiplier = NSEC_PER_SEC;
1440 break;
1441 case NOTE_USECONDS:
1442 multiplier = NSEC_PER_USEC;
1443 break;
1444 case NOTE_NSECONDS:
1445 multiplier = 1;
1446 break;
1447 case NOTE_MACHTIME:
1448 multiplier = 0;
1449 use_abstime = TRUE;
1450 break;
1451 case 0: /* milliseconds (default) */
1452 multiplier = NSEC_PER_SEC / 1000;
1453 break;
1454 default:
1455 return EINVAL;
1456 }
1457
1458 /* transform the leeway in kn_ext[1] to same time scale */
1459 if (kev->fflags & NOTE_LEEWAY) {
1460 uint64_t leeway_abs;
1461
1462 if (use_abstime) {
1463 leeway_abs = (uint64_t)kev->ext[1];
1464 } else {
1465 uint64_t leeway_ns;
1466 if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
1467 return ERANGE;
1468 }
1469
1470 nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1471 }
1472
1473 params->leeway = leeway_abs;
1474 } else {
1475 params->leeway = 0;
1476 }
1477
1478 if (kev->fflags & NOTE_ABSOLUTE) {
1479 uint64_t deadline_abs;
1480
1481 if (use_abstime) {
1482 deadline_abs = (uint64_t)kev->data;
1483 } else {
1484 uint64_t calendar_deadline_ns;
1485
1486 if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
1487 return ERANGE;
1488 }
1489
1490 /* calendar_deadline_ns is in nanoseconds since the epoch */
1491
1492 clock_sec_t seconds;
1493 clock_nsec_t nanoseconds;
1494
1495 /*
1496 * Note that the conversion through wall-time is only done once.
1497 *
1498 * If the relationship between MAT and gettimeofday changes,
1499 * the underlying timer does not update.
1500 *
1501 * TODO: build a wall-time denominated timer_call queue
1502 * and a flag to request DTRTing with wall-time timers
1503 */
1504 clock_get_calendar_nanotime(&seconds, &nanoseconds);
1505
1506 uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1507
1508 /* if deadline is in the future */
1509 if (calendar_now_ns < calendar_deadline_ns) {
1510 uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1511 uint64_t interval_abs;
1512
1513 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1514
1515 /*
1516 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1517 * causes the timer to keep ticking across sleep, but
1518 * it does not change the calendar timebase.
1519 */
1520
1521 if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1522 clock_continuoustime_interval_to_deadline(interval_abs,
1523 &deadline_abs);
1524 } else {
1525 clock_absolutetime_interval_to_deadline(interval_abs,
1526 &deadline_abs);
1527 }
1528 } else {
1529 deadline_abs = 0; /* cause immediate expiration */
1530 }
1531 }
1532
1533 params->deadline = deadline_abs;
1534 params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
1535 } else if (kev->data < 0) {
1536 /*
1537 * Negative interval timers fire immediately, once.
1538 *
1539 * Ideally a negative interval would be an error, but certain clients
 * pass negative values by accident, and expect an event back.
1541 *
1542 * In the old implementation the timer would repeat with no delay
1543 * N times until mach_absolute_time() + (N * interval) underflowed,
1544 * then it would wait ~forever by accidentally arming a timer for the far future.
1545 *
1546 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
1547 */
1548
1549 params->deadline = 0; /* expire immediately */
1550 params->interval = 0; /* non-repeating */
1551 } else {
1552 uint64_t interval_abs = 0;
1553
1554 if (use_abstime) {
1555 interval_abs = (uint64_t)kev->data;
1556 } else {
1557 uint64_t interval_ns;
1558 if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
1559 return ERANGE;
1560 }
1561
1562 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1563 }
1564
1565 uint64_t deadline = 0;
1566
1567 if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1568 clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1569 } else {
1570 clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1571 }
1572
1573 params->deadline = deadline;
1574 params->interval = interval_abs;
1575 }
1576
1577 return 0;
1578 }
1579
1580 /*
1581 * filt_timerexpire - the timer callout routine
1582 */
1583 static void
filt_timerexpire(void *knx, void *state_on_arm)
1585 {
1586 struct knote *kn = knx;
1587
1588 uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
1589 uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;
1590
1591 if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
1592 // our f_event always would say FILTER_ACTIVE,
1593 // so be leaner and just do it.
1594 struct kqueue *kq = knote_get_kq(kn);
1595 kqlock(kq);
1596 knote_activate(kq, kn, FILTER_ACTIVE);
1597 kqunlock(kq);
1598 } else {
1599 /*
1600 * The timer has been reprogrammed or canceled since it was armed,
1601 * and this is a late firing for the timer, just ignore it.
1602 */
1603 }
1604 }
1605
1606 /*
1607 * Does this deadline needs a timer armed for it, or has it expired?
1608 */
1609 static bool
filt_timer_is_ready(struct knote *kn)
1611 {
1612 uint64_t now, deadline = kn->kn_ext[0];
1613
1614 if (deadline == 0) {
1615 return true;
1616 }
1617
1618 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1619 now = mach_continuous_time();
1620 } else {
1621 now = mach_absolute_time();
1622 }
1623 return deadline <= now;
1624 }
1625
1626 /*
1627 * Arm a timer
1628 *
1629 * It is the responsibility of the caller to make sure the timer call
1630 * has completed or been cancelled properly prior to arming it.
1631 */
1632 static void
filt_timerarm(struct knote *kn)
1634 {
1635 uint64_t deadline = kn->kn_ext[0];
1636 uint64_t leeway = kn->kn_ext[1];
1637 uint32_t state;
1638
1639 int filter_flags = kn->kn_sfflags;
1640 unsigned int timer_flags = 0;
1641
1642 if (filter_flags & NOTE_CRITICAL) {
1643 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1644 } else if (filter_flags & NOTE_BACKGROUND) {
1645 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1646 } else {
1647 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1648 }
1649
1650 if (filter_flags & NOTE_LEEWAY) {
1651 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1652 }
1653
1654 if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
1655 timer_flags |= THREAD_CALL_CONTINUOUS;
1656 }
1657
1658 /*
1659 * Move to ARMED.
1660 *
1661 * We increase the gencount, and setup the thread call with this expected
1662 * state. It means that if there was a previous generation of the timer in
1663 * flight that needs to be ignored, then 3 things are possible:
1664 *
1665 * - the timer fires first, filt_timerexpire() and sets the state to FIRED
1666 * but we clobber it with ARMED and a new gencount. The knote will still
1667 * be activated, but filt_timerprocess() which is serialized with this
1668 * call will not see the FIRED bit set and will not deliver an event.
1669 *
1670 * - this code runs first, but filt_timerexpire() comes second. Because it
1671 * knows an old gencount, it will debounce and not activate the knote.
1672 *
1673 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
1674 * will just cancel it properly.
1675 *
1676 * This is important as userspace expects to never be woken up for past
1677 * timers after filt_timertouch ran.
1678 */
1679 state = os_atomic_load(&kn->kn_hook32, relaxed);
1680 state &= ~TIMER_STATE_MASK;
1681 state += TIMER_GEN_INC + TIMER_ARMED;
1682 os_atomic_store(&kn->kn_hook32, state, relaxed);
1683
1684 thread_call_enter_delayed_with_leeway(kn->kn_thcall,
1685 (void *)(uintptr_t)state, deadline, leeway, timer_flags);
1686 }
1687
1688 /*
1689 * Mark a timer as "already fired" when it is being reprogrammed
1690 *
1691 * If there is a timer in flight, this will do a best effort at canceling it,
1692 * but will not wait. If the thread call was in flight, having set the
1693 * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1694 * cancelation.
1695 */
1696 static void
filt_timerfire_immediate(struct knote *kn)
1698 {
1699 uint32_t state;
1700
1701 static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
1702 "validate that this atomic or will transition to IMMEDIATE");
1703 state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1704
1705 if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
1706 thread_call_cancel(kn->kn_thcall);
1707 }
1708 }
1709
1710 /*
1711 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1712 */
1713 static int
1714 filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
1715 {
1716 thread_call_t callout;
1717 struct filt_timer_params params;
1718 int error;
1719
1720 	if ((error = filt_timervalidate(kev, &params)) != 0) {
1721 knote_set_error(kn, error);
1722 return 0;
1723 }
1724
1725 callout = thread_call_allocate_with_options(filt_timerexpire,
1726 (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1727 THREAD_CALL_OPTIONS_ONCE);
1728
1729 if (NULL == callout) {
1730 knote_set_error(kn, ENOMEM);
1731 return 0;
1732 }
1733
1734 	filt_timer_set_params(kn, &params);
1735 kn->kn_thcall = callout;
1736 kn->kn_flags |= EV_CLEAR;
1737 os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
1738
1739 /* NOTE_ABSOLUTE implies EV_ONESHOT */
1740 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1741 kn->kn_flags |= EV_ONESHOT;
1742 }
1743
1744 if (filt_timer_is_ready(kn)) {
1745 os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1746 return FILTER_ACTIVE;
1747 } else {
1748 filt_timerarm(kn);
1749 return 0;
1750 }
1751 }
1752
1753 /*
1754 * Shut down the timer if it's running, and free the callout.
1755 */
1756 static void
1757 filt_timerdetach(struct knote *kn)
1758 {
1759 __assert_only boolean_t freed;
1760
1761 /*
1762 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
1763 * running anymore.
1764 */
1765 thread_call_cancel_wait(kn->kn_thcall);
1766 freed = thread_call_free(kn->kn_thcall);
1767 assert(freed);
1768 }
1769
1770 /*
1771 * filt_timertouch - update timer knote with new user input
1772 *
1773 * Cancel and restart the timer based on new user data. When
1774 * the user picks up a knote, clear the count of how many timer
1775 * pops have gone off (in kn_data).
1776 */
1777 static int
1778 filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
1779 {
1780 struct filt_timer_params params;
1781 uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
1782 int error;
1783
1784 if (kev->qos && (knote_get_kq(kn)->kq_state & KQ_WORKLOOP) &&
1785 !_pthread_priority_thread_qos(kev->qos)) {
1786 /* validate usage of FILTER_UPDATE_REQ_QOS */
1787 kev->flags |= EV_ERROR;
1788 kev->data = ERANGE;
1789 return 0;
1790 }
1791
1792 if (changed_flags & NOTE_ABSOLUTE) {
1793 kev->flags |= EV_ERROR;
1794 kev->data = EINVAL;
1795 return 0;
1796 }
1797
1798 	if ((error = filt_timervalidate(kev, &params)) != 0) {
1799 kev->flags |= EV_ERROR;
1800 kev->data = error;
1801 return 0;
1802 }
1803
1804 /* capture the new values used to compute deadline */
1805 	filt_timer_set_params(kn, &params);
1806 kn->kn_sfflags = kev->fflags;
1807
1808 if (filt_timer_is_ready(kn)) {
1809 filt_timerfire_immediate(kn);
1810 return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
1811 } else {
1812 filt_timerarm(kn);
1813 return FILTER_UPDATE_REQ_QOS;
1814 }
1815 }
1816
1817 /*
1818 * filt_timerprocess - query state of knote and snapshot event data
1819 *
1820 * Determine if the timer has fired in the past, snapshot the state
1821 * of the kevent for returning to user-space, and clear pending event
1822 * counters for the next time.
1823 */
1824 static int
1825 filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
1826 {
1827 uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);
1828
1829 /*
1830 * filt_timerprocess is serialized with any filter routine except for
1831 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1832 * transition, and on success, activates the knote.
1833 *
1834 * Hence, we don't need atomic modifications of the state, only to peek at
1835 * whether we see any of the "FIRED" state, and if we do, it is safe to
1836 * do simple state machine transitions.
1837 */
1838 switch (state & TIMER_STATE_MASK) {
1839 case TIMER_IDLE:
1840 case TIMER_ARMED:
1841 /*
1842 * This can happen if a touch resets a timer that had fired
1843 * without being processed
1844 */
1845 return 0;
1846 }
1847
1848 os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);
1849
1850 /*
1851 * Copy out the interesting kevent state,
1852 * but don't leak out the raw time calculations.
1853 *
1854 * TODO: potential enhancements - tell the user about:
1855 * - deadline to which this timer thought it was expiring
1856 * - return kn_sfflags in the fflags field so the client can know
1857 * under what flags the timer fired
1858 */
1859 knote_fill_kevent(kn, kev, 1);
1860 kev->ext[0] = 0;
1861 /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */
1862
1863 if (kn->kn_sdata != 0) {
1864 /*
1865 * This is a 'repeating' timer, so we have to emit
1866 * how many intervals expired between the arm
1867 * and the process.
1868 *
1869 * A very strange style of interface, because
1870 * this could easily be done in the client...
1871 */
1872
1873 uint64_t now;
1874
1875 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1876 now = mach_continuous_time();
1877 } else {
1878 now = mach_absolute_time();
1879 }
1880
1881 uint64_t first_deadline = kn->kn_ext[0];
1882 uint64_t interval_abs = kn->kn_sdata;
1883 uint64_t orig_arm_time = first_deadline - interval_abs;
1884
1885 assert(now > orig_arm_time);
1886 assert(now > first_deadline);
1887
1888 uint64_t elapsed = now - orig_arm_time;
1889
1890 uint64_t num_fired = elapsed / interval_abs;
1891
1892 /*
1893 * To reach this code, we must have seen the timer pop
1894 		 * and be in repeating mode, so it must have been
1895 * more than 'interval' time since the attach or last
1896 * successful touch.
1897 */
1898 assert(num_fired > 0);
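		/*
		 * Worked example (hypothetical numbers): with an interval worth
		 * 10 units of mach time, first_deadline at T+10 and now at T+37,
		 * orig_arm_time is T, elapsed is 37, num_fired is 3, and the
		 * re-arm below would pick T+40 as the next deadline.
		 */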
1899
1900 /* report how many intervals have elapsed to the user */
1901 kev->data = (int64_t)num_fired;
1902
1903 /* We only need to re-arm the timer if it's not about to be destroyed */
1904 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1905 /* fire at the end of the next interval */
1906 uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1907
1908 assert(new_deadline > now);
1909
1910 kn->kn_ext[0] = new_deadline;
1911
1912 /*
1913 * This can't shortcut setting up the thread call, because
1914 			 * knote_process deactivates EV_CLEAR knotes unconditionally.
1915 */
1916 filt_timerarm(kn);
1917 }
1918 }
1919
1920 return FILTER_ACTIVE;
1921 }
1922
1923 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1924 .f_extended_codes = true,
1925 .f_attach = filt_timerattach,
1926 .f_detach = filt_timerdetach,
1927 .f_event = filt_bad_event,
1928 .f_touch = filt_timertouch,
1929 .f_process = filt_timerprocess,
1930 };
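
/*
 * Illustrative userspace usage of EVFILT_TIMER against the filter ops above.
 * This is a sketch for documentation only (it is not part of the kernel
 * build, and error handling is omitted): EV_ADD drives filt_timerattach(),
 * and each collected event goes through filt_timerprocess(), which reports
 * in kev.data how many intervals elapsed since the last collection.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int kq = kqueue();
 *		struct kevent kev;
 *
 *		// 100ms repeating timer; NOTE_USECONDS selects the data unit.
 *		EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_USECONDS, 100000, NULL);
 *		kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *		for (int i = 0; i < 5; i++) {
 *			if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1) {
 *				printf("timer fired %lld time(s)\n", (long long)kev.data);
 *			}
 *		}
 *		close(kq);
 *		return 0;
 *	}
 */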
1931
1932 #pragma mark user_filtops
1933
1934 static int
1935 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1936 {
1937 if (kn->kn_sfflags & NOTE_TRIGGER) {
1938 kn->kn_hook32 = FILTER_ACTIVE;
1939 } else {
1940 kn->kn_hook32 = 0;
1941 }
1942 return kn->kn_hook32;
1943 }
1944
1945 static int
1946 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1947 {
1948 uint32_t ffctrl;
1949 int fflags;
1950
1951 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1952 fflags = kev->fflags & NOTE_FFLAGSMASK;
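	/*
	 * Example (illustrative): with kn_sfflags == 0x5, an update carrying
	 * NOTE_FFAND | 0x4 leaves 0x4, NOTE_FFOR | 0x2 leaves 0x7, and
	 * NOTE_FFCOPY | 0x2 replaces the saved flags with 0x2.
	 */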
1953 switch (ffctrl) {
1954 case NOTE_FFNOP:
1955 break;
1956 case NOTE_FFAND:
1957 kn->kn_sfflags &= fflags;
1958 break;
1959 case NOTE_FFOR:
1960 kn->kn_sfflags |= fflags;
1961 break;
1962 case NOTE_FFCOPY:
1963 kn->kn_sfflags = fflags;
1964 break;
1965 }
1966 kn->kn_sdata = kev->data;
1967
1968 if (kev->fflags & NOTE_TRIGGER) {
1969 kn->kn_hook32 = FILTER_ACTIVE;
1970 }
1971 return (int)kn->kn_hook32;
1972 }
1973
1974 static int
1975 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1976 {
1977 int result = (int)kn->kn_hook32;
1978
1979 if (result) {
1980 /* EVFILT_USER returns the data that was passed in */
1981 knote_fill_kevent_with_sdata(kn, kev);
1982 kev->fflags = kn->kn_sfflags;
1983 if (kn->kn_flags & EV_CLEAR) {
1984 /* knote_fill_kevent cleared kn_fflags */
1985 kn->kn_hook32 = 0;
1986 }
1987 }
1988
1989 return result;
1990 }
1991
1992 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
1993 .f_extended_codes = true,
1994 .f_attach = filt_userattach,
1995 .f_detach = filt_no_detach,
1996 .f_event = filt_bad_event,
1997 .f_touch = filt_usertouch,
1998 .f_process = filt_userprocess,
1999 };
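
/*
 * Illustrative userspace usage of EVFILT_USER against the filter ops above.
 * This is a sketch for documentation only (not part of the kernel build;
 * error handling omitted): one thread posts NOTE_TRIGGER, which latches
 * FILTER_ACTIVE through filt_usertouch(), and the waiter gets the saved
 * fflags back from filt_userprocess().
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <pthread.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	static int kq;
 *
 *	static void *
 *	poster(void *arg)
 *	{
 *		struct kevent kev;
 *
 *		(void)arg;
 *		// NOTE_FFCOPY replaces the saved fflags with the low bits given here.
 *		EV_SET(&kev, 42, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFCOPY | 0x1, 0, NULL);
 *		kevent(kq, &kev, 1, NULL, 0, NULL);
 *		return NULL;
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent kev;
 *		pthread_t t;
 *
 *		kq = kqueue();
 *		// EV_CLEAR makes filt_userprocess() reset the latch after delivery.
 *		EV_SET(&kev, 42, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *		kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *		pthread_create(&t, NULL, poster, NULL);
 *		if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1) {
 *			printf("user event fflags=0x%x\n", kev.fflags);
 *		}
 *		pthread_join(t, NULL);
 *		close(kq);
 *		return 0;
 *	}
 */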
2000
2001 #pragma mark workloop_filtops
2002
2003 #define EPREEMPTDISABLED (-1)
2004
2005 static inline void
2006 filt_wllock(struct kqworkloop *kqwl)
2007 {
2008 lck_spin_lock(&kqwl->kqwl_statelock);
2009 }
2010
2011 static inline void
2012 filt_wlunlock(struct kqworkloop *kqwl)
2013 {
2014 lck_spin_unlock(&kqwl->kqwl_statelock);
2015 }
2016
2017 /*
2018 * Returns true when the interlock for the turnstile is the workqueue lock
2019 *
2020 * When this is the case, all turnstile operations are delegated
2021 * to the workqueue subsystem.
2022 *
2023 * This is required because kqueue_threadreq_bind_prepost only holds the
2024 * workqueue lock but needs to move the inheritor from the workloop turnstile
2025 * away from the creator thread, so that this now-fulfilled request cannot be
2026 * picked up anymore by other threads.
2027 */
2028 static inline bool
2029 filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
2030 {
2031 return kqr_thread_requested_pending(&kqwl->kqwl_request);
2032 }
2033
2034 static void
2035 filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
2036 turnstile_update_flags_t flags)
2037 {
2038 turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
2039 workq_threadreq_t kqr = &kqwl->kqwl_request;
2040
2041 /*
2042 * binding to the workq should always happen through
2043 * workq_kern_threadreq_update_inheritor()
2044 */
2045 assert(!filt_wlturnstile_interlock_is_workq(kqwl));
2046
2047 if ((inheritor = kqwl->kqwl_owner)) {
2048 flags |= TURNSTILE_INHERITOR_THREAD;
2049 } else if ((inheritor = kqr_thread(kqr))) {
2050 flags |= TURNSTILE_INHERITOR_THREAD;
2051 }
2052
2053 turnstile_update_inheritor(ts, inheritor, flags);
2054 }
2055
2056 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
2057 #define FILT_WLATTACH 0
2058 #define FILT_WLTOUCH 1
2059 #define FILT_WLDROP 2
2060
2061 __result_use_check
2062 static int
2063 filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
2064 struct kevent_qos_s *kev, kq_index_t qos_index, int op)
2065 {
2066 user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
2067 workq_threadreq_t kqr = &kqwl->kqwl_request;
2068 thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
2069 kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
2070 int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2071 int action = KQWL_UTQ_NONE, error = 0;
2072 bool wl_inheritor_updated = false, needs_wake = false;
2073 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2074 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2075 uint64_t udata = 0;
2076 struct turnstile *ts = TURNSTILE_NULL;
2077
2078 filt_wllock(kqwl);
2079
2080 again:
2081 new_owner = cur_owner = kqwl->kqwl_owner;
2082
2083 /*
2084 * Phase 1:
2085 *
2086 * If asked, load the uint64 value at the user provided address and compare
2087 * it against the passed in mask and expected value.
2088 *
2089 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
2090 * a thread reference.
2091 *
2092 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
2093 * the current thread, then end ownership.
2094 *
2095 * Lastly decide whether we need to perform a QoS update.
2096 */
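	/*
	 * Debounce example (illustrative): the caller supplies the address of a
	 * userspace state word in ext[EV_EXTIDX_WL_ADDR], the value it last
	 * observed in ext[EV_EXTIDX_WL_VALUE], and a mask in ext[EV_EXTIDX_WL_MASK].
	 * If the freshly copied-in value no longer matches the expected value
	 * under that mask, the state changed underneath the caller and the
	 * update fails with ESTALE instead of acting on stale information.
	 */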
2097 if (uaddr) {
2098 /*
2099 * Until <rdar://problem/24999882> exists,
2100 		 * a copyin done with preemption disabled forces any
2101 * vm_fault we encounter to fail.
2102 */
2103 error = copyin_atomic64(uaddr, &udata);
2104
2105 /*
2106 * If we get EFAULT, drop locks, and retry.
2107 * If we still get an error report it,
2108 * else assume the memory has been faulted
2109 * and attempt to copyin under lock again.
2110 */
2111 switch (error) {
2112 case 0:
2113 break;
2114 case EFAULT:
2115 if (efault_retry-- > 0) {
2116 filt_wlunlock(kqwl);
2117 error = copyin_atomic64(uaddr, &udata);
2118 filt_wllock(kqwl);
2119 if (error == 0) {
2120 goto again;
2121 }
2122 }
2123 OS_FALLTHROUGH;
2124 default:
2125 goto out;
2126 }
2127
2128 /* Update state as copied in. */
2129 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2130
2131 if ((udata & mask) != (kdata & mask)) {
2132 error = ESTALE;
2133 } else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
2134 /*
2135 * Decipher the owner port name, and translate accordingly.
2136 * The low 2 bits were borrowed for other flags, so mask them off.
2137 *
2138 * Then attempt translation to a thread reference or fail.
2139 */
2140 mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
2141 if (name != MACH_PORT_NULL) {
2142 name = ipc_entry_name_mask(name);
2143 extra_thread_ref = port_name_to_thread(name,
2144 PORT_INTRANS_THREAD_IN_CURRENT_TASK);
2145 if (extra_thread_ref == THREAD_NULL) {
2146 error = EOWNERDEAD;
2147 goto out;
2148 }
2149 new_owner = extra_thread_ref;
2150 }
2151 }
2152 }
2153
2154 if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
2155 new_owner = THREAD_NULL;
2156 }
2157
2158 if (error == 0) {
2159 if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
2160 action = KQWL_UTQ_SET_QOS_INDEX;
2161 } else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
2162 action = KQWL_UTQ_SET_QOS_INDEX;
2163 }
2164
2165 if (op == FILT_WLTOUCH) {
2166 /*
2167 			 * Save off any additional fflags/data we just accepted,
2168 			 * but only keep the last round of "update" bits we acted on,
2169 			 * which helps debugging a lot.
2170 */
2171 kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2172 kn->kn_sfflags |= kev->fflags;
2173 if (kev->fflags & NOTE_WL_SYNC_WAKE) {
2174 needs_wake = (kn->kn_thread != THREAD_NULL);
2175 }
2176 } else if (op == FILT_WLDROP) {
2177 if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
2178 NOTE_WL_SYNC_WAIT) {
2179 /*
2180 * When deleting a SYNC_WAIT knote that hasn't been woken up
2181 * explicitly, issue a wake up.
2182 */
2183 kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
2184 needs_wake = (kn->kn_thread != THREAD_NULL);
2185 }
2186 }
2187 }
2188
2189 /*
2190 * Phase 2:
2191 *
2192 * Commit ownership and QoS changes if any, possibly wake up waiters
2193 */
2194
2195 if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
2196 goto out;
2197 }
2198
2199 kqlock(kqwl);
2200
2201 /* If already tracked as servicer, don't track as owner */
2202 if (new_owner == kqr_thread(kqr)) {
2203 new_owner = THREAD_NULL;
2204 }
2205
2206 if (cur_owner != new_owner) {
2207 kqwl->kqwl_owner = new_owner;
2208 if (new_owner == extra_thread_ref) {
2209 			/* we just transferred this ref to kqwl_owner */
2210 extra_thread_ref = THREAD_NULL;
2211 }
2212 cur_override = kqworkloop_override(kqwl);
2213
2214 if (new_owner) {
2215 /* override it before we drop the old */
2216 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2217 thread_add_kevent_override(new_owner, cur_override);
2218 }
2219 if (kqr_thread_requested_pending(kqr)) {
2220 if (action == KQWL_UTQ_NONE) {
2221 action = KQWL_UTQ_REDRIVE_EVENTS;
2222 }
2223 }
2224 } else if (action == KQWL_UTQ_NONE &&
2225 !kqr_thread_requested(kqr) &&
2226 kqwl->kqwl_wakeup_qos) {
2227 action = KQWL_UTQ_REDRIVE_EVENTS;
2228 }
2229 }
2230
2231 if (action != KQWL_UTQ_NONE) {
2232 kqworkloop_update_threads_qos(kqwl, action, qos_index);
2233 }
2234
2235 ts = kqwl->kqwl_turnstile;
2236 if (cur_owner != new_owner && ts) {
2237 if (action == KQWL_UTQ_REDRIVE_EVENTS) {
2238 /*
2239 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
2240 * the code went through workq_kern_threadreq_initiate()
2241 * and the workqueue has set the inheritor already
2242 */
2243 assert(filt_wlturnstile_interlock_is_workq(kqwl));
2244 } else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2245 workq_kern_threadreq_lock(kqwl->kqwl_p);
2246 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
2247 ts, TURNSTILE_IMMEDIATE_UPDATE);
2248 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2249 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2250 /*
2251 * If the workq is no longer the interlock, then
2252 * workq_kern_threadreq_update_inheritor() has finished a bind
2253 * and we need to fallback to the regular path.
2254 */
2255 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2256 }
2257 wl_inheritor_updated = true;
2258 } else {
2259 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2260 wl_inheritor_updated = true;
2261 }
2262
2263 /*
2264 * We need a turnstile reference because we are dropping the interlock
2265 * and the caller has not called turnstile_prepare.
2266 */
2267 if (wl_inheritor_updated) {
2268 turnstile_reference(ts);
2269 }
2270 }
2271
2272 if (needs_wake && ts) {
2273 waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
2274 kn->kn_thread, THREAD_AWAKENED);
2275 if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
2276 disable_preemption();
2277 error = EPREEMPTDISABLED;
2278 }
2279 }
2280
2281 kqunlock(kqwl);
2282
2283 out:
2284 /*
2285 * Phase 3:
2286 *
2287 * Unlock and cleanup various lingering references and things.
2288 */
2289 filt_wlunlock(kqwl);
2290
2291 #if CONFIG_WORKLOOP_DEBUG
2292 KQWL_HISTORY_WRITE_ENTRY(kqwl, {
2293 .updater = current_thread(),
2294 .servicer = kqr_thread(kqr), /* Note: racy */
2295 .old_owner = cur_owner,
2296 .new_owner = new_owner,
2297
2298 .kev_ident = kev->ident,
2299 .error = (int16_t)error,
2300 .kev_flags = kev->flags,
2301 .kev_fflags = kev->fflags,
2302
2303 .kev_mask = mask,
2304 .kev_value = kdata,
2305 .in_value = udata,
2306 });
2307 #endif // CONFIG_WORKLOOP_DEBUG
2308
2309 if (wl_inheritor_updated) {
2310 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
2311 turnstile_deallocate(ts);
2312 }
2313
2314 if (cur_owner && new_owner != cur_owner) {
2315 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2316 thread_drop_kevent_override(cur_owner);
2317 }
2318 thread_deallocate_safe(cur_owner);
2319 }
2320 if (extra_thread_ref) {
2321 thread_deallocate_safe(extra_thread_ref);
2322 }
2323 return error;
2324 }
2325
2326 /*
2327 * Remembers the last update that came in from userspace for debugging reasons.
2328 * - fflags is mirrored from the userspace kevent
2329 * - ext[i, i != VALUE] is mirrored from the userspace kevent
2330 * - ext[VALUE] is set to what the kernel loaded atomically
2331 * - data is set to the error if any
2332 */
2333 static inline void
2334 filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
2335 int error)
2336 {
2337 kn->kn_fflags = kev->fflags;
2338 kn->kn_sdata = error;
2339 memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2340 }
2341
2342 static int
2343 filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
2344 struct kevent_qos_s *kev, int op)
2345 {
2346 user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
2347 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2348 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2349 uint64_t udata = 0;
2350 int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2351 int error = 0;
2352
2353 if (op == FILT_WLATTACH) {
2354 (void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
2355 } else if (uaddr == 0) {
2356 return 0;
2357 }
2358
2359 filt_wllock(kqwl);
2360
2361 again:
2362
2363 /*
2364 	 * Do the debounce thing; the lock serializing the state is the knote lock.
2365 */
2366 if (uaddr) {
2367 /*
2368 * Until <rdar://problem/24999882> exists,
2369 		 * a copyin done with preemption disabled forces any
2370 * vm_fault we encounter to fail.
2371 */
2372 error = copyin_atomic64(uaddr, &udata);
2373
2374 /*
2375 * If we get EFAULT, drop locks, and retry.
2376 * If we still get an error report it,
2377 * else assume the memory has been faulted
2378 * and attempt to copyin under lock again.
2379 */
2380 switch (error) {
2381 case 0:
2382 break;
2383 case EFAULT:
2384 if (efault_retry-- > 0) {
2385 filt_wlunlock(kqwl);
2386 error = copyin_atomic64(uaddr, &udata);
2387 filt_wllock(kqwl);
2388 if (error == 0) {
2389 goto again;
2390 }
2391 }
2392 OS_FALLTHROUGH;
2393 default:
2394 goto out;
2395 }
2396
2397 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2398 kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;
2399
2400 if ((udata & mask) != (kdata & mask)) {
2401 error = ESTALE;
2402 goto out;
2403 }
2404 }
2405
2406 if (op == FILT_WLATTACH) {
2407 error = filt_wlattach_sync_ipc(kn);
2408 if (error == 0) {
2409 disable_preemption();
2410 error = EPREEMPTDISABLED;
2411 }
2412 }
2413
2414 out:
2415 filt_wlunlock(kqwl);
2416 return error;
2417 }
2418
2419 static int
2420 filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
2421 {
2422 struct kqueue *kq = knote_get_kq(kn);
2423 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2424 int error = 0, result = 0;
2425 kq_index_t qos_index = 0;
2426
2427 if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
2428 error = ENOTSUP;
2429 goto out;
2430 }
2431
2432 uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2433 switch (command) {
2434 case NOTE_WL_THREAD_REQUEST:
2435 if (kn->kn_id != kqwl->kqwl_dynamicid) {
2436 error = EINVAL;
2437 goto out;
2438 }
2439 qos_index = _pthread_priority_thread_qos(kn->kn_qos);
2440 if (qos_index == THREAD_QOS_UNSPECIFIED) {
2441 error = ERANGE;
2442 goto out;
2443 }
2444 if (kqwl->kqwl_request.tr_kq_qos_index) {
2445 /*
2446 * There already is a thread request, and well, you're only allowed
2447 * one per workloop, so fail the attach.
2448 */
2449 error = EALREADY;
2450 goto out;
2451 }
2452 break;
2453 case NOTE_WL_SYNC_WAIT:
2454 case NOTE_WL_SYNC_WAKE:
2455 if (kn->kn_id == kqwl->kqwl_dynamicid) {
2456 error = EINVAL;
2457 goto out;
2458 }
2459 if ((kn->kn_flags & EV_DISABLE) == 0) {
2460 error = EINVAL;
2461 goto out;
2462 }
2463 if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2464 error = EINVAL;
2465 goto out;
2466 }
2467 break;
2468
2469 case NOTE_WL_SYNC_IPC:
2470 if ((kn->kn_flags & EV_DISABLE) == 0) {
2471 error = EINVAL;
2472 goto out;
2473 }
2474 if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
2475 error = EINVAL;
2476 goto out;
2477 }
2478 break;
2479 default:
2480 error = EINVAL;
2481 goto out;
2482 }
2483
2484 if (command == NOTE_WL_SYNC_IPC) {
2485 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
2486 } else {
2487 error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
2488 }
2489
2490 if (error == EPREEMPTDISABLED) {
2491 error = 0;
2492 result = FILTER_THREADREQ_NODEFEER;
2493 }
2494 out:
2495 if (error) {
2496 /* If userland wants ESTALE to be hidden, fail the attach anyway */
2497 if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2498 error = 0;
2499 }
2500 knote_set_error(kn, error);
2501 return result;
2502 }
2503 if (command == NOTE_WL_SYNC_WAIT) {
2504 return kevent_register_wait_prepare(kn, kev, result);
2505 }
2506 /* Just attaching the thread request successfully will fire it */
2507 if (command == NOTE_WL_THREAD_REQUEST) {
2508 /*
2509 * Thread Request knotes need an explicit touch to be active again,
2510 * so delivering an event needs to also consume it.
2511 */
2512 kn->kn_flags |= EV_CLEAR;
2513 return result | FILTER_ACTIVE;
2514 }
2515 return result;
2516 }
2517
2518 static void __dead2
2519 filt_wlwait_continue(void *parameter, wait_result_t wr)
2520 {
2521 struct _kevent_register *cont_args = parameter;
2522 struct kqworkloop *kqwl = cont_args->kqwl;
2523
2524 kqlock(kqwl);
2525 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2526 workq_kern_threadreq_lock(kqwl->kqwl_p);
2527 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2528 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2529 } else {
2530 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2531 }
2532 kqunlock(kqwl);
2533
2534 turnstile_cleanup();
2535
2536 if (wr == THREAD_INTERRUPTED) {
2537 cont_args->kev.flags |= EV_ERROR;
2538 cont_args->kev.data = EINTR;
2539 } else if (wr != THREAD_AWAKENED) {
2540 panic("Unexpected wait result: %d", wr);
2541 }
2542
2543 kevent_register_wait_return(cont_args);
2544 }
2545
2546 /*
2547 * Called with the workloop mutex held; most of the time this never returns,
2548 * as it calls filt_wlwait_continue through a continuation.
2549 */
2550 static void __dead2
2551 filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
2552 struct _kevent_register *cont_args)
2553 {
2554 struct kqworkloop *kqwl = cont_args->kqwl;
2555 workq_threadreq_t kqr = &kqwl->kqwl_request;
2556 struct turnstile *ts;
2557 bool workq_locked = false;
2558
2559 kqlock_held(kqwl);
2560
2561 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2562 workq_kern_threadreq_lock(kqwl->kqwl_p);
2563 workq_locked = true;
2564 }
2565
2566 ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
2567 TURNSTILE_NULL, TURNSTILE_WORKLOOPS);
2568
2569 if (workq_locked) {
2570 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
2571 &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
2572 TURNSTILE_DELAYED_UPDATE);
2573 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2574 /*
2575 * if the interlock is no longer the workqueue lock,
2576 * then we don't need to hold it anymore.
2577 */
2578 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2579 workq_locked = false;
2580 }
2581 }
2582 if (!workq_locked) {
2583 /*
2584 * If the interlock is the workloop's, then it's our responsibility to
2585 * call update_inheritor, so just do it.
2586 */
2587 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
2588 }
2589
2590 thread_set_pending_block_hint(get_machthread(uth), kThreadWaitWorkloopSyncWait);
2591 waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
2592 THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
2593
2594 if (workq_locked) {
2595 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2596 }
2597
2598 thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
2599 if (thread) {
2600 thread_reference(thread);
2601 }
2602
2603 kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
2604 }
2605
2606 /* called in stackshot context to report the thread responsible for blocking this thread */
2607 void
2608 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2609 event64_t event, thread_waitinfo_t *waitinfo)
2610 {
2611 struct knote *kn = (struct knote *)event;
2612
2613 zone_require(knote_zone, kn);
2614
2615 assert(kn->kn_thread == thread);
2616
2617 struct kqueue *kq = knote_get_kq(kn);
2618
2619 zone_require(kqworkloop_zone, kq);
2620 assert(kq->kq_state & KQ_WORKLOOP);
2621
2622 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2623 workq_threadreq_t kqr = &kqwl->kqwl_request;
2624
2625 thread_t kqwl_owner = kqwl->kqwl_owner;
2626
2627 if (kqwl_owner != THREAD_NULL) {
2628 thread_require(kqwl_owner);
2629 waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2630 } else if ((kqr->tr_state >= WORKQ_TR_STATE_BINDING) && (kqr->tr_thread != NULL)) {
2631 thread_require(kqr->tr_thread);
2632 waitinfo->owner = thread_tid(kqr->tr_thread);
2633 } else if (kqr_thread_requested_pending(kqr)) { /* > idle, < bound */
2634 waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2635 } else {
2636 waitinfo->owner = 0;
2637 }
2638
2639 waitinfo->context = kqwl->kqwl_dynamicid;
2640 }
2641
2642 static void
2643 filt_wldetach(struct knote *kn)
2644 {
2645 if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2646 filt_wldetach_sync_ipc(kn);
2647 } else if (kn->kn_thread) {
2648 kevent_register_wait_cleanup(kn);
2649 }
2650 }
2651
2652 static int
2653 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2654 thread_qos_t *qos_index)
2655 {
2656 uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2657 uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2658
2659 if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2660 return EINVAL;
2661 }
2662 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2663 if (kev->flags & EV_DELETE) {
2664 return EINVAL;
2665 }
2666 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2667 return EINVAL;
2668 }
2669 if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2670 return ERANGE;
2671 }
2672 }
2673
2674 switch (new_commands) {
2675 case NOTE_WL_THREAD_REQUEST:
2676 /* thread requests can only update themselves */
2677 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2678 return EINVAL;
2679 }
2680 break;
2681
2682 case NOTE_WL_SYNC_WAIT:
2683 if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2684 return EINVAL;
2685 }
2686 goto sync_checks;
2687
2688 case NOTE_WL_SYNC_WAKE:
2689 sync_checks:
2690 if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2691 return EINVAL;
2692 }
2693 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2694 return EINVAL;
2695 }
2696 break;
2697
2698 case NOTE_WL_SYNC_IPC:
2699 if (sav_commands != NOTE_WL_SYNC_IPC) {
2700 return EINVAL;
2701 }
2702 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2703 return EINVAL;
2704 }
2705 break;
2706
2707 default:
2708 return EINVAL;
2709 }
2710 return 0;
2711 }
2712
2713 static int
2714 filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
2715 {
2716 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2717 thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
2718 int result = 0;
2719
2720 int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
2721 if (error) {
2722 goto out;
2723 }
2724
2725 uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2726 if (command == NOTE_WL_SYNC_IPC) {
2727 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
2728 } else {
2729 error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
2730 filt_wlremember_last_update(kn, kev, error);
2731 }
2732 if (error == EPREEMPTDISABLED) {
2733 error = 0;
2734 result = FILTER_THREADREQ_NODEFEER;
2735 }
2736
2737 out:
2738 if (error) {
2739 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2740 /* If userland wants ESTALE to be hidden, do not activate */
2741 return result;
2742 }
2743 kev->flags |= EV_ERROR;
2744 kev->data = error;
2745 return result;
2746 }
2747 if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
2748 return kevent_register_wait_prepare(kn, kev, result);
2749 }
2750 /* Just touching the thread request successfully will fire it */
2751 if (command == NOTE_WL_THREAD_REQUEST) {
2752 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2753 result |= FILTER_UPDATE_REQ_QOS;
2754 }
2755 result |= FILTER_ACTIVE;
2756 }
2757 return result;
2758 }
2759
2760 static bool
2761 filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
2762 {
2763 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2764
2765 int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
2766 if (error) {
2767 goto out;
2768 }
2769
2770 uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
2771 if (command == NOTE_WL_SYNC_IPC) {
2772 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
2773 } else {
2774 error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
2775 filt_wlremember_last_update(kn, kev, error);
2776 }
2777 assert(error != EPREEMPTDISABLED);
2778
2779 out:
2780 if (error) {
2781 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2782 return false;
2783 }
2784 kev->flags |= EV_ERROR;
2785 kev->data = error;
2786 return false;
2787 }
2788 return true;
2789 }
2790
2791 static int
2792 filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
2793 {
2794 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2795 int rc = 0;
2796
2797 assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2798
2799 kqlock(kqwl);
2800
2801 if (kqwl->kqwl_owner) {
2802 /*
2803 		 * <rdar://problem/33584321> userspace sometimes, due to events being
2804 		 * delivered without triggering a drain session, can cause a process
2805 		 * of the thread request knote.
2806 *
2807 * When that happens, the automatic deactivation due to process
2808 * would swallow the event, so we have to activate the knote again.
2809 */
2810 knote_activate(kqwl, kn, FILTER_ACTIVE);
2811 } else {
2812 #if DEBUG || DEVELOPMENT
2813 if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
2814 /*
2815 * see src/queue_internal.h in libdispatch
2816 */
2817 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
2818 user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2819 task_t t = current_task();
2820 uint64_t val;
2821 if (addr && task_is_active(t) && !task_is_halting(t) &&
2822 copyin_atomic64(addr, &val) == 0 &&
2823 val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
2824 (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
2825 panic("kevent: workloop %#016llx is not enqueued "
2826 "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2827 kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2828 }
2829 }
2830 #endif
2831 knote_fill_kevent(kn, kev, 0);
2832 kev->fflags = kn->kn_sfflags;
2833 rc |= FILTER_ACTIVE;
2834 }
2835
2836 kqunlock(kqwl);
2837
2838 if (rc & FILTER_ACTIVE) {
2839 workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
2840 }
2841 return rc;
2842 }
2843
2844 SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
2845 .f_extended_codes = true,
2846 .f_attach = filt_wlattach,
2847 .f_detach = filt_wldetach,
2848 .f_event = filt_bad_event,
2849 .f_touch = filt_wltouch,
2850 .f_process = filt_wlprocess,
2851 .f_allow_drop = filt_wlallow_drop,
2852 .f_post_register_wait = filt_wlpost_register_wait,
2853 };
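
/*
 * Summary of the workloop knote "commands" handled above (derived from the
 * checks in filt_wlattach() and filt_wlvalidate_kev_flags(); those functions
 * remain the authoritative rules):
 *
 * - NOTE_WL_THREAD_REQUEST: the ident must equal the workloop dynamic ID, a
 *   valid pthread QoS is required, and only one such knote may exist per
 *   workloop. Successfully attaching it fires it, and EV_CLEAR is forced.
 *
 * - NOTE_WL_SYNC_WAIT / NOTE_WL_SYNC_WAKE: the ident must not be the dynamic
 *   ID, EV_DISABLE is required, and NOTE_WL_END_OWNERSHIP is rejected at
 *   attach time. A SYNC_WAIT registration parks the caller until a matching
 *   wake, via kevent_register_wait_prepare()/filt_wlpost_register_wait().
 *
 * - NOTE_WL_SYNC_IPC: EV_DISABLE is required, and NOTE_WL_UPDATE_QOS as well
 *   as NOTE_WL_DISCOVER_OWNER are rejected.
 */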
2854
2855 #pragma mark - kqueues allocation and deallocation
2856
2857 OS_NOINLINE
2858 static void
2859 kqworkloop_dealloc(struct kqworkloop *, bool hash_remove);
2860
2861 static inline bool
2862 kqworkloop_try_retain(struct kqworkloop *kqwl)
2863 {
2864 return os_ref_retain_try_raw(&kqwl->kqwl_retains, NULL);
2865 }
2866
2867 static inline void
2868 kqworkloop_retain(struct kqworkloop *kqwl)
2869 {
2870 return os_ref_retain_raw(&kqwl->kqwl_retains, NULL);
2871 }
2872
2873 OS_ALWAYS_INLINE
2874 static inline void
2875 kqueue_retain(kqueue_t kqu)
2876 {
2877 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2878 kqworkloop_retain(kqu.kqwl);
2879 }
2880 }
2881
2882 OS_ALWAYS_INLINE
2883 static inline void
2884 kqworkloop_release_live(struct kqworkloop *kqwl)
2885 {
2886 os_ref_release_live_raw(&kqwl->kqwl_retains, NULL);
2887 }
2888
2889 OS_ALWAYS_INLINE
2890 static inline void
2891 kqueue_release_live(kqueue_t kqu)
2892 {
2893 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2894 kqworkloop_release_live(kqu.kqwl);
2895 }
2896 }
2897
2898 OS_ALWAYS_INLINE
2899 static inline void
2900 kqworkloop_release(struct kqworkloop *kqwl)
2901 {
2902 if (os_ref_release_raw(&kqwl->kqwl_retains, NULL) == 0) {
2903 kqworkloop_dealloc(kqwl, true);
2904 }
2905 }
2906
2907 OS_ALWAYS_INLINE
2908 static inline void
2909 kqueue_release(kqueue_t kqu)
2910 {
2911 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2912 kqworkloop_release(kqu.kqwl);
2913 }
2914 }
2915
2916 /*!
2917 * @function kqueue_destroy
2918 *
2919 * @brief
2920 * Common part to all kqueue dealloc functions.
2921 */
2922 OS_NOINLINE
2923 static void
2924 kqueue_destroy(kqueue_t kqu, zone_t zone)
2925 {
2926 lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);
2927
2928 zfree(zone, kqu.kq);
2929 }
2930
2931 /*!
2932 * @function kqueue_init
2933 *
2934 * @brief
2935 * Common part to all kqueue alloc functions.
2936 */
2937 static kqueue_t
2938 kqueue_init(kqueue_t kqu)
2939 {
2940 lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
2941 return kqu;
2942 }
2943
2944 #pragma mark kqfile allocation and deallocation
2945
2946 /*!
2947 * @function kqueue_dealloc
2948 *
2949 * @brief
2950 * Detach all knotes from a kqfile and free it.
2951 *
2952 * @discussion
2953 * We walk each list looking for knotes referencing this
2954 * kqueue. If we find one, we try to drop it. But
2955 * if we fail to get a drop reference, that will wait
2956 * until it is dropped. So, we can just restart again
2957 * safe in the assumption that the list will eventually
2958 * not contain any more references to this kqueue (either
2959 * we dropped them all, or someone else did).
2960 *
2961 * Assumes no new events are being added to the kqueue.
2962 * Nothing locked on entry or exit.
2963 */
2964 void
2965 kqueue_dealloc(struct kqueue *kq)
2966 {
2967 KNOTE_LOCK_CTX(knlc);
2968 struct proc *p = kq->kq_p;
2969 struct filedesc *fdp = &p->p_fd;
2970 struct knote *kn;
2971
2972 assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
2973
2974 proc_fdlock(p);
2975 for (int i = 0; i < fdp->fd_knlistsize; i++) {
2976 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2977 while (kn != NULL) {
2978 if (kq == knote_get_kq(kn)) {
2979 kqlock(kq);
2980 proc_fdunlock(p);
2981 if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2982 knote_drop(kq, kn, &knlc);
2983 }
2984 proc_fdlock(p);
2985 /* start over at beginning of list */
2986 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2987 continue;
2988 }
2989 kn = SLIST_NEXT(kn, kn_link);
2990 }
2991 }
2992
2993 knhash_lock(fdp);
2994 proc_fdunlock(p);
2995
2996 if (fdp->fd_knhashmask != 0) {
2997 for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
2998 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2999 while (kn != NULL) {
3000 if (kq == knote_get_kq(kn)) {
3001 kqlock(kq);
3002 knhash_unlock(fdp);
3003 if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
3004 knote_drop(kq, kn, &knlc);
3005 }
3006 knhash_lock(fdp);
3007 /* start over at beginning of list */
3008 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3009 continue;
3010 }
3011 kn = SLIST_NEXT(kn, kn_link);
3012 }
3013 }
3014 }
3015 knhash_unlock(fdp);
3016
3017 kqueue_destroy(kq, kqfile_zone);
3018 }
3019
3020 /*!
3021 * @function kqueue_alloc
3022 *
3023 * @brief
3024 * Allocate a kqfile.
3025 */
3026 struct kqueue *
3027 kqueue_alloc(struct proc *p)
3028 {
3029 struct kqfile *kqf;
3030
3031 /*
3032 * kqfiles are created with kqueue() so we need to wait for
3033 * the first kevent syscall to know which bit among
3034 * KQ_KEV_{32,64,QOS} will be set in kqf_state
3035 */
3036 kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
3037 kqf->kqf_p = p;
3038 TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
3039 TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
3040
3041 return kqueue_init(kqf).kq;
3042 }
3043
3044 /*!
3045 * @function kqueue_internal
3046 *
3047 * @brief
3048 * Core implementation for kqueue and guarded_kqueue_np()
3049 */
3050 int
3051 kqueue_internal(struct proc *p, fp_initfn_t fp_init, void *initarg, int32_t *retval)
3052 {
3053 struct kqueue *kq;
3054 struct fileproc *fp;
3055 int fd, error;
3056
3057 error = falloc_withinit(p, current_cached_proc_cred(p),
3058 vfs_context_current(), &fp, &fd, fp_init, initarg);
3059 if (error) {
3060 return error;
3061 }
3062
3063 kq = kqueue_alloc(p);
3064 if (kq == NULL) {
3065 fp_free(p, fd, fp);
3066 return ENOMEM;
3067 }
3068
3069 fp->fp_flags |= FP_CLOEXEC | FP_CLOFORK;
3070 fp->f_flag = FREAD | FWRITE;
3071 fp->f_ops = &kqueueops;
3072 fp_set_data(fp, kq);
3073 fp->f_lflags |= FG_CONFINED;
3074
3075 proc_fdlock(p);
3076 procfdtbl_releasefd(p, fd, NULL);
3077 fp_drop(p, fd, fp, 1);
3078 proc_fdunlock(p);
3079
3080 *retval = fd;
3081 return error;
3082 }
3083
3084 /*!
3085 * @function kqueue
3086 *
3087 * @brief
3088 * The kqueue syscall.
3089 */
3090 int
3091 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3092 {
3093 return kqueue_internal(p, NULL, NULL, retval);
3094 }
3095
3096 #pragma mark kqworkq allocation and deallocation
3097
3098 /*!
3099 * @function kqworkq_dealloc
3100 *
3101 * @brief
3102 * Deallocates a workqueue kqueue.
3103 *
3104 * @discussion
3105 * This only happens at process death, or for races with concurrent
3106 * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3107 * this kqueue: either there are none, or someone else took care of them.
3108 */
3109 void
3110 kqworkq_dealloc(struct kqworkq *kqwq)
3111 {
3112 kqueue_destroy(kqwq, kqworkq_zone);
3113 }
3114
3115 /*!
3116 * @function kqworkq_alloc
3117 *
3118 * @brief
3119 * Allocates a workqueue kqueue.
3120 *
3121 * @discussion
3122 * This is the slow path of kevent_get_kqwq.
3123 * This takes care of making sure procs have a single workq kqueue.
3124 */
3125 OS_NOINLINE
3126 static struct kqworkq *
3127 kqworkq_alloc(struct proc *p, unsigned int flags)
3128 {
3129 struct kqworkq *kqwq, *tmp;
3130
3131 kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);
3132
3133 assert((flags & KEVENT_FLAG_LEGACY32) == 0);
3134 if (flags & KEVENT_FLAG_LEGACY64) {
3135 kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
3136 } else {
3137 kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
3138 }
3139 kqwq->kqwq_p = p;
3140
3141 for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3142 TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
3143 TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
3144 }
3145 for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3146 /*
3147 * Because of how the bucketized system works, we mix overcommit
3148 		 * sources with non-overcommit ones: each time we move a knote from
3149 		 * one bucket to the next due to overrides, we'd have to track
3150 		 * overcommitness, and it's really not worth it in a world where
3151 		 * workloops track this faithfully.
3152 *
3153 * Incidentally, this behaves like the original manager-based
3154 * kqwq where event delivery always happened (hence is
3155 * "overcommit")
3156 */
3157 kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
3158 kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
3159 if (i != KQWQ_QOS_MANAGER) {
3160 kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
3161 }
3162 kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i + 1;
3163 }
3164
3165 kqueue_init(kqwq);
3166
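	/*
	 * Publish the fully initialized kqwq with release semantics; if another
	 * thread raced us and already published one, free ours and return the
	 * winner instead.
	 */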
3167 if (!os_atomic_cmpxchgv(&p->p_fd.fd_wqkqueue, NULL, kqwq, &tmp, release)) {
3168 kqworkq_dealloc(kqwq);
3169 return tmp;
3170 }
3171
3172 return kqwq;
3173 }
3174
3175 #pragma mark kqworkloop allocation and deallocation
3176
3177 #define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3178 #define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
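
/*
 * Worked example of KQ_HASH (illustrative): a dynamic ID of 0x1234 with a
 * hash mask of 0xff folds the second byte into the first,
 * (0x1234 ^ 0x12) & 0xff == 0x26, so IDs that differ only in their second
 * byte still spread across different buckets.
 */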
3179
3180 OS_ALWAYS_INLINE
3181 static inline void
3182 kqhash_lock(struct filedesc *fdp)
3183 {
3184 lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
3185 }
3186
3187 OS_ALWAYS_INLINE
3188 static inline void
3189 kqhash_unlock(struct filedesc *fdp)
3190 {
3191 lck_mtx_unlock(&fdp->fd_kqhashlock);
3192 }
3193
3194 OS_ALWAYS_INLINE
3195 static inline void
3196 kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3197 struct kqworkloop *kqwl)
3198 {
3199 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3200 LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3201 }
3202
3203 OS_ALWAYS_INLINE
3204 static inline struct kqworkloop *
3205 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3206 {
3207 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3208 struct kqworkloop *kqwl;
3209
3210 LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3211 if (kqwl->kqwl_dynamicid == id) {
3212 return kqwl;
3213 }
3214 }
3215 return NULL;
3216 }
3217
3218 static struct kqworkloop *
3219 kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
3220 {
3221 struct kqworkloop *kqwl = NULL;
3222
3223 kqhash_lock(fdp);
3224 if (__probable(fdp->fd_kqhash)) {
3225 kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
3226 if (kqwl && !kqworkloop_try_retain(kqwl)) {
3227 kqwl = NULL;
3228 }
3229 }
3230 kqhash_unlock(fdp);
3231 return kqwl;
3232 }
3233
3234 OS_NOINLINE
3235 static void
3236 kqworkloop_hash_init(struct filedesc *fdp)
3237 {
3238 struct kqwllist *alloc_hash;
3239 u_long alloc_mask;
3240
3241 kqhash_unlock(fdp);
3242 alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3243 kqhash_lock(fdp);
3244
3245 /* See if we won the race */
3246 if (__probable(fdp->fd_kqhashmask == 0)) {
3247 fdp->fd_kqhash = alloc_hash;
3248 fdp->fd_kqhashmask = alloc_mask;
3249 } else {
3250 kqhash_unlock(fdp);
3251 hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
3252 kqhash_lock(fdp);
3253 }
3254 }
3255
3256 /*
3257 * The kqueue iotier override is only supported for a kqueue that has
3258 * a single port as a mach port source. Updating the iotier
3259 * override on the mach port source will update the override
3260 * on the kqueue as well. Since a kqueue with an iotier override will
3261 * only have one port attached, there is no saturation logic as there
3262 * is for QoS overrides; the iotier override of the mach port source
3263 * is reflected directly in the kevent iotier override.
3264 */
3265 void
3266 kqueue_set_iotier_override(kqueue_t kqu, uint8_t iotier_override)
3267 {
3268 if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3269 return;
3270 }
3271
3272 struct kqworkloop *kqwl = kqu.kqwl;
3273 os_atomic_store(&kqwl->kqwl_iotier_override, iotier_override, relaxed);
3274 }
3275
3276 uint8_t
3277 kqueue_get_iotier_override(kqueue_t kqu)
3278 {
3279 if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3280 return THROTTLE_LEVEL_END;
3281 }
3282
3283 struct kqworkloop *kqwl = kqu.kqwl;
3284 return os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
3285 }
3286
3287 #if CONFIG_PREADOPT_TG
3288 /*
3289 * This function is called with a borrowed reference on the thread group, with
3290 * the mqueue lock held but without the kq lock. It may or may not hold the knote
3291 * lock (it is called from f_event as well as f_attach/f_touch). Upon success, an
3292 * additional reference on the TG is taken.
3293 */
3294 void
3295 kqueue_set_preadopted_thread_group(kqueue_t kqu, struct thread_group *tg, thread_qos_t qos)
3296 {
3297 if (!(kqu.kq->kq_state & KQ_WORKLOOP)) {
3298 KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_PREADOPT_NA),
3299 (uintptr_t)thread_tid(current_thread()), 0, 0, 0);
3300 return;
3301 }
3302
3303 struct kqworkloop *kqwl = kqu.kqwl;
3304
3305 assert(qos < THREAD_QOS_LAST);
3306
3307 thread_group_retain(tg);
3308
3309 thread_group_qos_t old_tg; thread_group_qos_t new_tg;
3310 int ret = os_atomic_rmw_loop(&kqwl->kqwl_preadopt_tg, old_tg, new_tg, relaxed, {
3311 if (!KQWL_CAN_ADOPT_PREADOPT_TG(old_tg)) {
3312 os_atomic_rmw_loop_give_up(break);
3313 }
3314
3315 if (old_tg != KQWL_PREADOPTED_TG_NULL) {
3316 /*
3317 * Note that old_tg could be a NULL TG pointer but with a QoS
3318 * set. See also workq_thread_reset_pri.
3319 *
3320 * Compare the QoS of existing preadopted tg with new one and
3321 * only overwrite the thread group if we have one with a higher
3322 * QoS.
3323 */
3324 thread_qos_t existing_qos = KQWL_GET_PREADOPTED_TG_QOS(old_tg);
3325 if (existing_qos >= qos) {
3326 os_atomic_rmw_loop_give_up(break);
3327 }
3328 }
3329
3330 // Transfer the ref taken earlier in the function to the kqwl
3331 new_tg = KQWL_ENCODE_PREADOPTED_TG_QOS(tg, qos);
3332 });
3333
3334 if (ret) {
3335 KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_INCOMING_IPC, old_tg, tg);
3336
3337 if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
3338 thread_group_deallocate_safe(KQWL_GET_PREADOPTED_TG(old_tg));
3339 }
3340
3341 os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE, release);
3342 } else {
3343 // We failed to write to the kqwl_preadopt_tg, drop the ref we took
3344 // earlier in the function
3345 thread_group_deallocate_safe(tg);
3346 }
3347 }
3348
3349 /*
3350 * Called from fprocess of EVFILT_MACHPORT without the kqueue lock held.
3351 */
3352 bool
3353 kqueue_process_preadopt_thread_group(thread_t thread, struct kqueue *kq, struct thread_group *tg)
3354 {
3355 bool success = false;
3356 if (kq->kq_state & KQ_WORKLOOP) {
3357 struct kqworkloop *kqwl = (struct kqworkloop *) kq;
3358 thread_group_qos_t old_tg;
3359 success = os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg,
3360 KQWL_PREADOPTED_TG_SENTINEL, KQWL_PREADOPTED_TG_PROCESSED,
3361 &old_tg, relaxed);
3362 if (success) {
3363 thread_set_preadopt_thread_group(thread, tg);
3364 } else if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
3365 /*
3366 * Technically the following set_preadopt should be a no-op since this
3367 * servicer thread preadopts kqwl's permanent tg at bind time.
3368 * See kqueue_threadreq_bind.
3369 */
3370 thread_set_preadopt_thread_group(thread, KQWL_GET_PREADOPTED_TG(old_tg));
3371 } else {
3372 assert(old_tg == KQWL_PREADOPTED_TG_PROCESSED ||
3373 old_tg == KQWL_PREADOPTED_TG_NEVER);
3374 }
3375 }
3376 return success;
3377 }
3378 #endif
3379
3380 /*!
3381 * @function kqworkloop_dealloc
3382 *
3383 * @brief
3384 * Deallocates a workloop kqueue.
3385 *
3386 * @discussion
3387 * Knotes hold references on the workloop, so we can't really reach this
3388 * function unless all of these are already gone.
3389 *
3390 * Nothing locked on entry or exit.
3391 *
3392 * @param hash_remove
3393 * Whether to remove the workloop from its hash table.
3394 */
3395 static void
3396 kqworkloop_dealloc(struct kqworkloop *kqwl, bool hash_remove)
3397 {
3398 thread_t cur_owner;
3399
3400 cur_owner = kqwl->kqwl_owner;
3401 if (cur_owner) {
3402 if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
3403 thread_drop_kevent_override(cur_owner);
3404 }
3405 thread_deallocate(cur_owner);
3406 kqwl->kqwl_owner = THREAD_NULL;
3407 }
3408
3409 if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
3410 struct turnstile *ts;
3411 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
3412 &ts, TURNSTILE_WORKLOOPS);
3413 turnstile_cleanup();
3414 turnstile_deallocate(ts);
3415 }
3416
3417 if (hash_remove) {
3418 struct filedesc *fdp = &kqwl->kqwl_p->p_fd;
3419
3420 kqhash_lock(fdp);
3421 LIST_REMOVE(kqwl, kqwl_hashlink);
3422 #if CONFIG_PROC_RESOURCE_LIMITS
3423 fdp->num_kqwls--;
3424 #endif
3425 kqhash_unlock(fdp);
3426 }
3427
3428 #if CONFIG_PREADOPT_TG
3429 thread_group_qos_t tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3430 if (KQWL_HAS_VALID_PREADOPTED_TG(tg)) {
3431 thread_group_release(KQWL_GET_PREADOPTED_TG(tg));
3432 }
3433 #endif
3434
3435 workq_threadreq_t kqr = &kqwl->kqwl_request;
3436 if ((kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) && kqr->tr_work_interval) {
3437 kern_work_interval_release(kqr->tr_work_interval);
3438 }
3439
3440 assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
3441 assert(kqwl->kqwl_owner == THREAD_NULL);
3442 assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
3443
3444 lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
3445 kqueue_destroy(kqwl, kqworkloop_zone);
3446 }
3447
3448 /*!
3449 * @function kqworkloop_init
3450 *
3451 * @brief
3452 * Initializes an allocated kqworkloop.
3453 */
3454 static void
3455 kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
3456 kqueue_id_t id, workq_threadreq_param_t *trp,
3457 struct workq_threadreq_extended_param_s *trp_extended)
3458 {
3459 kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
3460 os_ref_init_raw(&kqwl->kqwl_retains, NULL);
3461 kqwl->kqwl_dynamicid = id;
3462 kqwl->kqwl_p = p;
3463 if (trp) {
3464 kqwl->kqwl_params = trp->trp_value;
3465 }
3466
3467 workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
3468 if (trp) {
3469 if (trp->trp_flags & TRP_PRIORITY) {
3470 tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
3471 }
3472 if (trp->trp_flags & TRP_BOUND_THREAD) {
3473 tr_flags |= WORKQ_TR_FLAG_PERMANENT_BIND;
3474 }
3475 if (trp->trp_flags) {
3476 tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
3477 }
3478 }
3479 kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
3480 kqwl->kqwl_request.tr_flags = tr_flags;
3481 os_atomic_store(&kqwl->kqwl_iotier_override, (uint8_t)THROTTLE_LEVEL_END, relaxed);
3482 #if CONFIG_PREADOPT_TG
3483 if (trp_extended && trp_extended->trp_permanent_preadopt_tg) {
3484 /*
3485 * This kqwl is permanently configured with a thread group.
3486 * By using THREAD_QOS_LAST, we make sure kqueue_set_preadopted_thread_group
3487 * has no effect on kqwl_preadopt_tg. At this point, +1 ref on
3488 * trp_extended->trp_permanent_preadopt_tg is transferred to the kqwl.
3489 */
3490 thread_group_qos_t kqwl_preadopt_tg;
3491 kqwl_preadopt_tg = KQWL_ENCODE_PERMANENT_PREADOPTED_TG(trp_extended->trp_permanent_preadopt_tg);
3492 os_atomic_store(&kqwl->kqwl_preadopt_tg, kqwl_preadopt_tg, relaxed);
3493 } else if (task_is_app(current_task())) {
3494 /*
3495 * Not a specially preconfigured kqwl so it is open to participate in sync IPC
3496 		 * thread group preadoption; but apps will never adopt a thread group that
3497 * is not their own. This is a gross hack to simulate the post-process that
3498 * is done in the voucher subsystem today for thread groups.
3499 */
3500 os_atomic_store(&kqwl->kqwl_preadopt_tg, KQWL_PREADOPTED_TG_NEVER, relaxed);
3501 }
3502 #endif
3503 if (trp_extended) {
3504 if (trp_extended->trp_work_interval) {
3505 /*
3506 * The +1 ref on the work interval is transferred to the kqwl.
3507 */
3508 assert(tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
3509 kqwl->kqwl_request.tr_work_interval = trp_extended->trp_work_interval;
3510 }
3511 }
3512 for (int i = 0; i < KQWL_NBUCKETS; i++) {
3513 TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
3514 }
3515 TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
3516
3517 lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);
3518
3519 kqueue_init(kqwl);
3520 }
3521
3522 #if CONFIG_PROC_RESOURCE_LIMITS
3523 void
3524 kqworkloop_check_limit_exceeded(struct filedesc *fdp)
3525 {
3526 int num_kqwls = fdp->num_kqwls;
3527 if (!kqwl_above_soft_limit_notified(fdp) && fdp->kqwl_dyn_soft_limit > 0 &&
3528 num_kqwls > fdp->kqwl_dyn_soft_limit) {
3529 kqwl_above_soft_limit_send_notification(fdp);
3530 act_set_astproc_resource(current_thread());
3531 } else if (!kqwl_above_hard_limit_notified(fdp) && fdp->kqwl_dyn_hard_limit > 0
3532 && num_kqwls > fdp->kqwl_dyn_hard_limit) {
3533 kqwl_above_hard_limit_send_notification(fdp);
3534 act_set_astproc_resource(current_thread());
3535 }
3536 }
3537 #endif
3538
3539 /*!
3540 * @function kqworkloop_get_or_create
3541 *
3542 * @brief
3543 * Wrapper around kqworkloop_init that handles the uniquing of workloops.
3544 *
3545 * @returns
3546 * 0: success
3547 * EINVAL: invalid parameters
3548 * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3549 * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3550 * ENOMEM: allocation failed
3551 */
3552 static int
3553 kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
3554 workq_threadreq_param_t *trp,
3555 struct workq_threadreq_extended_param_s *trp_extended,
3556 unsigned int flags, struct kqworkloop **kqwlp)
3557 {
3558 struct filedesc *fdp = &p->p_fd;
3559 struct kqworkloop *alloc_kqwl = NULL;
3560 struct kqworkloop *kqwl = NULL;
3561 int error = 0;
3562
3563 assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3564
3565 if (id == 0 || id == (kqueue_id_t)-1) {
3566 return EINVAL;
3567 }
3568
3569 for (;;) {
3570 kqhash_lock(fdp);
3571 if (__improbable(fdp->fd_kqhash == NULL)) {
3572 kqworkloop_hash_init(fdp);
3573 }
3574
3575 kqwl = kqworkloop_hash_lookup_locked(fdp, id);
3576 if (kqwl) {
3577 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3578 /*
3579 * If MUST_NOT_EXIST was passed, even if we would have failed
3580 * the try_retain, it could have gone the other way, and
3581 * userspace can't tell. Let'em fix their race.
3582 */
3583 error = EEXIST;
3584 break;
3585 }
3586
3587 if (__probable(kqworkloop_try_retain(kqwl))) {
3588 /*
3589 * This is a valid live workloop!
3590 */
3591 *kqwlp = kqwl;
3592 error = 0;
3593 break;
3594 }
3595 }
3596
3597 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
3598 error = ENOENT;
3599 break;
3600 }
3601
3602 /*
3603 * We didn't find what we were looking for.
3604 *
3605 * If this is the second time we reach this point (alloc_kqwl != NULL),
3606 * then we're done.
3607 *
3608 * If this is the first time we reach this point (alloc_kqwl == NULL),
3609 * then try to allocate one without blocking.
3610 */
3611 if (__probable(alloc_kqwl == NULL)) {
3612 alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
3613 }
3614 if (__probable(alloc_kqwl)) {
3615 #if CONFIG_PROC_RESOURCE_LIMITS
3616 fdp->num_kqwls++;
3617 kqworkloop_check_limit_exceeded(fdp);
3618 #endif
3619 kqworkloop_init(alloc_kqwl, p, id, trp, trp_extended);
3620 /*
3621 * The newly allocated and initialized kqwl has a retain count of 1.
3622 */
3623 kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
3624 if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
3625 /*
3626 * If this kqworkloop is configured to be permanently bound to
3627 * a thread, we take +1 ref on that thread's behalf before we
3628 * unlock the kqhash below, because this new kqwl becomes
3629 * findable in the hash table as soon as we unlock the kqhash
3630 * and we want to make sure it does not get deleted out from
3631 * under us before we create a new thread and bind to it.
3632 *
3633 * This ref is released when the bound thread unbinds itself
3634 * from the kqwl on its way to termination.
3635 * See uthread_cleanup -> kqueue_threadreq_unbind.
3636 *
3637 * The kqwl now has a retain count of 2.
3638 */
3639 kqworkloop_retain(alloc_kqwl);
3640 }
3641 kqhash_unlock(fdp);
3642 /*
3643 * We do not want to keep holding kqhash lock when workq is
3644 * busy creating and initializing a new thread to bind to this
3645 * kqworkloop.
3646 */
3647 if (trp && (trp->trp_flags & TRP_BOUND_THREAD)) {
3648 error = workq_kern_threadreq_permanent_bind(p, &alloc_kqwl->kqwl_request);
3649 if (error != KERN_SUCCESS) {
3650 /*
3651 * The kqwl we just created and initialized has a retain
3652 * count of 2 at this point i.e. 1 from kqworkloop_init and
3653 * 1 on behalf of the bound thread. We need to release
3654 * both the references here to successfully deallocate this
3655 * kqwl before we return an error.
3656 *
3657 * The latter release should take care of deallocating
3658 * the kqwl itself and removing it from the kqhash.
3659 */
3660 kqworkloop_release(alloc_kqwl);
3661 kqworkloop_release(alloc_kqwl);
3662 alloc_kqwl = NULL;
3663 if (trp_extended) {
3664 /*
3665 * Since we transferred these refs to kqwl during
3666 * kqworkloop_init, the kqwl takes care of releasing them.
3667 * We don't have any refs to return to our caller
3668 * in this case.
3669 */
3670 #if CONFIG_PREADOPT_TG
3671 if (trp_extended->trp_permanent_preadopt_tg) {
3672 trp_extended->trp_permanent_preadopt_tg = NULL;
3673 }
3674 #endif
3675 if (trp_extended->trp_work_interval) {
3676 trp_extended->trp_work_interval = NULL;
3677 }
3678 }
3679 return error;
3680 } else {
3681 /*
3682 * For kqwl configured with a bound thread, KQ_SLEEP is used
3683 * to track whether the bound thread needs to be woken up
3684 * when such a kqwl is woken up.
3685 *
3686 * See kqworkloop_bound_thread_wakeup and
3687 * kqworkloop_bound_thread_park_prepost.
3688 *
3689 * Once the kqwl is initialized, this state
3690 * should always be manipulated under kqlock.
3691 */
3692 kqlock(alloc_kqwl);
3693 alloc_kqwl->kqwl_state |= KQ_SLEEP;
3694 kqunlock(alloc_kqwl);
3695 }
3696 }
3697 *kqwlp = alloc_kqwl;
3698 return 0;
3699 }
3700
3701 /*
3702 * We may have to block to allocate a workloop: drop the lock,
3703 * allocate one, and then retry the lookup, as someone else
3704 * could have raced with us in the meantime.
3705 */
3706 kqhash_unlock(fdp);
3707
3708 alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
3709 }
3710
3711 kqhash_unlock(fdp);
3712
3713 if (__improbable(alloc_kqwl)) {
3714 zfree(kqworkloop_zone, alloc_kqwl);
3715 }
3716
3717 return error;
3718 }
3719
3720 #pragma mark - knotes
3721
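/*
 * Placeholder filter operations: filt_no_attach refuses attachment with
 * ENOTSUP, filt_no_detach is a no-op, and the filt_bad_* routines panic if
 * they are ever invoked. They fill filter operation table slots for entry
 * points that are never expected to be reached.
 */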
3722 static int
3723 filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
3724 {
3725 knote_set_error(kn, ENOTSUP);
3726 return 0;
3727 }
3728
3729 static void
3730 filt_no_detach(__unused struct knote *kn)
3731 {
3732 }
3733
3734 static int __dead2
3735 filt_bad_event(struct knote *kn, long hint)
3736 {
3737 panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
3738 }
3739
3740 static int __dead2
3741 filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
3742 {
3743 panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3744 }
3745
3746 static int __dead2
3747 filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
3748 {
3749 panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3750 }
3751
3752 /*
3753 * knotes_dealloc - detach all knotes for the process and drop them
3754 *
3755 * The process is in a state where it will not try to allocate
3756 * any more knotes while this runs (it is stopped for exit or exec).
3757 */
3758 void
3759 knotes_dealloc(proc_t p)
3760 {
3761 struct filedesc *fdp = &p->p_fd;
3762 struct kqueue *kq;
3763 struct knote *kn;
3764 struct klist *kn_hash = NULL;
3765 u_long kn_hashmask;
3766 int i;
3767
3768 proc_fdlock(p);
3769
3770 /* Close all the fd-indexed knotes up front */
3771 if (fdp->fd_knlistsize > 0) {
3772 for (i = 0; i < fdp->fd_knlistsize; i++) {
3773 while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
3774 kq = knote_get_kq(kn);
3775 kqlock(kq);
3776 proc_fdunlock(p);
3777 knote_drop(kq, kn, NULL);
3778 proc_fdlock(p);
3779 }
3780 }
3781 /* free the table */
3782 kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
3783 }
3784 fdp->fd_knlistsize = 0;
3785
3786 proc_fdunlock(p);
3787
3788 knhash_lock(fdp);
3789
3790 /* Clean out all the hashed knotes as well */
3791 if (fdp->fd_knhashmask != 0) {
3792 for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
3793 while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
3794 kq = knote_get_kq(kn);
3795 kqlock(kq);
3796 knhash_unlock(fdp);
3797 knote_drop(kq, kn, NULL);
3798 knhash_lock(fdp);
3799 }
3800 }
3801 kn_hash = fdp->fd_knhash;
3802 kn_hashmask = fdp->fd_knhashmask;
3803 fdp->fd_knhashmask = 0;
3804 fdp->fd_knhash = NULL;
3805 }
3806
3807 knhash_unlock(fdp);
3808
3809 if (kn_hash) {
3810 hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
3811 }
3812 }
3813
3814 /*
3815 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3816 * scheduling parameters
3817 *
3818 * The process is in a state where it will not try to allocate
3819 * any more kqs or knotes while this runs (it is stopped for exit or exec).
3820 */
3821 void
3822 kqworkloops_dealloc(proc_t p)
3823 {
3824 struct filedesc *fdp = &p->p_fd;
3825 struct kqworkloop *kqwl, *kqwln;
3826 struct kqwllist tofree;
3827
3828 if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
3829 return;
3830 }
3831
3832 kqhash_lock(fdp);
3833
3834 if (fdp->fd_kqhashmask == 0) {
3835 kqhash_unlock(fdp);
3836 return;
3837 }
3838
3839 LIST_INIT(&tofree);
3840
3841 for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
3842 LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
3843 #if CONFIG_PREADOPT_TG
3844 /*
3845 * kqworkloops that have scheduling parameters have an
3846 * implicit retain from kqueue_workloop_ctl that needs
3847 * to be balanced on process exit.
3848 */
3849 __assert_only thread_group_qos_t preadopt_tg;
3850 preadopt_tg = os_atomic_load(&kqwl->kqwl_preadopt_tg, relaxed);
3851 #endif
3852 assert(kqwl->kqwl_params
3853 #if CONFIG_PREADOPT_TG
3854 || KQWL_HAS_PERMANENT_PREADOPTED_TG(preadopt_tg)
3855 #endif
3856 );
3857
3858 LIST_REMOVE(kqwl, kqwl_hashlink);
3859 LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3860 }
3861 }
3862 #if CONFIG_PROC_RESOURCE_LIMITS
3863 fdp->num_kqwls = 0;
3864 #endif
3865 kqhash_unlock(fdp);
3866
3867 LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3868 uint32_t ref = os_ref_get_count_raw(&kqwl->kqwl_retains);
3869 if (ref != 1) {
3870 panic("kq(%p) invalid refcount %d", kqwl, ref);
3871 }
3872 kqworkloop_dealloc(kqwl, false);
3873 }
3874 }
3875
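/*
 * kevent_register_validate_priority - validate the priority of an incoming
 * kevent against the kqueue it targets.
 *
 * For workloops, any enabled knote must carry a priority that maps to a
 * valid thread QoS; returns ERANGE otherwise, and 0 on success.
 */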
3876 static int
3877 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3878 struct kevent_qos_s *kev)
3879 {
3880 /* We don't care about the priority of a disabled or deleted knote */
3881 if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3882 return 0;
3883 }
3884
3885 if (kq->kq_state & KQ_WORKLOOP) {
3886 /*
3887 * Workloops need valid priorities with a QOS (excluding manager) for
3888 * any enabled knote.
3889 *
3890 * When it is pre-existing, just make sure it has a valid QoS as
3891 * kevent_register() will not use the incoming priority (filters that do
3892 * use it have the responsibility to validate it again, see filt_wltouch).
3893 *
3894 * If the knote is being made, validate the incoming priority.
3895 */
3896 if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3897 return ERANGE;
3898 }
3899 }
3900
3901 return 0;
3902 }
3903
3904 /*
3905 * Prepare a filter for waiting after register.
3906 *
3907 * The f_post_register_wait hook will be called later by kevent_register()
3908 * and should call kevent_register_wait_block()
3909 */
3910 static int
3911 kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
3912 {
3913 thread_t thread = current_thread();
3914
3915 assert(knote_fops(kn)->f_extended_codes);
3916
3917 if (kn->kn_thread == NULL) {
3918 thread_reference(thread);
3919 kn->kn_thread = thread;
3920 } else if (kn->kn_thread != thread) {
3921 /*
3922 * kn_thread may be set from a previous aborted wait.
3923 * However, it has to be from the same thread.
3924 */
3925 kev->flags |= EV_ERROR;
3926 kev->data = EXDEV;
3927 return 0;
3928 }
3929
3930 return FILTER_REGISTER_WAIT | rc;
3931 }
3932
3933 /*
3934 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3935 * aborted instead of properly woken up with thread_wakeup_thread().
3936 */
3937 static void
3938 kevent_register_wait_cleanup(struct knote *kn)
3939 {
3940 thread_t thread = kn->kn_thread;
3941 kn->kn_thread = NULL;
3942 thread_deallocate(thread);
3943 }
3944
3945 /*
3946 * Must be called at the end of a f_post_register_wait call from a filter.
3947 */
3948 static void
3949 kevent_register_wait_block(struct turnstile *ts, thread_t thread,
3950 thread_continue_t cont, struct _kevent_register *cont_args)
3951 {
3952 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
3953 kqunlock(cont_args->kqwl);
3954 cont_args->handoff_thread = thread;
3955 thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
3956 }
3957
3958 /*
3959 * Called by Filters using a f_post_register_wait to return from their wait.
3960 */
3961 static void
3962 kevent_register_wait_return(struct _kevent_register *cont_args)
3963 {
3964 struct kqworkloop *kqwl = cont_args->kqwl;
3965 struct kevent_qos_s *kev = &cont_args->kev;
3966 int error = 0;
3967
3968 if (cont_args->handoff_thread) {
3969 thread_deallocate(cont_args->handoff_thread);
3970 }
3971
3972 if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3973 if ((kev->flags & EV_ERROR) == 0) {
3974 kev->flags |= EV_ERROR;
3975 kev->data = 0;
3976 }
3977 error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3978 if (error == 0) {
3979 cont_args->eventout++;
3980 }
3981 }
3982
3983 kqworkloop_release(kqwl);
3984 if (error == 0) {
3985 *(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
3986 }
3987 unix_syscall_return(error);
3988 }
3989
3990 /*
3991 * kevent_register - add a new event to a kqueue
3992 *
3993 * Creates a mapping between the event source and
3994 * the kqueue via a knote data structure.
3995 *
3996 * Because many/most of the event sources are file
3997 * descriptor related, the knote is linked off
3998 * the file descriptor table for quick access.
3999 *
4000 * called with nothing locked
4001 * caller holds a reference on the kqueue
4002 */
4003
4004 int
4005 kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
4006 struct knote **kn_out)
4007 {
4008 struct proc *p = kq->kq_p;
4009 const struct filterops *fops;
4010 struct knote *kn = NULL;
4011 int result = 0, error = 0;
4012 unsigned short kev_flags = kev->flags;
4013 KNOTE_LOCK_CTX(knlc);
4014
4015 if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
4016 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
4017 } else {
4018 error = EINVAL;
4019 goto out;
4020 }
4021
4022 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
4023 if (__improbable((kev->flags & EV_VANISHED) &&
4024 (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
4025 error = EINVAL;
4026 goto out;
4027 }
4028
4029 /* Simplify the flags - delete and disable overrule */
4030 if (kev->flags & EV_DELETE) {
4031 kev->flags &= ~EV_ADD;
4032 }
4033 if (kev->flags & EV_DISABLE) {
4034 kev->flags &= ~EV_ENABLE;
4035 }
4036
4037 if (kq->kq_state & KQ_WORKLOOP) {
4038 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
4039 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4040 kev->udata, kev->flags, kev->filter);
4041 } else if (kq->kq_state & KQ_WORKQ) {
4042 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
4043 0, kev->udata, kev->flags, kev->filter);
4044 } else {
4045 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
4046 VM_KERNEL_UNSLIDE_OR_PERM(kq),
4047 kev->udata, kev->flags, kev->filter);
4048 }
4049
4050 restart:
4051 /* find the matching knote from the fd tables/hashes */
4052 kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
4053 error = kevent_register_validate_priority(kq, kn, kev);
4054 result = 0;
4055 if (error) {
4056 if (kn) {
4057 kqunlock(kq);
4058 }
4059 goto out;
4060 }
4061
4062 if (kn == NULL && (kev->flags & EV_ADD) == 0) {
4063 /*
4064 * No knote found, EV_ADD wasn't specified
4065 */
4066
4067 if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
4068 (kq->kq_state & KQ_WORKLOOP)) {
4069 /*
4070 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
4071 * that doesn't care about ENOENT, so just pretend the deletion
4072 * happened.
4073 */
4074 } else {
4075 error = ENOENT;
4076 }
4077 goto out;
4078 } else if (kn == NULL) {
4079 /*
4080 * No knote found, need to attach a new one (attach)
4081 */
4082
4083 struct fileproc *knote_fp = NULL;
4084
4085 /* grab a file reference for the new knote */
4086 if (fops->f_isfd) {
4087 if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
4088 goto out;
4089 }
4090 }
4091
4092 kn = knote_alloc();
4093 kn->kn_fp = knote_fp;
4094 kn->kn_is_fd = fops->f_isfd;
4095 kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
4096 kn->kn_status = 0;
4097
4098 /* was vanish support requested */
4099 if (kev->flags & EV_VANISHED) {
4100 kev->flags &= ~EV_VANISHED;
4101 kn->kn_status |= KN_REQVANISH;
4102 }
4103
4104 /* snapshot matching/dispatching protocol flags into knote */
4105 if (kev->flags & EV_DISABLE) {
4106 kn->kn_status |= KN_DISABLED;
4107 }
4108
4109 /*
4110 * copy the kevent state into knote
4111 * protocol is that fflags and data
4112 * are saved off, and cleared before
4113 * calling the attach routine.
4114 *
4115 * - kn->kn_sfflags aliases with kev->xflags
4116 * - kn->kn_sdata aliases with kev->data
4117 * - kn->kn_filter is the top 8 bits of kev->filter
4118 */
4119 kn->kn_kevent = *(struct kevent_internal_s *)kev;
4120 kn->kn_sfflags = kev->fflags;
4121 kn->kn_filtid = (uint8_t)~kev->filter;
4122 kn->kn_fflags = 0;
4123 knote_reset_priority(kq, kn, kev->qos);
4124
4125 /* Add the knote for lookup thru the fd table */
4126 error = kq_add_knote(kq, kn, &knlc, p);
4127 if (error) {
4128 knote_free(kn);
4129 if (knote_fp != NULL) {
4130 fp_drop(p, (int)kev->ident, knote_fp, 0);
4131 }
4132
4133 if (error == ERESTART) {
4134 goto restart;
4135 }
4136 goto out;
4137 }
4138
4139 /* fp reference count now applies to knote */
4140
4141 /*
4142 * we can't use filter_call() because f_attach can change the filter ops
4143 * for a filter that supports f_extended_codes, so we need to reload
4144 * knote_fops() and not use `fops`.
4145 */
4146 result = fops->f_attach(kn, kev);
4147 if (result && !knote_fops(kn)->f_extended_codes) {
4148 result = FILTER_ACTIVE;
4149 }
4150
4151 kqlock(kq);
4152
4153 if (result & FILTER_THREADREQ_NODEFEER) {
4154 enable_preemption();
4155 }
4156
4157 if (kn->kn_flags & EV_ERROR) {
4158 /*
4159 * Failed to attach correctly, so drop.
4160 */
4161 kn->kn_filtid = EVFILTID_DETACHED;
4162 error = (int)kn->kn_sdata;
4163 knote_drop(kq, kn, &knlc);
4164 result = 0;
4165 goto out;
4166 }
4167
4168 /*
4169 * end "attaching" phase - now just attached
4170 *
4171 * Mark the thread request overcommit, if appropriate
4172 *
4173 * If the attach routine indicated that an
4174 * event is already fired, activate the knote.
4175 */
4176 if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
4177 (kq->kq_state & KQ_WORKLOOP)) {
4178 kqworkloop_set_overcommit((struct kqworkloop *)kq);
4179 }
4180 } else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
4181 /*
4182 * The knote was dropped while we were waiting for the lock,
4183 * we need to re-evaluate entirely
4184 */
4185
4186 goto restart;
4187 } else if (kev->flags & EV_DELETE) {
4188 /*
4189 * Deletion of a knote (drop)
4190 *
4191 * If the filter wants to filter drop events, let it do so.
4192 *
4193 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
4194 * we must wait for the knote to be re-enabled (unless it is being
4195 * re-enabled atomically here).
4196 */
4197
4198 if (knote_fops(kn)->f_allow_drop) {
4199 bool drop;
4200
4201 kqunlock(kq);
4202 drop = knote_fops(kn)->f_allow_drop(kn, kev);
4203 kqlock(kq);
4204
4205 if (!drop) {
4206 goto out_unlock;
4207 }
4208 }
4209
4210 if ((kev->flags & EV_ENABLE) == 0 &&
4211 (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4212 (kn->kn_status & KN_DISABLED) != 0) {
4213 kn->kn_status |= KN_DEFERDELETE;
4214 error = EINPROGRESS;
4215 goto out_unlock;
4216 }
4217
4218 knote_drop(kq, kn, &knlc);
4219 goto out;
4220 } else {
4221 /*
4222 * Regular update of a knote (touch)
4223 *
4224 * Call touch routine to notify filter of changes in filter values
4225 * (and to re-determine if any events are fired).
4226 *
4227 * If the knote is in defer-delete, avoid calling the filter touch
4228 * routine (it has delivered its last event already).
4229 *
4230 * If the touch routine had no failure,
4231 * apply the requested side effects to the knote.
4232 */
4233
4234 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4235 if (kev->flags & EV_ENABLE) {
4236 result = FILTER_ACTIVE;
4237 }
4238 } else {
4239 kqunlock(kq);
4240 result = filter_call(knote_fops(kn), f_touch(kn, kev));
4241 kqlock(kq);
4242 if (result & FILTER_THREADREQ_NODEFEER) {
4243 enable_preemption();
4244 }
4245 }
4246
4247 if (kev->flags & EV_ERROR) {
4248 result = 0;
4249 goto out_unlock;
4250 }
4251
4252 if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
4253 kn->kn_udata != kev->udata) {
4254 // this allows klist_copy_udata() not to take locks
4255 os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
4256 }
4257 if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
4258 kn->kn_status |= KN_DISABLED;
4259 knote_dequeue(kq, kn);
4260 }
4261 }
4262
4263 /* accept new kevent state */
4264 knote_apply_touch(kq, kn, kev, result);
4265
4266 out_unlock:
4267 /*
4268 * When the filter asked for a post-register wait,
4269 * we leave the kqueue locked for kevent_register()
4270 * to call the filter's f_post_register_wait hook.
4271 */
4272 if (result & FILTER_REGISTER_WAIT) {
4273 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4274 *kn_out = kn;
4275 } else {
4276 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4277 }
4278
4279 out:
4280 /* output local errors through the kevent */
4281 if (error) {
4282 kev->flags |= EV_ERROR;
4283 kev->data = error;
4284 }
4285 return result;
4286 }
4287
4288 /*
4289 * knote_process - process a triggered event
4290 *
4291 * Validate that it is really still a triggered event
4292 * by calling the filter routines (if necessary). Hold
4293 * a use reference on the knote to avoid it being detached.
4294 *
4295 * If it is still considered triggered, we will have taken
4296 * a copy of the state under the filter lock. We use that
4297 * snapshot to dispatch the knote for future processing (or
4298 * not, if this was a lost event).
4299 *
4300 * Our caller assures us that nobody else can be processing
4301 * events from this knote during the whole operation. But
4302 * others can be touching or posting events to the knote
4303 * interspersed with our processing it.
4304 *
4305 * caller holds a reference on the kqueue.
4306 * kqueue locked on entry and exit - but may be dropped
4307 */
4308 static int
4309 knote_process(struct knote *kn, kevent_ctx_t kectx,
4310 kevent_callback_t callback)
4311 {
4312 struct kevent_qos_s kev;
4313 struct kqueue *kq = knote_get_kq(kn);
4314 KNOTE_LOCK_CTX(knlc);
4315 int result = FILTER_ACTIVE;
4316 int error = 0;
4317 bool drop = false;
4318
4319 /*
4320 * Must be active
4321 * Must be queued and not disabled/suppressed or dropping
4322 */
4323 assert(kn->kn_status & KN_QUEUED);
4324 assert(kn->kn_status & KN_ACTIVE);
4325 assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));
4326
4327 if (kq->kq_state & KQ_WORKLOOP) {
4328 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4329 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4330 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4331 kn->kn_filtid);
4332 } else if (kq->kq_state & KQ_WORKQ) {
4333 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4334 0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4335 kn->kn_filtid);
4336 } else {
4337 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4338 VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4339 kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
4340 }
4341
4342 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
4343 /*
4344 * When the knote is dropping or has dropped,
4345 * then there's nothing we want to process.
4346 */
4347 return EJUSTRETURN;
4348 }
4349
4350 /*
4351 * While waiting for the knote lock, we may have dropped the kq lock,
4352 * and a touch may have disabled and dequeued the knote.
4353 */
4354 if (!(kn->kn_status & KN_QUEUED)) {
4355 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4356 return EJUSTRETURN;
4357 }
4358
4359 /*
4360 * For deferred-drop or vanished events, we just create a fake
4361 * event to acknowledge end-of-life. Otherwise, we call the
4362 * filter's process routine to snapshot the kevent state under
4363 * the filter's locking protocol.
4364 *
4365 * suppress knotes to avoid returning the same event multiple times in
4366 * a single call.
4367 */
4368 knote_suppress(kq, kn);
4369
4370 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4371 uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
4372 if (kn->kn_status & KN_DEFERDELETE) {
4373 kev_flags |= EV_DELETE;
4374 } else {
4375 kev_flags |= EV_VANISHED;
4376 }
4377
4378 /* create fake event */
4379 kev = (struct kevent_qos_s){
4380 .filter = kn->kn_filter,
4381 .ident = kn->kn_id,
4382 .flags = kev_flags,
4383 .udata = kn->kn_udata,
4384 };
4385 } else {
4386 kqunlock(kq);
4387 kev = (struct kevent_qos_s) { };
4388 result = filter_call(knote_fops(kn), f_process(kn, &kev));
4389 kqlock(kq);
4390 }
4391
4392 /*
4393 * Determine how to dispatch the knote for future event handling.
4394 * not-fired: just return (do not callout, leave deactivated).
4395 * One-shot: If dispatch2, enter deferred-delete mode (unless this
4396 * is the deferred-delete event delivery itself). Otherwise,
4397 * drop it.
4398 * Dispatch: don't clear state, just mark it disabled.
4399 * Cleared: just leave it deactivated.
4400 * Others: re-activate as there may be more events to handle.
4401 * This will not wake up more handlers right now, but
4402 * at the completion of handling events it may trigger
4403 * more handler threads (TODO: optimize based on more than
4404 * just this one event being detected by the filter).
4405 */
4406 if ((result & FILTER_ACTIVE) == 0) {
4407 if ((kn->kn_status & KN_ACTIVE) == 0) {
4408 /*
4409 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
4410 * within f_process() but that doesn't necessarily make them
4411 * ready to process, so we should leave them be.
4412 *
4413 * For other knotes, since we will not return an event,
4414 * there's no point keeping the knote suppressed.
4415 */
4416 knote_unsuppress(kq, kn);
4417 }
4418 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4419 return EJUSTRETURN;
4420 }
4421
4422 if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
4423 knote_adjust_qos(kq, kn, result);
4424 }
4425
4426 if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
4427 kqueue_update_iotier_override(kq);
4428 }
4429
4430 kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
4431
4432 if (kev.flags & EV_ONESHOT) {
4433 if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4434 (kn->kn_status & KN_DEFERDELETE) == 0) {
4435 /* defer dropping non-delete oneshot dispatch2 events */
4436 kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
4437 } else {
4438 drop = true;
4439 }
4440 } else if (kn->kn_flags & EV_DISPATCH) {
4441 /* disable all dispatch knotes */
4442 kn->kn_status |= KN_DISABLED;
4443 } else if ((kn->kn_flags & EV_CLEAR) == 0) {
4444 /* re-activate in case there are more events */
4445 knote_activate(kq, kn, FILTER_ACTIVE);
4446 }
4447
4448 /*
4449 * callback to handle each event as we find it.
4450 * If we have to detach and drop the knote, do
4451 * it while we have the kq unlocked.
4452 */
4453 if (drop) {
4454 knote_drop(kq, kn, &knlc);
4455 } else {
4456 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4457 }
4458
4459 if (kev.flags & EV_VANISHED) {
4460 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
4461 kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4462 kn->kn_filtid);
4463 }
4464
4465 error = (callback)(&kev, kectx);
4466 kqlock(kq);
4467 return error;
4468 }
4469
4470 /*
4471 * Returns -1 if the kqueue was unbound and processing should not happen
4472 */
4473 #define KQWQAE_BEGIN_PROCESSING 1
4474 #define KQWQAE_END_PROCESSING 2
4475 #define KQWQAE_UNBIND 3
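/*
 * kqworkq_acknowledge_events - return suppressed knotes for this QoS bucket
 * to their queues and decide whether the servicer thread should unbind.
 *
 * The thread unbinds when explicitly asked to (KQWQAE_UNBIND) or when it is
 * parking (KEVENT_FLAG_PARKING) with an empty queue; if events remain after
 * unbinding, a new thread request is initiated. Returns -1 when it unbound,
 * 0 otherwise.
 */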
4476 static int
4477 kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
4478 int kevent_flags, int kqwqae_op)
4479 {
4480 struct knote *kn;
4481 int rc = 0;
4482 bool unbind;
4483 struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index - 1];
4484 struct kqtailq *queue = &kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
4485
4486 kqlock_held(&kqwq->kqwq_kqueue);
4487
4488 /*
4489 * Return suppressed knotes to their original state.
4490 * For workq kqueues, suppressed ones that are still
4491 * truly active (not just forced into the queue) will
4492 * set flags we check below to see if anything got
4493 * woken up.
4494 */
4495 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4496 knote_unsuppress(kqwq, kn);
4497 }
4498
4499 if (kqwqae_op == KQWQAE_UNBIND) {
4500 unbind = true;
4501 } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
4502 unbind = false;
4503 } else {
4504 unbind = TAILQ_EMPTY(queue);
4505 }
4506 if (unbind) {
4507 thread_t thread = kqr_thread_fast(kqr);
4508 thread_qos_t old_override;
4509
4510 #if MACH_ASSERT
4511 thread_t self = current_thread();
4512 struct uthread *ut = get_bsdthread_info(self);
4513
4514 assert(thread == self);
4515 assert(ut->uu_kqr_bound == kqr);
4516 #endif // MACH_ASSERT
4517
4518 old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
4519 if (!TAILQ_EMPTY(queue)) {
4520 /*
4521 * Request a new thread if we didn't process the whole
4522 * queue.
4523 */
4524 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
4525 kqr->tr_kq_qos_index, 0);
4526 }
4527 if (old_override) {
4528 thread_drop_kevent_override(thread);
4529 }
4530 rc = -1;
4531 }
4532
4533 return rc;
4534 }
4535
4536 /*
4537 * Return 0 to indicate that processing should proceed,
4538 * -1 if there is nothing to process.
4539 *
4540 * Called with kqueue locked and returns the same way,
4541 * but may drop lock temporarily.
4542 */
4543 static int
4544 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4545 int kevent_flags)
4546 {
4547 int rc = 0;
4548
4549 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4550 0, kqr->tr_kq_qos_index);
4551
4552 rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4553 KQWQAE_BEGIN_PROCESSING);
4554
4555 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4556 thread_tid(kqr_thread(kqr)),
4557 !TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
4558
4559 return rc;
4560 }
4561
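/*
 * kqworkloop_acknowledge_events - return suppressed knotes to their queues.
 *
 * Knotes that can adjust QoS and are only disabled because of the automatic
 * EV_DISPATCH behavior stay suppressed so their overrides keep pushing;
 * returns the maximum QoS override among the knotes kept suppressed.
 */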
4562 static thread_qos_t
4563 kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4564 {
4565 kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4566 struct knote *kn, *tmp;
4567
4568 kqlock_held(kqwl);
4569
4570 TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
4571 /*
4572 * If a knote that can adjust QoS is disabled because of the automatic
4573 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
4574 * further overrides keep pushing.
4575 */
4576 if (knote_fops(kn)->f_adjusts_qos &&
4577 (kn->kn_status & KN_DISABLED) != 0 &&
4578 (kn->kn_status & KN_DROPPING) == 0 &&
4579 (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
4580 qos = MAX(qos, kn->kn_qos_override);
4581 continue;
4582 }
4583 knote_unsuppress(kqwl, kn);
4584 }
4585
4586 return qos;
4587 }
4588
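/*
 * kqworkloop_begin_processing - mark the workloop as being processed.
 *
 * Returns 0 to indicate that processing should proceed, or -1 when the
 * servicer was unbound (or should park) and processing should not happen.
 */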
4589 static int
4590 kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
4591 {
4592 workq_threadreq_t kqr = &kqwl->kqwl_request;
4593 struct kqueue *kq = &kqwl->kqwl_kqueue;
4594 int rc = 0, op = KQWL_UTQ_NONE;
4595
4596 kqlock_held(kq);
4597
4598 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
4599 kqwl->kqwl_dynamicid, 0, 0);
4600
4601 /* nobody else should still be processing */
4602 assert((kq->kq_state & KQ_PROCESSING) == 0);
4603
4604 kq->kq_state |= KQ_PROCESSING;
4605
4606 if (kevent_flags & KEVENT_FLAG_PARKING) {
4607 /*
4608 * When "parking" we want to process events and if no events are found
4609 * unbind. (Except for WORKQ_TR_FLAG_PERMANENT_BIND where the soft unbind
4610 * and bound thread park happen in the caller.)
4611 *
4612 * However, non overcommit threads sometimes park even when they have
4613 * more work so that the pool can narrow. For these, we need to unbind
4614 * early, so that calling kqworkloop_update_threads_qos() can ask the
4615 * workqueue subsystem whether the thread should park despite having
4616 * pending events.
4617 *
4618 */
4619 if (kqr->tr_flags & (WORKQ_TR_FLAG_OVERCOMMIT | WORKQ_TR_FLAG_PERMANENT_BIND)) {
4620 op = KQWL_UTQ_PARKING;
4621 } else {
4622 op = KQWL_UTQ_UNBINDING;
4623 }
4624 } else if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
4625 op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
4626 }
4627
4628 if (op != KQWL_UTQ_NONE) {
4629 thread_qos_t qos_override;
4630 thread_t thread = kqr_thread_fast(kqr);
4631
4632 qos_override = kqworkloop_acknowledge_events(kqwl);
4633
4634 if (op == KQWL_UTQ_UNBINDING) {
4635 kqworkloop_unbind_locked(kqwl, thread,
4636 KQWL_OVERRIDE_DROP_IMMEDIATELY, 0);
4637 kqworkloop_release_live(kqwl);
4638 }
4639 kqworkloop_update_threads_qos(kqwl, op, qos_override);
4640 if (op == KQWL_UTQ_PARKING &&
4641 (!kqwl->kqwl_count || kqwl->kqwl_owner)) {
4642 if ((kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) &&
4643 (!(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND))) {
4644 kqworkloop_unbind_locked(kqwl, thread,
4645 KQWL_OVERRIDE_DROP_DELAYED, 0);
4646 kqworkloop_release_live(kqwl);
4647 }
4648 rc = -1; /* Indicate that begin processing should stop. */
4649 } else if (op == KQWL_UTQ_UNBINDING &&
4650 kqr_thread(kqr) != thread) {
4651 rc = -1; /* Indicate that begin processing should stop. */
4652 }
4653
4654 if (rc == -1) {
4655 kq->kq_state &= ~KQ_PROCESSING;
4656 if (kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND) {
4657 goto done;
4658 }
4659 kqworkloop_unbind_delayed_override_drop(thread);
4660 }
4661 }
4662 done:
4663 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
4664 kqwl->kqwl_dynamicid, 0, 0);
4665
4666 return rc;
4667 }
4668
4669 /*
4670 * Return 0 to indicate that processing should proceed,
4671 * -1 if there is nothing to process.
4672 * EBADF if the kqueue is draining
4673 *
4674 * Called with kqueue locked and returns the same way,
4675 * but may drop lock temporarily.
4676 * May block.
4677 */
4678 static int
4679 kqfile_begin_processing(struct kqfile *kq)
4680 {
4681 kqlock_held(kq);
4682
4683 assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4684 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
4685 VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4686
4687 /* wait to become the exclusive processing thread */
4688 while ((kq->kqf_state & (KQ_PROCESSING | KQ_DRAIN)) == KQ_PROCESSING) {
4689 kq->kqf_state |= KQ_PROCWAIT;
4690 lck_spin_sleep(&kq->kqf_lock, LCK_SLEEP_DEFAULT,
4691 &kq->kqf_suppressed, THREAD_UNINT | THREAD_WAIT_NOREPORT);
4692 }
4693
4694 if (kq->kqf_state & KQ_DRAIN) {
4695 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4696 VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
4697 return EBADF;
4698 }
4699
4700 /* Nobody else processing */
4701
4702 /* anything left to process? */
4703 if (kq->kqf_count == 0) {
4704 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4705 VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
4706 return -1;
4707 }
4708
4709 /* convert to processing mode */
4710 kq->kqf_state |= KQ_PROCESSING;
4711
4712 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4713 VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4714 return 0;
4715 }
4716
4717 /*
4718 * Try to end the processing, only called when a workq thread is attempting to
4719 * park (KEVENT_FLAG_PARKING is set).
4720 *
4721 * When returning -1, the kqworkq is setup again so that it is ready to be
4722 * processed.
4723 */
4724 static int
4725 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4726 int kevent_flags)
4727 {
4728 if (kevent_flags & KEVENT_FLAG_PARKING) {
4729 /*
4730 * If acknowledging events "succeeds" (returns 0), it means there are
4731 * still events queued, which is a failure condition for end_processing.
4732 */
4733 int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4734 KQWQAE_END_PROCESSING);
4735 if (rc == 0) {
4736 return -1;
4737 }
4738 }
4739
4740 return 0;
4741 }
4742
4743 /*
4744 * Try to end the processing, only called when a workq thread is attempting to
4745 * park (KEVENT_FLAG_PARKING is set).
4746 *
4747 * When returning -1, the kqworkq is setup again so that it is ready to be
4748 * processed (as if kqworkloop_begin_processing had just been called).
4749 *
4750 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4751 * the kqworkloop is unbound from its servicer as a side effect.
4752 */
4753 static int
4754 kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
4755 {
4756 struct kqueue *kq = &kqwl->kqwl_kqueue;
4757 workq_threadreq_t kqr = &kqwl->kqwl_request;
4758 int rc = 0;
4759
4760 kqlock_held(kq);
4761
4762 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
4763 kqwl->kqwl_dynamicid, 0, 0);
4764
4765 if (kevent_flags & KEVENT_FLAG_PARKING) {
4766 thread_t thread = kqr_thread_fast(kqr);
4767 thread_qos_t qos_override;
4768
4769 /*
4770 * When KEVENT_FLAG_PARKING is set, we need to attempt
4771 * an unbind while still under the lock.
4772 *
4773 * So we do everything kqworkloop_unbind() would do, but because
4774 * we're inside kqueue_process(), if the workloop actually
4775 * received events while our locks were dropped, we have
4776 * the opportunity to fail the end processing and loop again.
4777 *
4778 * This avoids going through the process-wide workqueue lock
4779 * hence scales better.
4780 */
4781 assert(flags & KQ_PROCESSING);
4782 qos_override = kqworkloop_acknowledge_events(kqwl);
4783 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
4784
4785 if (kqwl->kqwl_wakeup_qos && !kqwl->kqwl_owner) {
4786 rc = -1; /* To indicate we should continue processing. */
4787 } else {
4788 if (kqr_thread_permanently_bound(kqr)) {
4789 /*
4790 * For these, the actual soft unbind and bound thread park
4791 * happen in the caller.
4792 */
4793 kq->kq_state &= ~flags;
4794 } else {
4795 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
4796 kqworkloop_release_live(kqwl);
4797 kq->kq_state &= ~flags;
4798 kqworkloop_unbind_delayed_override_drop(thread);
4799 }
4800 }
4801 } else {
4802 kq->kq_state &= ~flags;
4803 kq->kq_state |= KQ_R2K_ARMED;
4804 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
4805 }
4806
4807 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
4808 kqwl->kqwl_dynamicid, 0, 0);
4809
4810 return rc;
4811 }
4812
4813 /*
4814 * Called with kqueue lock held.
4815 *
4816 * 0: no more events
4817 * -1: has more events
4818 * EBADF: kqueue is in draining mode
4819 */
4820 static int
4821 kqfile_end_processing(struct kqfile *kq)
4822 {
4823 struct knote *kn;
4824 int procwait;
4825
4826 kqlock_held(kq);
4827
4828 assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4829
4830 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
4831 VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4832
4833 /*
4834 * Return suppressed knotes to their original state.
4835 */
4836 while ((kn = TAILQ_FIRST(&kq->kqf_suppressed)) != NULL) {
4837 knote_unsuppress(kq, kn);
4838 }
4839
4840 procwait = (kq->kqf_state & KQ_PROCWAIT);
4841 kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
4842
4843 if (procwait) {
4844 /* first wake up any thread already waiting to process */
4845 thread_wakeup(&kq->kqf_suppressed);
4846 }
4847
4848 if (kq->kqf_state & KQ_DRAIN) {
4849 return EBADF;
4850 }
4851 return kq->kqf_count != 0 ? -1 : 0;
4852 }
4853
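/*
 * kqueue_workloop_ctl_internal - create or destroy a dynamic kqworkloop
 * configured with scheduling parameters (priority, policy, CPU percent,
 * work interval, bound thread).
 *
 * A hypothetical userspace caller (sketch only; this is a private interface
 * normally driven by the pthread/dispatch runtime) would fill a struct
 * kqueue_workloop_params with the workloop id, the desired
 * KQ_WORKLOOP_CREATE_* flags and their values, and pass KQ_WORKLOOP_CREATE
 * or KQ_WORKLOOP_DESTROY as the command.
 */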
4854 static int
4855 kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4856 struct kqueue_workloop_params *params, int *retval)
4857 {
4858 int error = 0;
4859 struct kqworkloop *kqwl;
4860 struct filedesc *fdp = &p->p_fd;
4861 workq_threadreq_param_t trp = { };
4862 struct workq_threadreq_extended_param_s trp_extended = {0};
4863 integer_t trp_preadopt_priority = 0;
4864 integer_t trp_preadopt_policy = 0;
4865
4866 switch (cmd) {
4867 case KQ_WORKLOOP_CREATE:
4868 if (!params->kqwlp_flags) {
4869 error = EINVAL;
4870 break;
4871 }
4872
4873 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4874 (params->kqwlp_sched_pri < 1 ||
4875 params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
4876 error = EINVAL;
4877 break;
4878 }
4879
4880 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4881 invalid_policy(params->kqwlp_sched_pol)) {
4882 error = EINVAL;
4883 break;
4884 }
4885
4886 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4887 (params->kqwlp_cpu_percent <= 0 ||
4888 params->kqwlp_cpu_percent > 100 ||
4889 params->kqwlp_cpu_refillms <= 0 ||
4890 params->kqwlp_cpu_refillms > 0x00ffffff)) {
4891 error = EINVAL;
4892 break;
4893 }
4894
4895 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WITH_BOUND_THREAD) {
4896 if (!bootarg_thread_bound_kqwl_support_enabled) {
4897 error = ENOTSUP;
4898 break;
4899 }
4900 trp.trp_flags |= TRP_BOUND_THREAD;
4901 }
4902
4903 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_WORK_INTERVAL) {
4904 /*
4905 * This flag serves the purpose of preadopting tg from work interval
4906 * on servicer/creator/bound thread at wakeup/creation time in kernel.
4907 *
4908 * Additionally, it helps the bound thread join the work interval
4909 * before it comes out to userspace for the first time.
4910 */
4911 struct work_interval *work_interval = NULL;
4912 kern_return_t kr;
4913
4914 kr = kern_port_name_to_work_interval(params->kqwl_wi_port,
4915 &work_interval);
4916 if (kr != KERN_SUCCESS) {
4917 error = EINVAL;
4918 break;
4919 }
4920 /* work_interval has a +1 ref */
4921
4922 kr = kern_work_interval_get_policy(work_interval,
4923 &trp_preadopt_policy,
4924 &trp_preadopt_priority);
4925 if (kr != KERN_SUCCESS) {
4926 kern_work_interval_release(work_interval);
4927 error = EINVAL;
4928 break;
4929 }
4930 /* The work interval comes with a scheduling policy. */
4931 if (trp_preadopt_policy) {
4932 trp.trp_flags |= TRP_POLICY;
4933 trp.trp_pol = (uint8_t)trp_preadopt_policy;
4934
4935 trp.trp_flags |= TRP_PRIORITY;
4936 trp.trp_pri = (uint8_t)trp_preadopt_priority;
4937 }
4938 #if CONFIG_PREADOPT_TG
4939 kr = kern_work_interval_get_thread_group(work_interval,
4940 &trp_extended.trp_permanent_preadopt_tg);
4941 if (kr != KERN_SUCCESS) {
4942 kern_work_interval_release(work_interval);
4943 error = EINVAL;
4944 break;
4945 }
4946 /*
4947 * On KERN_SUCCESS, we take a +1 ref on the thread group backing
4948 * this work interval via kern_work_interval_get_thread_group and
4949 * pass it on to the kqwl.
4950 * If, for whatever reason, kqworkloop_get_or_create fails and we
4951 * get this ref back, we release it before returning.
4952 */
4953 #endif
4954 if (trp.trp_flags & TRP_BOUND_THREAD) {
4955 /*
4956 * For TRP_BOUND_THREAD, we pass a +1 ref on the work_interval on to
4957 * the kqwl so the bound thread can join it before coming out to
4958 * userspace.
4959 * If, for whatever reason, kqworkloop_get_or_create fails and we
4960 * get this ref back, we release it before returning.
4961 */
4962 trp_extended.trp_work_interval = work_interval;
4963 } else {
4964 kern_work_interval_release(work_interval);
4965 }
4966 }
4967
4968 if (!(trp.trp_flags & (TRP_POLICY | TRP_PRIORITY))) {
4969 /*
4970 * We always prefer the scheduling policy + priority that come with
4971 * a work interval. If they do not exist, we fall back to what the user
4972 * has asked for.
4973 */
4974 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4975 trp.trp_flags |= TRP_PRIORITY;
4976 trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4977 }
4978 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4979 trp.trp_flags |= TRP_POLICY;
4980 trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4981 }
4982 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4983 trp.trp_flags |= TRP_CPUPERCENT;
4984 trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4985 trp.trp_refillms = params->kqwlp_cpu_refillms;
4986 }
4987 }
4988
4989 #if CONFIG_PREADOPT_TG
4990 if ((trp.trp_flags == 0) &&
4991 (trp_extended.trp_permanent_preadopt_tg == NULL)) {
4992 #else
4993 if (trp.trp_flags == 0) {
4994 #endif
4995 error = EINVAL;
4996 break;
4997 }
4998
4999 error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
5000 &trp_extended,
5001 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5002 KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
5003 if (error) {
5004 /* kqworkloop_get_or_create did not consume these refs. */
5005 #if CONFIG_PREADOPT_TG
5006 if (trp_extended.trp_permanent_preadopt_tg) {
5007 thread_group_release(trp_extended.trp_permanent_preadopt_tg);
5008 }
5009 #endif
5010 if (trp_extended.trp_work_interval) {
5011 kern_work_interval_release(trp_extended.trp_work_interval);
5012 }
5013 break;
5014 }
5015
5016 if (!fdt_flag_test(fdp, FD_WORKLOOP)) {
5017 /* FD_WORKLOOP indicates that the process has ever created a workloop
5018 * via this syscall; the flag is only ever added to a process, never
5019 * removed.
5020 */
5021 proc_fdlock(p);
5022 fdt_flag_set(fdp, FD_WORKLOOP);
5023 proc_fdunlock(p);
5024 }
5025 break;
5026 case KQ_WORKLOOP_DESTROY:
5027 error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL, NULL,
5028 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
5029 KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
5030 if (error) {
5031 break;
5032 }
5033 kqlock(kqwl);
5034 trp.trp_value = kqwl->kqwl_params;
5035 if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
5036 trp.trp_flags |= TRP_RELEASED;
5037 kqwl->kqwl_params = trp.trp_value;
5038 if (trp.trp_flags & TRP_BOUND_THREAD) {
5039 kqworkloop_bound_thread_wakeup(kqwl);
5040 }
5041 kqworkloop_release_live(kqwl);
5042 } else {
5043 error = EINVAL;
5044 }
5045 kqunlock(kqwl);
5046 kqworkloop_release(kqwl);
5047 break;
5048 }
5049 *retval = 0;
5050 return error;
5051 }
5052
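/*
 * kqueue_workloop_ctl - system call entry point.
 *
 * The params struct is versioned by its size: userspace stores the size it
 * was built against in kqwlp_version and passes the same value in uap->sz,
 * and only MIN(sizeof(params), uap->sz) bytes are copied in, which lets the
 * struct grow over time.
 */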
5053 int
5054 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
5055 {
5056 struct kqueue_workloop_params params = {
5057 .kqwlp_id = 0,
5058 };
5059 if (uap->sz < sizeof(params.kqwlp_version)) {
5060 return EINVAL;
5061 }
5062
5063 size_t copyin_sz = MIN(sizeof(params), uap->sz);
5064 int rv = copyin(uap->addr, &params, copyin_sz);
5065 if (rv) {
5066 return rv;
5067 }
5068
5069 if (params.kqwlp_version != (int)uap->sz) {
5070 return EINVAL;
5071 }
5072
5073 return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
5074 retval);
5075 }
5076
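/*
 * kqueue_select - select/poll support for kqueue file descriptors.
 *
 * Only FREAD is meaningful: a kqueue is considered readable when it has
 * pending events. When it has none and is not draining, the calling thread
 * is recorded via selrecord() so it can be woken up later.
 */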
5077 static int
5078 kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
5079 {
5080 struct kqfile *kq = (struct kqfile *)fp_get_data(fp);
5081 int retnum = 0;
5082
5083 assert((kq->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5084
5085 if (which == FREAD) {
5086 kqlock(kq);
5087 if (kqfile_begin_processing(kq) == 0) {
5088 retnum = kq->kqf_count;
5089 kqfile_end_processing(kq);
5090 } else if ((kq->kqf_state & KQ_DRAIN) == 0) {
5091 selrecord(kq->kqf_p, &kq->kqf_sel, wql);
5092 }
5093 kqunlock(kq);
5094 }
5095 return retnum;
5096 }
5097
5098 /*
5099 * kqueue_close - called when the kqueue file is closed; tears down the kqueue.
5100 */
5101 static int
5102 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
5103 {
5104 struct kqfile *kqf = fg_get_data(fg);
5105
5106 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5107 kqlock(kqf);
5108 selthreadclear(&kqf->kqf_sel);
5109 kqunlock(kqf);
5110 kqueue_dealloc(&kqf->kqf_kqueue);
5111 fg_set_data(fg, NULL);
5112 return 0;
5113 }
5114
5115 /*
5116 * Max depth of the nested kq path that can be created.
5117 * Note that this has to be less than what kq_level can represent
5118 * to avoid wrapping around and mislabeling the level. We also
5119 * want to be aggressive about this so that we don't overflow the
5120 * kernel stack while posting kevents.
5121 */
5122 #define MAX_NESTED_KQ 10
5123
5124 /*
5125 * The caller has taken a use-count reference on this kqueue and will donate it
5126 * to the kqueue we are being added to. This keeps the kqueue from closing until
5127 * that relationship is torn down.
5128 */
5129 static int
5130 kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
5131 __unused struct kevent_qos_s *kev)
5132 {
5133 struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5134 struct kqueue *kq = &kqf->kqf_kqueue;
5135 struct kqueue *parentkq = knote_get_kq(kn);
5136
5137 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5138
5139 if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
5140 knote_set_error(kn, EINVAL);
5141 return 0;
5142 }
5143
5144 /*
5145 * We have to avoid creating a cycle when nesting kqueues
5146 * inside another. Rather than trying to walk the whole
5147 * potential DAG of nested kqueues, we just use a simple
5148 * ceiling protocol. When a kqueue is inserted into another,
5149 * we check that the (future) parent is not already nested
5150 * into another kqueue at a lower level than the potential
5151 * child (because it could indicate a cycle). If that test
5152 * passes, we just mark the nesting levels accordingly.
5153 *
5154 * Only up to MAX_NESTED_KQ can be nested.
5155 *
5156 * Note: kqworkq and kqworkloop cannot be nested and have reused their
5157 * kq_level field, so ignore these as parent.
5158 */
5159
5160 kqlock(parentkq);
5161
5162 if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
5163 if (parentkq->kq_level > 0 &&
5164 parentkq->kq_level < kq->kq_level) {
5165 kqunlock(parentkq);
5166 knote_set_error(kn, EINVAL);
5167 return 0;
5168 }
5169
5170 /* set parent level appropriately */
5171 uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
5172 if (plevel < kq->kq_level + 1) {
5173 if (kq->kq_level + 1 > MAX_NESTED_KQ) {
5174 kqunlock(parentkq);
5175 knote_set_error(kn, EINVAL);
5176 return 0;
5177 }
5178 plevel = kq->kq_level + 1;
5179 }
5180
5181 parentkq->kq_level = plevel;
5182 }
5183
5184 kqunlock(parentkq);
5185
5186 kn->kn_filtid = EVFILTID_KQREAD;
5187 kqlock(kq);
5188 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
5189 /* indicate nesting in child, if needed */
5190 if (kq->kq_level == 0) {
5191 kq->kq_level = 1;
5192 }
5193
5194 int count = kq->kq_count;
5195 kqunlock(kq);
5196 return count > 0;
5197 }
5198
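/*
 * kqfile_wakeup - wake up everything waiting on a kqfile: select() waiters,
 * threads sleeping in kqueue_scan(), and, unless the hint is NOTE_REVOKE,
 * any parent kqueues or select sets this kqueue is nested inside. On
 * NOTE_REVOKE, threads waiting their turn to process are woken up instead.
 */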
5199 __attribute__((noinline))
5200 static void
5201 kqfile_wakeup(struct kqfile *kqf, long hint, wait_result_t wr)
5202 {
5203 /* wakeup a thread waiting on this queue */
5204 selwakeup(&kqf->kqf_sel);
5205
5206 /* wake up threads in kqueue_scan() */
5207 if (kqf->kqf_state & KQ_SLEEP) {
5208 kqf->kqf_state &= ~KQ_SLEEP;
5209 thread_wakeup_with_result(&kqf->kqf_count, wr);
5210 }
5211
5212 if (hint == NOTE_REVOKE) {
5213 /* wakeup threads waiting their turn to process */
5214 if (kqf->kqf_state & KQ_PROCWAIT) {
5215 assert(kqf->kqf_state & KQ_PROCESSING);
5216 kqf->kqf_state &= ~KQ_PROCWAIT;
5217 thread_wakeup(&kqf->kqf_suppressed);
5218 }
5219
5220 /* no need to KNOTE: knote_fdclose() takes care of it */
5221 } else {
5222 /* wakeup other kqueues/select sets we're inside */
5223 KNOTE(&kqf->kqf_sel.si_note, hint);
5224 }
5225 }
5226
5227 /*
5228 * kqueue_drain - called when kq is closed
5229 */
5230 static int
5231 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
5232 {
5233 struct kqfile *kqf = (struct kqfile *)fp_get_data(fp);
5234
5235 assert((kqf->kqf_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5236
5237 kqlock(kqf);
5238 kqf->kqf_state |= KQ_DRAIN;
5239 kqfile_wakeup(kqf, NOTE_REVOKE, THREAD_RESTART);
5240 kqunlock(kqf);
5241 return 0;
5242 }
5243
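/*
 * kqueue_stat - fill in a stat/stat64 buffer for a kqueue.
 *
 * st_size reports the number of pending events and st_blksize the size of
 * the kevent structure flavor this kqueue traffics in.
 */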
5244 int
5245 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
5246 {
5247 assert((kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
5248
5249 kqlock(kq);
5250 if (isstat64 != 0) {
5251 struct stat64 *sb64 = (struct stat64 *)ub;
5252
5253 bzero((void *)sb64, sizeof(*sb64));
5254 sb64->st_size = kq->kq_count;
5255 if (kq->kq_state & KQ_KEV_QOS) {
5256 sb64->st_blksize = sizeof(struct kevent_qos_s);
5257 } else if (kq->kq_state & KQ_KEV64) {
5258 sb64->st_blksize = sizeof(struct kevent64_s);
5259 } else if (IS_64BIT_PROCESS(p)) {
5260 sb64->st_blksize = sizeof(struct user64_kevent);
5261 } else {
5262 sb64->st_blksize = sizeof(struct user32_kevent);
5263 }
5264 sb64->st_mode = S_IFIFO;
5265 } else {
5266 struct stat *sb = (struct stat *)ub;
5267
5268 bzero((void *)sb, sizeof(*sb));
5269 sb->st_size = kq->kq_count;
5270 if (kq->kq_state & KQ_KEV_QOS) {
5271 sb->st_blksize = sizeof(struct kevent_qos_s);
5272 } else if (kq->kq_state & KQ_KEV64) {
5273 sb->st_blksize = sizeof(struct kevent64_s);
5274 } else if (IS_64BIT_PROCESS(p)) {
5275 sb->st_blksize = sizeof(struct user64_kevent);
5276 } else {
5277 sb->st_blksize = sizeof(struct user32_kevent);
5278 }
5279 sb->st_mode = S_IFIFO;
5280 }
5281 kqunlock(kq);
5282 return 0;
5283 }
5284
5285 static inline bool
5286 kqueue_threadreq_can_use_ast(struct kqueue *kq)
5287 {
5288 if (current_proc() == kq->kq_p) {
5289 /*
5290 * Setting an AST from a non-BSD syscall is unsafe: mach_msg_trap() can
5291 * do combined send/receive and in the case of self-IPC, the AST may get
5292 * set on a thread that will not return to userspace and needs the
5293 * thread the AST would create to unblock itself.
5294 *
5295 * At this time, we really want to target:
5296 *
5297 * - kevent variants that can cause thread creations, and dispatch
5298 * really only uses kevent_qos and kevent_id,
5299 *
5300 * - workq_kernreturn (directly about thread creations)
5301 *
5302 * - bsdthread_ctl which is used for qos changes and has direct impact
5303 * on the creator thread scheduling decisions.
5304 */
5305 switch (current_uthread()->syscall_code) {
5306 case SYS_kevent_qos:
5307 case SYS_kevent_id:
5308 case SYS_workq_kernreturn:
5309 case SYS_bsdthread_ctl:
5310 return true;
5311 }
5312 }
5313 return false;
5314 }
5315
5316 /*
5317 * Interact with the pthread kext to request a servicing there at a specific QoS
5318 * level.
5319 *
5320 * - Caller holds the kqlock
5321 *
5322 * - May be called with the kqueue's wait queue set locked,
5323 * so cannot do anything that could recurse on that.
5324 */
5325 static void
5326 kqueue_threadreq_initiate(kqueue_t kqu, workq_threadreq_t kqr,
5327 kq_index_t qos, int flags)
5328 {
5329 assert(kqr_thread(kqr) == THREAD_NULL);
5330 assert(!kqr_thread_requested(kqr));
5331 struct turnstile *ts = TURNSTILE_NULL;
5332
5333 if (workq_is_exiting(kqu.kq->kq_p)) {
5334 return;
5335 }
5336
5337 kqlock_held(kqu);
5338
5339 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5340 struct kqworkloop *kqwl = kqu.kqwl;
5341
5342 assert(kqwl->kqwl_owner == THREAD_NULL);
5343 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5344 kqwl->kqwl_dynamicid, 0, qos, kqwl->kqwl_wakeup_qos);
5345 ts = kqwl->kqwl_turnstile;
5346 /* Add a thread request reference on the kqueue. */
5347 kqworkloop_retain(kqwl);
5348
5349 #if CONFIG_PREADOPT_TG
5350 thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5351 &kqwl->kqwl_preadopt_tg, relaxed);
5352 if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5353 /*
5354 * This kqwl has been permanently configured with a thread group.
5355 * See kqworkloops with scheduling parameters.
5356 */
5357 flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5358 } else {
5359 /*
5360 * This thread is the one which is ack-ing the thread group on the kqwl
5361 * under the kqlock and will take action accordingly, pairs with the
5362 * release barrier in kqueue_set_preadopted_thread_group
5363 */
5364 uint16_t tg_acknowledged;
5365 if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive,
5366 KQWL_PREADOPT_TG_NEEDS_REDRIVE, KQWL_PREADOPT_TG_CLEAR_REDRIVE,
5367 &tg_acknowledged, acquire)) {
5368 flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5369 }
5370 }
5371 #endif
5372 } else {
5373 assert(kqu.kq->kq_state & KQ_WORKQ);
5374 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), -1, 0, qos,
5375 !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5376 }
5377
5378 /*
5379 * New-style thread request supported.
5380 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
5381 * its use until a corresponding kqueue_threadreq_bind callback.
5382 */
5383 if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5384 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5385 }
5386 if (qos == KQWQ_QOS_MANAGER) {
5387 qos = WORKQ_THREAD_QOS_MANAGER;
5388 }
5389
5390 if (!workq_kern_threadreq_initiate(kqu.kq->kq_p, kqr, ts, qos, flags)) {
5391 /*
5392 * Process is shutting down or exec'ing.
5393 * All the kqueues are going to be cleaned up
5394 * soon. Forget we even asked for a thread -
5395 * and make sure we don't ask for more.
5396 */
5397 kqu.kq->kq_state &= ~KQ_R2K_ARMED;
5398 kqueue_release_live(kqu);
5399 }
5400 }
5401
5402 /*
5403 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5404 *
5405 * This is used when kqueue_threadreq_bind may cause a lock inversion.
5406 */
5407 __attribute__((always_inline))
5408 void
5409 kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
5410 struct uthread *ut)
5411 {
5412 ut->uu_kqr_bound = kqr;
5413 kqr->tr_thread = get_machthread(ut);
5414 kqr->tr_state = WORKQ_TR_STATE_BINDING;
5415 }
5416
5417 /*
5418 * kqueue_threadreq_bind_commit - commit a bind prepost
5419 *
5420 * The workq code has to commit any binding prepost before the thread has
5421 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5422 */
5423 void
5424 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5425 {
5426 struct uthread *ut = get_bsdthread_info(thread);
5427 workq_threadreq_t kqr = ut->uu_kqr_bound;
5428 kqueue_t kqu = kqr_kqueue(p, kqr);
5429
5430 kqlock(kqu);
5431 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5432 kqueue_threadreq_bind(p, kqr, thread, 0);
5433 }
5434 kqunlock(kqu);
5435 }
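/*
 * Illustrative sketch, assuming a simplified workq-side caller (locking and
 * error handling elided): the prepost publishes the binding while the thread
 * cannot run yet, and the commit must happen before that thread can return
 * to userspace or be aborted.
 *
 *	kqueue_threadreq_bind_prepost(p, kqr, uth);
 *	// ... drop workq locks, finish setting the thread up ...
 *	kqueue_threadreq_bind_commit(p, get_machthread(uth));
 */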
5436
5437 void
5438 kqworkloop_bound_thread_terminate(workq_threadreq_t kqr,
5439 uint16_t *uu_workq_flags_orig)
5440 {
5441 struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
5442 struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5443
5444 assert(uth == current_uthread());
5445
5446 kqlock(kqwl);
5447
5448 *uu_workq_flags_orig = uth->uu_workq_flags;
5449
5450 uth->uu_workq_flags &= ~UT_WORKQ_NEW;
5451 uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_JOINED;
5452 uth->uu_workq_flags &= ~UT_WORKQ_WORK_INTERVAL_FAILED;
5453
5454 workq_kern_bound_thread_reset_pri(NULL, uth);
5455
5456 kqunlock(kqwl);
5457 }
5458
5459 /*
5460 * This is called from kqueue_process with kqlock held.
5461 */
5462 __attribute__((noreturn, noinline))
5463 static void
5464 kqworkloop_bound_thread_park(struct kqworkloop *kqwl, thread_t thread)
5465 {
5466 assert(thread == current_thread());
5467
5468 kqlock_held(kqwl);
5469
5470 assert(!kqwl->kqwl_count);
5471
5472 /*
5473 * kevent entry points will take a reference on workloops so we need to
5474 * undo it before we park for good.
5475 */
5476 kqworkloop_release_live(kqwl);
5477
5478 workq_threadreq_t kqr = &kqwl->kqwl_request;
5479 workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
5480
5481 if (trp.trp_flags & TRP_RELEASED) {
5482 /*
5483 * We need this check since the kqlock is dropped and retaken
5484 * multiple times during kqueue_process and because KQ_SLEEP is not
5485 * set, kqworkloop_bound_thread_wakeup is going to be a no-op.
5486 */
5487 kqunlock(kqwl);
5488 workq_kern_bound_thread_terminate(kqr);
5489 } else {
5490 kqworkloop_unbind_locked(kqwl,
5491 thread, KQWL_OVERRIDE_DROP_DELAYED, KQUEUE_THREADREQ_UNBIND_SOFT);
5492 workq_kern_bound_thread_park(kqr);
5493 }
5494 __builtin_unreachable();
5495 }
5496
5497 /*
5498 * A helper function for pthread workqueue subsystem.
5499 *
5500 * This is used to keep the work that the workq code needs to do after
5501 * the bound thread's assert_wait to a minimum.
5502 */
5503 void
5504 kqworkloop_bound_thread_park_prepost(workq_threadreq_t kqr)
5505 {
5506 assert(current_thread() == kqr->tr_thread);
5507
5508 struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5509
5510 kqlock_held(kqwl);
5511
5512 kqwl->kqwl_state |= KQ_SLEEP;
5513
5514 /* uu_kqueue_override is protected under kqlock. */
5515 kqworkloop_unbind_delayed_override_drop(kqr->tr_thread);
5516
5517 kqunlock(kqwl);
5518 }
5519
5520 /*
5521 * A helper function for pthread workqueue subsystem.
5522 *
5523 * This is used to keep the work that the workq code needs to do after
5524 * the bound thread's assert_wait to a minimum.
5525 */
5526 void
5527 kqworkloop_bound_thread_park_commit(workq_threadreq_t kqr,
5528 event_t event,
5529 thread_continue_t continuation)
5530 {
5531 assert(current_thread() == kqr->tr_thread);
5532
5533 struct kqworkloop *kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5534 struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
5535
5536 kqlock(kqwl);
5537 if (!(kqwl->kqwl_state & KQ_SLEEP)) {
5538 /*
5539 * When we dropped the kqlock to unset the voucher, someone came
5540 * around and made us runnable. But because we weren't waiting on the
5541 * event their thread_wakeup() was ineffectual. To correct for that,
5542 * we just run the continuation ourselves.
5543 */
5544 assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)));
5545 if (uth->uu_workq_flags & UT_WORKQ_DYING) {
5546 __assert_only workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(kqr);
5547 assert(trp.trp_flags & TRP_RELEASED);
5548 }
5549 kqunlock(kqwl);
5550 continuation(NULL, THREAD_AWAKENED);
5551 } else {
5552 assert((uth->uu_workq_flags & (UT_WORKQ_RUNNING | UT_WORKQ_DYING)) == 0);
5553 thread_set_pending_block_hint(get_machthread(uth),
5554 kThreadWaitParkedBoundWorkQueue);
5555 assert_wait(event, THREAD_INTERRUPTIBLE);
5556 kqunlock(kqwl);
5557 thread_block(continuation);
5558 }
5559 }
5560
5561 static void
5562 kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
5563 workq_kern_threadreq_flags_t flags)
5564 {
5565 assert(kqr_thread_requested_pending(kqr));
5566
5567 kqlock_held(kqu);
5568
5569 if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5570 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5571 }
5572
5573 #if CONFIG_PREADOPT_TG
5574 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5575 struct kqworkloop *kqwl = kqu.kqwl;
5576 thread_group_qos_t kqwl_preadopt_tg = os_atomic_load(
5577 &kqwl->kqwl_preadopt_tg, relaxed);
5578 if (KQWL_HAS_PERMANENT_PREADOPTED_TG(kqwl_preadopt_tg)) {
5579 /*
5580 * This kqwl has been permanently configured with a thread group.
5581 * See kqworkloops with scheduling parameters.
5582 */
5583 flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5584 } else {
5585 uint16_t tg_ack_status;
5586 /*
5587 * This thread is the one which is ack-ing the thread group on the kqwl
5588 * under the kqlock and will take action accordingly, needs acquire
5589 * barrier.
5590 */
5591 if (os_atomic_cmpxchgv(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_NEEDS_REDRIVE,
5592 KQWL_PREADOPT_TG_CLEAR_REDRIVE, &tg_ack_status, acquire)) {
5593 flags |= WORKQ_THREADREQ_REEVALUATE_PREADOPT_TG;
5594 }
5595 }
5596 }
5597 #endif
5598
5599 workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
5600 }
5601
5602 /*
5603 * kqueue_threadreq_bind - bind thread to processing kqrequest
5604 *
5605 * The provided thread will be responsible for delivering events
5606 * associated with the given kqrequest. Bind it and get ready for
5607 * the thread to eventually arrive.
5608 */
5609 void
5610 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5611 unsigned int flags)
5612 {
5613 kqueue_t kqu = kqr_kqueue(p, kqr);
5614 struct uthread *ut = get_bsdthread_info(thread);
5615
5616 kqlock_held(kqu);
5617
5618 assert(ut->uu_kqueue_override == 0);
5619
5620 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5621 assert(ut->uu_kqr_bound == kqr);
5622 assert(kqr->tr_thread == thread);
5623 } else if (kqr->tr_state == WORKQ_TR_STATE_BOUND) {
5624 assert(flags & KQUEUE_THREADREQ_BIND_SOFT);
5625 assert(kqr_thread_permanently_bound(kqr));
5626 } else {
5627 assert(kqr_thread_requested_pending(kqr));
5628 assert(kqr->tr_thread == THREAD_NULL);
5629 assert(ut->uu_kqr_bound == NULL);
5630 ut->uu_kqr_bound = kqr;
5631 kqr->tr_thread = thread;
5632 }
5633
5634 kqr->tr_state = WORKQ_TR_STATE_BOUND;
5635
5636 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5637 struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5638
5639 if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5640 /*
5641 * <rdar://problem/38626999> shows that asserting here is not ok.
5642 *
5643 * This is not supposed to happen for correct use of the interface,
5644 * but it is sadly possible for userspace (with the help of memory
5645 * corruption, such as over-release of a dispatch queue) to make
5646 * the creator thread the "owner" of a workloop.
5647 *
5648 * Once that happens, and that creator thread picks up the same
5649 * workloop as a servicer, we trip this codepath. We need to fixup
5650 * the state to forget about this thread being the owner, as the
5651 * entire workloop state machine expects servicers to never be
5652 * owners and everything would basically go downhill from here.
5653 */
5654 kqu.kqwl->kqwl_owner = THREAD_NULL;
5655 if (kqworkloop_override(kqu.kqwl)) {
5656 thread_drop_kevent_override(thread);
5657 }
5658 }
5659
5660 if (ts && (flags & KQUEUE_THREADREQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5661 /*
5662 * Past this point, the interlock is the kq req lock again,
5663 * so we can fix the inheritor for good.
5664 */
5665 filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5666 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5667 }
5668
5669 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5670 thread_tid(thread), kqr->tr_kq_qos_index,
5671 (kqr->tr_kq_override_index << 16) | kqu.kqwl->kqwl_wakeup_qos);
5672
5673 ut->uu_kqueue_override = kqr->tr_kq_override_index;
5674 if (kqr->tr_kq_override_index) {
5675 thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5676 }
5677
5678 #if CONFIG_PREADOPT_TG
5679 /* Remove reference from kqwl and mark it as bound with the SENTINEL */
5680 thread_group_qos_t old_tg;
5681 thread_group_qos_t new_tg;
5682 int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
5683 if ((old_tg == KQWL_PREADOPTED_TG_NEVER) || KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5684 /*
5685 * Either an app or a kqwl permanently configured with a thread group.
5686 * Nothing to do.
5687 */
5688 os_atomic_rmw_loop_give_up(break);
5689 }
5690 assert(old_tg != KQWL_PREADOPTED_TG_PROCESSED);
5691 new_tg = KQWL_PREADOPTED_TG_SENTINEL;
5692 });
5693
5694 if (ret) {
5695 KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqu.kqwl, KQWL_PREADOPT_OP_SERVICER_BIND, old_tg, new_tg);
5696
5697 if (KQWL_HAS_VALID_PREADOPTED_TG(old_tg)) {
5698 struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5699 assert(tg != NULL);
5700
5701 thread_set_preadopt_thread_group(thread, tg);
5702 thread_group_release_live(tg); // The thread has a reference
5703 } else {
5704 /*
5705 * The thread may already have a preadopt thread group on it -
5706 * we need to make sure to clear that.
5707 */
5708 thread_set_preadopt_thread_group(thread, NULL);
5709 }
5710
5711 /* We have taken action on the preadopted thread group set on the
5712 * kqwl, so clear any redrive requests */
5713 os_atomic_store(&kqu.kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5714 } else {
5715 if (KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
5716 struct thread_group *tg = KQWL_GET_PREADOPTED_TG(old_tg);
5717 assert(tg != NULL);
5718 /*
5719 * For KQUEUE_THREADREQ_BIND_SOFT, technically the following
5720 * set_preadopt should be a no-op since this bound servicer thread
5721 * preadopts the kqwl's permanent tg at its initial bind time and
5722 * never leaves it until its termination.
5723 */
5724 thread_set_preadopt_thread_group(thread, tg);
5725 /*
5726 * From this point on, kqwl and thread both have +1 ref on this tg.
5727 */
5728 }
5729 }
5730 #endif
5731 kqueue_update_iotier_override(kqu);
5732 } else {
5733 assert(kqr->tr_kq_override_index == 0);
5734
5735 #if CONFIG_PREADOPT_TG
5736 /*
5737 * The thread may have a preadopt thread group on it already because it
5738 * got tagged with it as a creator thread. So we need to make sure to
5739 * clear that since we don't have preadopt thread groups for non-kqwl
5740 * cases
5741 */
5742 thread_set_preadopt_thread_group(thread, NULL);
5743 #endif
5744 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5745 thread_tid(thread), kqr->tr_kq_qos_index,
5746 (kqr->tr_kq_override_index << 16) |
5747 !TAILQ_EMPTY(&kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1]));
5748 }
5749 }
5750
5751 /*
5752 * kqueue_threadreq_cancel - abort a pending thread request
5753 *
5754 * Called when exiting/exec'ing. Forget our pending request.
5755 */
5756 void
5757 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5758 {
5759 kqueue_release(kqr_kqueue(p, kqr));
5760 }
5761
5762 workq_threadreq_param_t
5763 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5764 {
5765 struct kqworkloop *kqwl;
5766 workq_threadreq_param_t trp;
5767
5768 assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5769 kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5770 trp.trp_value = kqwl->kqwl_params;
5771 return trp;
5772 }
5773
5774 /*
5775 * kqueue_threadreq_unbind - unbind thread from processing kqueue
5776 *
5777 * End processing the per-QoS bucket of events and allow other threads
5778 * to be requested for future servicing.
5779 *
5780 * caller holds a reference on the kqueue.
5781 */
5782 void
5783 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5784 {
5785 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5786 kqworkloop_unbind(kqr_kqworkloop(kqr));
5787 } else {
5788 kqworkq_unbind(p, kqr);
5789 }
5790 }
5791
5792 /*
5793 * If we aren't already busy processing events [for this QoS],
5794 * request workq thread support as appropriate.
5795 *
5796 * TBD - for now, we don't segregate out processing by QoS.
5797 *
5798 * - May be called with the kqueue's wait queue set locked,
5799 * so cannot do anything that could recurse on that.
5800 */
5801 static void
5802 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5803 {
5804 workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5805
5806 /* convert to thread qos value */
5807 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
5808
5809 if (!kqr_thread_requested(kqr)) {
5810 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5811 }
5812 }
5813
5814 /*
5815 * This represents the asynchronous QoS a given workloop contributes,
5816 * hence is the max of the current active knotes (override index)
5817 * and the workloop max qos (userspace async qos).
5818 */
5819 static kq_index_t
5820 kqworkloop_override(struct kqworkloop *kqwl)
5821 {
5822 workq_threadreq_t kqr = &kqwl->kqwl_request;
5823 return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5824 }
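/*
 * Worked example (values illustrative): with tr_kq_qos_index set to
 * THREAD_QOS_UTILITY by userspace and tr_kq_override_index raised to
 * THREAD_QOS_USER_INITIATED by an active knote, kqworkloop_override()
 * returns THREAD_QOS_USER_INITIATED; once the override drains back to
 * THREAD_QOS_UNSPECIFIED, it returns THREAD_QOS_UTILITY again.
 */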
5825
5826 static inline void
5827 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5828 {
5829 workq_threadreq_t kqr = &kqwl->kqwl_request;
5830
5831 kqlock_held(kqwl);
5832
5833 if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5834 kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5835 act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5836 }
5837 }
5838
5839 static void
5840 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
5841 {
5842 workq_threadreq_t kqr = &kqwl->kqwl_request;
5843 struct kqueue *kq = &kqwl->kqwl_kqueue;
5844 kq_index_t old_override = kqworkloop_override(kqwl);
5845
5846 kqlock_held(kqwl);
5847
5848 switch (op) {
5849 case KQWL_UTQ_UPDATE_WAKEUP_QOS:
5850 kqwl->kqwl_wakeup_qos = qos;
5851 kqworkloop_request_fire_r2k_notification(kqwl);
5852 goto recompute;
5853
5854 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
5855 kqr->tr_kq_override_index = qos;
5856 goto recompute;
5857
5858 case KQWL_UTQ_PARKING:
5859 case KQWL_UTQ_UNBINDING:
5860 kqr->tr_kq_override_index = qos;
5861 OS_FALLTHROUGH;
5862
5863 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
5864 if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
5865 assert(qos == THREAD_QOS_UNSPECIFIED);
5866 }
5867 if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5868 kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5869 }
5870 kqwl->kqwl_wakeup_qos = 0;
5871 for (kq_index_t i = KQWL_NBUCKETS; i > 0; i--) {
5872 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i - 1])) {
5873 kqwl->kqwl_wakeup_qos = i;
5874 kqworkloop_request_fire_r2k_notification(kqwl);
5875 break;
5876 }
5877 }
5878 OS_FALLTHROUGH;
5879
5880 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
5881 recompute:
5882 /*
5883 * When modifying the wakeup QoS or the override QoS, we always need to
5884 * maintain our invariant that kqr_override_index is at least as large
5885 * as the highest QoS for which an event is fired.
5886 *
5887 * However this override index can be larger when there is an overridden
5888 * suppressed knote pushing on the kqueue.
5889 */
5890 if (qos < kqwl->kqwl_wakeup_qos) {
5891 qos = kqwl->kqwl_wakeup_qos;
5892 }
5893 if (kqr->tr_kq_override_index < qos) {
5894 kqr->tr_kq_override_index = qos;
5895 }
5896 break;
5897
5898 case KQWL_UTQ_REDRIVE_EVENTS:
5899 break;
5900
5901 case KQWL_UTQ_SET_QOS_INDEX:
5902 kqr->tr_kq_qos_index = qos;
5903 break;
5904
5905 default:
5906 panic("unknown kqwl thread qos update operation: %d", op);
5907 }
5908
5909 thread_t kqwl_owner = kqwl->kqwl_owner;
5910 thread_t servicer = kqr_thread(kqr);
5911 boolean_t qos_changed = FALSE;
5912 kq_index_t new_override = kqworkloop_override(kqwl);
5913
5914 /*
5915 * Apply the diffs to the owner if applicable
5916 */
5917 if (kqwl_owner) {
5918 #if 0
5919 /* JMM - need new trace hooks for owner overrides */
5920 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
5921 kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
5922 (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
5923 #endif
5924 if (new_override == old_override) {
5925 // nothing to do
5926 } else if (old_override == THREAD_QOS_UNSPECIFIED) {
5927 thread_add_kevent_override(kqwl_owner, new_override);
5928 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5929 thread_drop_kevent_override(kqwl_owner);
5930 } else { /* old_override != new_override */
5931 thread_update_kevent_override(kqwl_owner, new_override);
5932 }
5933 }
5934
5935 /*
5936 * apply the diffs to the servicer
5937 */
5938
5939 if (!kqr_thread_requested(kqr)) {
5940 /*
5941 * No servicer, nor thread-request
5942 *
5943 * Make a new thread request, unless there is an owner (or the workloop
5944 * is suspended in userland) or if there is no asynchronous work in the
5945 * first place.
5946 */
5947
5948 if (kqwl_owner == NULL && kqwl->kqwl_wakeup_qos) {
5949 int initiate_flags = 0;
5950 if (op == KQWL_UTQ_UNBINDING) {
5951 initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
5952 }
5953
5954 /* kqueue_threadreq_initiate handles the acknowledgement of the TG
5955 * if needed */
5956 kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
5957 }
5958 } else if (servicer) {
5959 /*
5960 * Servicer in flight
5961 *
5962 * Just apply the diff to the servicer
5963 */
5964
5965 #if CONFIG_PREADOPT_TG
5966 /* When there's a servicer for the kqwl already, then the servicer will
5967 * adopt the thread group in the kqr, we don't need to poke the
5968 * workqueue subsystem to make different decisions due to the thread
5969 * group. Consider the current request ack-ed.
5970 */
5971 os_atomic_store(&kqwl->kqwl_preadopt_tg_needs_redrive, KQWL_PREADOPT_TG_CLEAR_REDRIVE, relaxed);
5972 #endif
5973
5974 if (kqr_thread_permanently_bound(kqr) && (kqwl->kqwl_state & KQ_SLEEP)) {
5975 kqr->tr_qos = new_override;
5976 workq_kern_bound_thread_reset_pri(kqr, get_bsdthread_info(servicer));
5977 } else {
5978 struct uthread *ut = get_bsdthread_info(servicer);
5979 if (ut->uu_kqueue_override != new_override) {
5980 if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
5981 thread_add_servicer_override(servicer, new_override);
5982 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5983 thread_drop_servicer_override(servicer);
5984 } else { /* ut->uu_kqueue_override != new_override */
5985 thread_update_servicer_override(servicer, new_override);
5986 }
5987 ut->uu_kqueue_override = new_override;
5988 qos_changed = TRUE;
5989 }
5990 }
5991 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5992 /*
5993 * No events to deliver anymore.
5994 *
5995 * However canceling with turnstiles is challenging, so the fact that
5996 * the request isn't useful will be discovered by the servicer itself
5997 * later on.
5998 */
5999 } else if (old_override != new_override) {
6000 /*
6001 * Request is in flight
6002 *
6003 * Apply the diff to the thread request.
6004 */
6005 kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
6006 qos_changed = TRUE;
6007 }
6008
6009 if (qos_changed) {
6010 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
6011 thread_tid(servicer), kqr->tr_kq_qos_index,
6012 (kqr->tr_kq_override_index << 16) | kqwl->kqwl_wakeup_qos);
6013 }
6014 }
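/*
 * Worked example of the owner/servicer "diff" application above (values
 * illustrative): if old_override was THREAD_QOS_UNSPECIFIED and new_override
 * is THREAD_QOS_UTILITY, an override is added; if it goes from
 * THREAD_QOS_UTILITY back to THREAD_QOS_UNSPECIFIED, the override is
 * dropped; and a THREAD_QOS_UTILITY to THREAD_QOS_USER_INITIATED change is
 * applied as an update, so a single override is maintained and adjusted
 * rather than stacked.
 */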
6015
6016 static void
6017 kqworkloop_update_iotier_override(struct kqworkloop *kqwl)
6018 {
6019 workq_threadreq_t kqr = &kqwl->kqwl_request;
6020 thread_t servicer = kqr_thread(kqr);
6021 uint8_t iotier = os_atomic_load(&kqwl->kqwl_iotier_override, relaxed);
6022
6023 kqlock_held(kqwl);
6024
6025 if (servicer) {
6026 thread_update_servicer_iotier_override(servicer, iotier);
6027 }
6028 }
6029
6030 static void
6031 kqworkloop_bound_thread_wakeup(struct kqworkloop *kqwl)
6032 {
6033 workq_threadreq_t kqr = &kqwl->kqwl_request;
6034
6035 kqlock_held(kqwl);
6036
6037 assert(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND);
6038
6039 __assert_only struct uthread *uth = get_bsdthread_info(kqr->tr_thread);
6040 assert(workq_thread_is_permanently_bound(uth));
6041
6042 /*
6043 * The bound thread takes up the responsibility of setting the KQ_SLEEP
6044 * flag on its way to parking. See kqworkloop_bound_thread_park_prepost.
6045 * This state is always manipulated under kqlock.
6046 */
6047 if (kqwl->kqwl_state & KQ_SLEEP) {
6048 kqwl->kqwl_state &= ~KQ_SLEEP;
6049 kqueue_threadreq_bind(current_proc(),
6050 kqr, kqr->tr_thread, KQUEUE_THREADREQ_BIND_SOFT);
6051 workq_kern_bound_thread_wakeup(kqr);
6052 }
6053 }
6054
6055 static void
6056 kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
6057 {
6058 if (qos <= kqwl->kqwl_wakeup_qos) {
6059 /*
6060 * Shortcut wakeups that really do nothing useful
6061 */
6062 return;
6063 }
6064
6065 if ((kqwl->kqwl_state & KQ_PROCESSING) &&
6066 kqr_thread(&kqwl->kqwl_request) == current_thread()) {
6067 /*
6068 * kqworkloop_end_processing() will perform the required QoS
6069 * computations when it unsets the processing mode.
6070 */
6071 return;
6072 }
6073
6074 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
6075
6076 /*
6077 * In the case of a thread-bound kqwl, we let kqworkloop_update_threads_qos
6078 * take care of overriding the servicer before waking it up. This
6079 * simplifies the soft bind of the parked bound thread later.
6080 */
6081 if (kqr_thread_permanently_bound(&kqwl->kqwl_request)) {
6082 kqworkloop_bound_thread_wakeup(kqwl);
6083 }
6084 }
6085
6086 static struct kqtailq *
6087 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
6088 {
6089 if (kq.kq->kq_state & KQ_WORKLOOP) {
6090 return &kq.kqwl->kqwl_suppressed;
6091 } else if (kq.kq->kq_state & KQ_WORKQ) {
6092 return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index - 1];
6093 } else {
6094 return &kq.kqf->kqf_suppressed;
6095 }
6096 }
6097
6098 struct turnstile *
6099 kqueue_alloc_turnstile(kqueue_t kqu)
6100 {
6101 struct kqworkloop *kqwl = kqu.kqwl;
6102 kq_state_t kq_state;
6103
6104 kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
6105 if (kq_state & KQ_HAS_TURNSTILE) {
6106 /* force a dependency to pair with the atomic or with release below */
6107 return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
6108 (uintptr_t)kq_state);
6109 }
6110
6111 if (!(kq_state & KQ_WORKLOOP)) {
6112 return TURNSTILE_NULL;
6113 }
6114
6115 struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
6116 bool workq_locked = false;
6117
6118 kqlock(kqu);
6119
6120 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
6121 workq_locked = true;
6122 workq_kern_threadreq_lock(kqwl->kqwl_p);
6123 }
6124
6125 if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
6126 free_ts = ts;
6127 ts = kqwl->kqwl_turnstile;
6128 } else {
6129 ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
6130 ts, TURNSTILE_WORKLOOPS);
6131
6132 /* release-barrier to pair with the unlocked load of kqwl_turnstile above */
6133 os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);
6134
6135 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
6136 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
6137 &kqwl->kqwl_request, kqwl->kqwl_owner,
6138 ts, TURNSTILE_IMMEDIATE_UPDATE);
6139 /*
6140 * The workq may no longer be the interlock after this.
6141 * In which case the inheritor wasn't updated.
6142 */
6143 }
6144 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
6145 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
6146 }
6147 }
6148
6149 if (workq_locked) {
6150 workq_kern_threadreq_unlock(kqwl->kqwl_p);
6151 }
6152
6153 kqunlock(kqu);
6154
6155 if (free_ts) {
6156 turnstile_deallocate(free_ts);
6157 } else {
6158 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
6159 }
6160 return ts;
6161 }
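/*
 * The function above follows a double-checked publication pattern; a minimal
 * sketch of the same idea with hypothetical names (obj, HAS_THING, thing):
 *
 *	state = os_atomic_load(&obj->state, dependency);
 *	if (state & HAS_THING) {
 *		return os_atomic_load_with_dependency_on(&obj->thing, (uintptr_t)state);
 *	}
 *	lock(obj);
 *	if (!(obj->state & HAS_THING)) {
 *		obj->thing = make_thing();
 *		os_atomic_or(&obj->state, HAS_THING, release); // publish after init
 *	}
 *	unlock(obj);
 */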
6162
6163 __attribute__((always_inline))
6164 struct turnstile *
6165 kqueue_turnstile(kqueue_t kqu)
6166 {
6167 kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
6168 if (kq_state & KQ_WORKLOOP) {
6169 return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
6170 }
6171 return TURNSTILE_NULL;
6172 }
6173
6174 __attribute__((always_inline))
6175 struct turnstile *
6176 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
6177 {
6178 struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
6179 if (kqwl) {
6180 return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
6181 }
6182 return TURNSTILE_NULL;
6183 }
6184
6185 static void
6186 kqworkloop_set_overcommit(struct kqworkloop *kqwl)
6187 {
6188 workq_threadreq_t kqr = &kqwl->kqwl_request;
6189
6190 /*
6191 * This test is racy, but since we never remove this bit,
6192 * it allows us to avoid taking a lock.
6193 */
6194 if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
6195 return;
6196 }
6197
6198 kqlock_held(kqwl);
6199
6200 if (kqr_thread_requested_pending(kqr)) {
6201 kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
6202 WORKQ_THREADREQ_MAKE_OVERCOMMIT);
6203 } else {
6204 kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
6205 }
6206 }
6207
6208 static void
6209 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
6210 kq_index_t override_index)
6211 {
6212 workq_threadreq_t kqr;
6213 kq_index_t old_override_index;
6214 kq_index_t queue_index = kn->kn_qos_index;
6215
6216 if (override_index <= queue_index) {
6217 return;
6218 }
6219
6220 kqr = kqworkq_get_request(kqwq, queue_index);
6221
6222 kqlock_held(kqwq);
6223
6224 old_override_index = kqr->tr_kq_override_index;
6225 if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
6226 thread_t servicer = kqr_thread(kqr);
6227 kqr->tr_kq_override_index = override_index;
6228
6229 /* apply the override to [incoming?] servicing thread */
6230 if (servicer) {
6231 if (old_override_index) {
6232 thread_update_kevent_override(servicer, override_index);
6233 } else {
6234 thread_add_kevent_override(servicer, override_index);
6235 }
6236 }
6237 }
6238 }
6239
6240 static void
6241 kqueue_update_iotier_override(kqueue_t kqu)
6242 {
6243 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6244 kqworkloop_update_iotier_override(kqu.kqwl);
6245 }
6246 }
6247
6248 static void
6249 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
6250 {
6251 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6252 kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
6253 qos);
6254 } else {
6255 kqworkq_update_override(kqu.kqwq, kn, qos);
6256 }
6257 }
6258
6259 static void
6260 kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
6261 enum kqwl_unbind_locked_mode how, unsigned int flags)
6262 {
6263 struct uthread *ut = get_bsdthread_info(thread);
6264 workq_threadreq_t kqr = &kqwl->kqwl_request;
6265
6266 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
6267 thread_tid(thread), 0, 0);
6268
6269 kqlock_held(kqwl);
6270
6271 assert(ut->uu_kqr_bound == kqr);
6272
6273 if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
6274 ut->uu_kqr_bound = NULL;
6275 }
6276
6277 if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
6278 ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
6279 thread_drop_servicer_override(thread);
6280 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
6281 }
6282
6283 if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
6284 turnstile_update_inheritor(kqwl->kqwl_turnstile,
6285 TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
6286 turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
6287 TURNSTILE_INTERLOCK_HELD);
6288 }
6289
6290 #if CONFIG_PREADOPT_TG
6291 /* The kqueue is able to adopt a thread group again */
6292
6293 thread_group_qos_t old_tg, new_tg = NULL;
6294 int ret = os_atomic_rmw_loop(kqr_preadopt_thread_group_addr(kqr), old_tg, new_tg, relaxed, {
6295 new_tg = old_tg;
6296 if (old_tg == KQWL_PREADOPTED_TG_SENTINEL || old_tg == KQWL_PREADOPTED_TG_PROCESSED) {
6297 new_tg = KQWL_PREADOPTED_TG_NULL;
6298 }
6299 });
6300
6301 if (ret) {
6302 if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) &&
6303 KQWL_HAS_PERMANENT_PREADOPTED_TG(old_tg)) {
6304 // The permanently configured bound thread remains a part of the
6305 // thread group until its termination.
6306 } else {
6307 // Servicer can drop any preadopt thread group it has since it has
6308 // unbound.
6309 KQWL_PREADOPT_TG_HISTORY_WRITE_ENTRY(kqwl, KQWL_PREADOPT_OP_SERVICER_UNBIND, old_tg, KQWL_PREADOPTED_TG_NULL);
6310 thread_set_preadopt_thread_group(thread, NULL);
6311 }
6312 }
6313 #endif
6314 thread_update_servicer_iotier_override(thread, THROTTLE_LEVEL_END);
6315
6316 if ((flags & KQUEUE_THREADREQ_UNBIND_SOFT) == 0) {
6317 kqr->tr_thread = THREAD_NULL;
6318 kqr->tr_state = WORKQ_TR_STATE_IDLE;
6319 }
6320 kqwl->kqwl_state &= ~KQ_R2K_ARMED;
6321 }
6322
6323 static void
6324 kqworkloop_unbind_delayed_override_drop(thread_t thread)
6325 {
6326 struct uthread *ut = get_bsdthread_info(thread);
6327 if (!workq_thread_is_permanently_bound(ut)) {
6328 assert(ut->uu_kqr_bound == NULL);
6329 }
6330 if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
6331 thread_drop_servicer_override(thread);
6332 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
6333 }
6334 }
6335
6336 /*
6337 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
6338 *
6339 * It will acknowledge events, and possibly request a new thread if:
6340 * - there were active events left
6341 * - we pended waitq hook callouts during processing
6342 * - we pended wakeups while processing (or unsuppressing)
6343 *
6344 * Called with kqueue lock held.
6345 */
6346 static void
6347 kqworkloop_unbind(struct kqworkloop *kqwl)
6348 {
6349 struct kqueue *kq = &kqwl->kqwl_kqueue;
6350 workq_threadreq_t kqr = &kqwl->kqwl_request;
6351 thread_t thread = kqr_thread_fast(kqr);
6352 int op = KQWL_UTQ_PARKING;
6353 kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
6354
6355 /*
6356 * For kqwl permanently bound to a thread, this path is only
6357 * exercised when the thread is on its way to terminate.
6358 * We don't care about asking for a new thread in that case.
6359 */
6360 bool kqwl_had_bound_thread = kqr_thread_permanently_bound(kqr);
6361
6362 assert(thread == current_thread());
6363
6364 kqlock(kqwl);
6365
6366 if (!kqwl_had_bound_thread) {
6367 /*
6368 * Forcing the KQ_PROCESSING flag ensures that QoS updates caused by
6369 * unsuppressing knotes are not applied until the eventual call to
6370 * kqworkloop_update_threads_qos() below.
6371 */
6372 assert((kq->kq_state & KQ_PROCESSING) == 0);
6373 if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
6374 kq->kq_state |= KQ_PROCESSING;
6375 qos_override = kqworkloop_acknowledge_events(kqwl);
6376 kq->kq_state &= ~KQ_PROCESSING;
6377 }
6378 }
6379
6380 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED, 0);
6381
6382 if (!kqwl_had_bound_thread) {
6383 kqworkloop_update_threads_qos(kqwl, op, qos_override);
6384 }
6385
6386 kqunlock(kqwl);
6387
6388 /*
6389 * Drop the override on the current thread last, after the call to
6390 * kqworkloop_update_threads_qos above.
6391 */
6392 kqworkloop_unbind_delayed_override_drop(thread);
6393
6394 /* If last reference, dealloc the workloop kq */
6395 kqworkloop_release(kqwl);
6396 }
6397
6398 static thread_qos_t
6399 kqworkq_unbind_locked(struct kqworkq *kqwq,
6400 workq_threadreq_t kqr, thread_t thread)
6401 {
6402 struct uthread *ut = get_bsdthread_info(thread);
6403 kq_index_t old_override = kqr->tr_kq_override_index;
6404
6405 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
6406 thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);
6407
6408 kqlock_held(kqwq);
6409
6410 assert(ut->uu_kqr_bound == kqr);
6411 ut->uu_kqr_bound = NULL;
6412 kqr->tr_thread = THREAD_NULL;
6413 kqr->tr_state = WORKQ_TR_STATE_IDLE;
6414 kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
6415 kqwq->kqwq_state &= ~KQ_R2K_ARMED;
6416
6417 return old_override;
6418 }
6419
6420 /*
6421 * kqworkq_unbind - unbind of a workq kqueue from a thread
6422 *
6423 * We may have to request new threads.
6424 * This can happen when there are no waiting processing threads and:
6425 * - there were active events we never got to (count > 0)
6426 * - we pended waitq hook callouts during processing
6427 * - we pended wakeups while processing (or unsuppressing)
6428 */
6429 static void
6430 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
6431 {
6432 struct kqworkq *kqwq = (struct kqworkq *)p->p_fd.fd_wqkqueue;
6433 __assert_only int rc;
6434
6435 kqlock(kqwq);
6436 rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
6437 assert(rc == -1);
6438 kqunlock(kqwq);
6439 }
6440
6441 workq_threadreq_t
6442 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
6443 {
6444 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
6445 return &kqwq->kqwq_request[qos_index - 1];
6446 }
6447
6448 static void
6449 knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
6450 {
6451 kq_index_t qos = _pthread_priority_thread_qos(pp);
6452
6453 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6454 assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
6455 pp = _pthread_priority_normalize(pp);
6456 } else if (kqu.kq->kq_state & KQ_WORKQ) {
6457 if (qos == THREAD_QOS_UNSPECIFIED) {
6458 /* On workqueues, outside of QoS means MANAGER */
6459 qos = KQWQ_QOS_MANAGER;
6460 pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
6461 } else {
6462 pp = _pthread_priority_normalize(pp);
6463 }
6464 } else {
6465 pp = _pthread_unspecified_priority();
6466 qos = THREAD_QOS_UNSPECIFIED;
6467 }
6468
6469 kn->kn_qos = (int32_t)pp;
6470
6471 if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
6472 /* Never lower QoS when in "Merge" mode */
6473 kn->kn_qos_override = qos;
6474 }
6475
6476 /* only adjust in-use qos index when not suppressed */
6477 if (kn->kn_status & KN_SUPPRESSED) {
6478 kqueue_update_override(kqu, kn, qos);
6479 } else if (kn->kn_qos_index != qos) {
6480 knote_dequeue(kqu, kn);
6481 kn->kn_qos_index = qos;
6482 }
6483 }
6484
6485 static void
6486 knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
6487 {
6488 thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
6489
6490 kqlock_held(kq);
6491
6492 assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
6493 assert(qos_index < THREAD_QOS_LAST);
6494
6495 /*
6496 * Early exit for knotes that should not change QoS
6497 */
6498 if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
6499 panic("filter %d cannot change QoS", kn->kn_filtid);
6500 } else if (__improbable(!knote_has_qos(kn))) {
6501 return;
6502 }
6503
6504 /*
6505 * knotes with the FALLBACK flag will only use their registration QoS if the
6506 * incoming event has no QoS; otherwise, the registration QoS acts as a floor.
6507 */
6508 thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
6509 if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
6510 if (qos_index == THREAD_QOS_UNSPECIFIED) {
6511 qos_index = req_qos;
6512 }
6513 } else {
6514 if (qos_index < req_qos) {
6515 qos_index = req_qos;
6516 }
6517 }
6518 if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
6519 /* Never lower QoS when in "Merge" mode */
6520 return;
6521 }
6522
6523 if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
6524 /*
6525 * When we're trying to update the QoS override while both an
6526 * f_event() and other f_* calls are running concurrently, any of these
6527 * in-flight calls may want to perform overrides that aren't properly
6528 * serialized with each other.
6529 *
6530 * The first update that observes this racy situation enters a "Merge"
6531 * mode which causes subsequent override requests to saturate the
6532 * override instead of replacing its value.
6533 *
6534 * This mode is left when knote_unlock() or knote_post()
6535 * observe that no other f_* routine is in flight.
6536 */
6537 kn->kn_status |= KN_MERGE_QOS;
6538 }
6539
6540 /*
6541 * Now apply the override if it changed.
6542 */
6543
6544 if (kn->kn_qos_override == qos_index) {
6545 return;
6546 }
6547
6548 kn->kn_qos_override = qos_index;
6549
6550 if (kn->kn_status & KN_SUPPRESSED) {
6551 /*
6552 * For suppressed events, the kn_qos_index field cannot be touched as it
6553 * allows us to know on which suppress queue the knote is for a kqworkq.
6554 *
6555 * Also, there's no natural push applied on the kqueues when this field
6556 * changes anyway. We hence need to apply manual overrides in this case,
6557 * which will be cleared when the events are later acknowledged.
6558 */
6559 kqueue_update_override(kq, kn, qos_index);
6560 } else if (kn->kn_qos_index != qos_index) {
6561 knote_dequeue(kq, kn);
6562 kn->kn_qos_index = qos_index;
6563 }
6564 }
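/*
 * Worked example of the registration QoS handling above (values
 * illustrative): a knote registered at THREAD_QOS_UTILITY with the FALLBACK
 * flag delivers a QoS-less event at UTILITY but delivers a BACKGROUND event
 * at BACKGROUND; without the flag, UTILITY acts as a floor, so the
 * BACKGROUND event is delivered at UTILITY while a USER_INITIATED event is
 * still delivered at USER_INITIATED.
 */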
6565
6566 void
6567 klist_init(struct klist *list)
6568 {
6569 SLIST_INIT(list);
6570 }
6571
6572
6573 /*
6574 * Query/Post each knote in the object's list
6575 *
6576 * The object lock protects the list. It is assumed that the filter/event
6577 * routine for the object can determine that the object is already locked (via
6578 * the hint) and not deadlock itself.
6579 *
6580 * Autodetach is a specific contract which will detach all knotes from the
6581 * object prior to posting the final event for that knote. This is done while
6582 * under the object lock. A breadcrumb is left in the knote's next pointer to
6583 * indicate to future calls to f_detach routines that they need not reattempt
6584 * to knote_detach from the object's klist again. This is currently used by
6585 * EVFILTID_SPEC, EVFILTID_TTY, EVFILTID_PTMX
6586 *
6587 */
6588 void
6589 knote(struct klist *list, long hint, bool autodetach)
6590 {
6591 struct knote *kn;
6592 struct knote *tmp_kn;
6593 SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmp_kn) {
6594 /*
6595 * We can modify the knote's next pointer since we are holding the
6596 * object lock and the list can't be concurrently modified. Anyone
6597 * determining auto-detached-ness of a knote should take the primitive lock
6598 * to synchronize.
6599 *
6600 * Note that we do this here instead of the filter's f_event since we may
6601 * not even post the event if the knote is being dropped.
6602 */
6603 if (autodetach) {
6604 kn->kn_selnext.sle_next = KNOTE_AUTODETACHED;
6605 }
6606 knote_post(kn, hint);
6607 }
6608
6609 /* Blast away the entire klist */
6610 if (autodetach) {
6611 klist_init(list);
6612 }
6613 }
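/*
 * Usage sketch from an event source's side (object, lock, and flag names are
 * hypothetical): the source posts under its own lock, optionally passing a
 * hint its filter's f_event understands, and uses autodetach only when it is
 * about to go away for good:
 *
 *	lck_mtx_lock(&obj->obj_lock);
 *	obj->obj_flags |= OBJ_READABLE;
 *	knote(&obj->obj_klist, OBJ_READABLE, false);
 *	lck_mtx_unlock(&obj->obj_lock);
 */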
6614
6615 /*
6616 * attach a knote to the specified list. Return true if this is the first entry.
6617 * The list is protected by whatever lock the object it is associated with uses.
6618 */
6619 int
6620 knote_attach(struct klist *list, struct knote *kn)
6621 {
6622 int ret = SLIST_EMPTY(list);
6623 SLIST_INSERT_HEAD(list, kn, kn_selnext);
6624 return ret;
6625 }
6626
6627 /*
6628 * detach a knote from the specified list. Return true if that was the last
6629 * entry. The list is protected by whatever lock the object it is associated
6630 * with uses.
6631 */
6632 int
6633 knote_detach(struct klist *list, struct knote *kn)
6634 {
6635 assert(!KNOTE_IS_AUTODETACHED(kn));
6636
6637 SLIST_REMOVE(list, kn, knote, kn_selnext);
6638 return SLIST_EMPTY(list);
6639 }
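/*
 * Sketch of how a filter typically pairs these two calls (filter and object
 * names hypothetical, f_attach/f_detach signatures simplified):
 *
 *	static int
 *	filt_objattach(struct knote *kn)
 *	{
 *		struct obj *o = obj_for_knote(kn);
 *
 *		kn->kn_hook = o;
 *		if (knote_attach(&o->obj_klist, kn)) {
 *			// first knote on the list: arm the source if needed
 *		}
 *		return 0;
 *	}
 *
 *	static void
 *	filt_objdetach(struct knote *kn)
 *	{
 *		struct obj *o = kn->kn_hook;
 *
 *		if (knote_detach(&o->obj_klist, kn)) {
 *			// last knote on the list: disarm the source
 *		}
 *	}
 */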
6640
6641 /*
6642 * knote_vanish - Indicate that the source has vanished
6643 *
6644 * Used only for vanishing ports - vanishing fds go
6645 * through knote_fdclose()
6646 *
6647 * If the knote has requested EV_VANISHED delivery,
6648 * arrange for that. Otherwise, deliver a NOTE_REVOKE
6649 * event for backward compatibility.
6650 *
6651 * The knote is marked as having vanished. The source's
6652 * reference to the knote is dropped by caller, but the knote's
6653 * source reference is only cleaned up later when the knote is dropped.
6654 *
6655 * Our caller already has the object lock held. Calling
6656 * the detach routine would try to take that lock
6657 * recursively - which likely is not supported.
6658 */
6659 void
6660 knote_vanish(struct klist *list, bool make_active)
6661 {
6662 struct knote *kn;
6663 struct knote *kn_next;
6664
6665 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6666 struct kqueue *kq = knote_get_kq(kn);
6667
6668 kqlock(kq);
6669 if (__probable(kn->kn_status & KN_REQVANISH)) {
6670 /*
6671 * If EV_VANISH supported - prepare to deliver one
6672 */
6673 kn->kn_status |= KN_VANISHED;
6674 } else {
6675 /*
6676 * Handle the legacy way to indicate that the port/portset was
6677 * deallocated or left the current Mach portspace (modern technique
6678 * is with an EV_VANISHED protocol).
6679 *
6680 * Deliver an EV_EOF event for these changes (hopefully it will get
6681 * delivered before the port name recycles to the same generation
6682 * count and someone tries to re-register a kevent for it or the
6683 * events are udata-specific - avoiding a conflict).
6684 */
6685 kn->kn_flags |= EV_EOF | EV_ONESHOT;
6686 }
6687 if (make_active) {
6688 knote_activate(kq, kn, FILTER_ACTIVE);
6689 }
6690 kqunlock(kq);
6691 }
6692 }
6693
6694 /*
6695 * remove all knotes referencing a specified fd
6696 *
6697 * Entered with the proc_fd lock already held.
6698 * It returns the same way, but may drop it temporarily.
6699 */
6700 void
6701 knote_fdclose(struct proc *p, int fd)
6702 {
6703 struct filedesc *fdt = &p->p_fd;
6704 struct klist *list;
6705 struct knote *kn;
6706 KNOTE_LOCK_CTX(knlc);
6707
6708 restart:
6709 list = &fdt->fd_knlist[fd];
6710 SLIST_FOREACH(kn, list, kn_link) {
6711 struct kqueue *kq = knote_get_kq(kn);
6712
6713 kqlock(kq);
6714
6715 if (kq->kq_p != p) {
6716 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6717 __func__, kq->kq_p, p);
6718 }
6719
6720 /*
6721 * If the knote supports EV_VANISHED delivery,
6722 * transition it to vanished mode (or skip over
6723 * it if already vanished).
6724 */
6725 if (kn->kn_status & KN_VANISHED) {
6726 kqunlock(kq);
6727 continue;
6728 }
6729
6730 proc_fdunlock(p);
6731 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
6732 /* the knote was dropped by someone, nothing to do */
6733 } else if (kn->kn_status & KN_REQVANISH) {
6734 /*
6735 * Since we have REQVANISH for this knote, we need to notify clients about
6736 * the EV_VANISHED.
6737 *
6738 * But unlike mach ports, we want to do the detach here as well and not
6739 * defer it so that we can release the iocount that is on the knote and
6740 * close the fp.
6741 */
6742 kn->kn_status |= KN_VANISHED;
6743
6744 /*
6745 * There may be a concurrent post happening; make sure to wait for it
6746 * before we detach. knote_wait_for_post() unlocks the kq on exit.
6747 */
6748 knote_wait_for_post(kq, kn);
6749
6750 knote_fops(kn)->f_detach(kn);
6751 if (kn->kn_is_fd) {
6752 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6753 }
6754 kn->kn_filtid = EVFILTID_DETACHED;
6755 kqlock(kq);
6756
6757 knote_activate(kq, kn, FILTER_ACTIVE);
6758 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
6759 } else {
6760 knote_drop(kq, kn, &knlc);
6761 }
6762
6763 proc_fdlock(p);
6764 goto restart;
6765 }
6766 }
6767
6768 /*
6769 * knote_fdfind - lookup a knote in the fd table for process
6770 *
6771 * If the filter is file-based, lookup based on fd index.
6772 * Otherwise use a hash based on the ident.
6773 *
6774 * Matching is based on kq, filter, and ident. Optionally,
6775 * it may also be based on the udata field in the kevent -
6776 * allowing multiple event registration for the file object
6777 * per kqueue.
6778 *
6779 * fd_knhashlock or fdlock held on entry (and exit)
6780 */
6781 static struct knote *
6782 knote_fdfind(struct kqueue *kq,
6783 const struct kevent_internal_s *kev,
6784 bool is_fd,
6785 struct proc *p)
6786 {
6787 struct filedesc *fdp = &p->p_fd;
6788 struct klist *list = NULL;
6789 struct knote *kn = NULL;
6790
6791 /*
6792 * determine where to look for the knote
6793 */
6794 if (is_fd) {
6795 /* fd-based knotes are linked off the fd table */
6796 if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6797 list = &fdp->fd_knlist[kev->kei_ident];
6798 }
6799 } else if (fdp->fd_knhashmask != 0) {
6800 /* hash non-fd knotes here too */
6801 list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6802 }
6803
6804 /*
6805 * scan the selected list looking for a match
6806 */
6807 if (list != NULL) {
6808 SLIST_FOREACH(kn, list, kn_link) {
6809 if (kq == knote_get_kq(kn) &&
6810 kev->kei_ident == kn->kn_id &&
6811 kev->kei_filter == kn->kn_filter) {
6812 if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6813 if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6814 kev->kei_udata == kn->kn_udata) {
6815 break; /* matching udata-specific knote */
6816 }
6817 } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6818 break; /* matching non-udata-specific knote */
6819 }
6820 }
6821 }
6822 }
6823 return kn;
6824 }
6825
6826 /*
6827 * kq_add_knote - Add knote to the fd table for process
6828 * while checking for duplicates.
6829 *
6830 * All file-based filters associate a list of knotes by file
6831 * descriptor index. All other filters hash the knote by ident.
6832 *
6833 * May have to grow the table of knote lists to cover the
6834 * file descriptor index presented.
6835 *
6836 * fd_knhashlock and fdlock unheld on entry (and exit).
6837 *
6838 * Takes a rwlock boost if inserting the knote is successful.
6839 */
6840 static int
6841 kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
6842 struct proc *p)
6843 {
6844 struct filedesc *fdp = &p->p_fd;
6845 struct klist *list = NULL;
6846 int ret = 0;
6847 bool is_fd = kn->kn_is_fd;
6848
6849 if (is_fd) {
6850 proc_fdlock(p);
6851 } else {
6852 knhash_lock(fdp);
6853 }
6854
6855 if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
6856 /* found an existing knote: we can't add this one */
6857 ret = ERESTART;
6858 goto out_locked;
6859 }
6860
6861 /* knote was not found: add it now */
6862 if (!is_fd) {
6863 if (fdp->fd_knhashmask == 0) {
6864 u_long size = 0;
6865
6866 list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
6867 if (list == NULL) {
6868 ret = ENOMEM;
6869 goto out_locked;
6870 }
6871
6872 fdp->fd_knhash = list;
6873 fdp->fd_knhashmask = size;
6874 }
6875
6876 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6877 SLIST_INSERT_HEAD(list, kn, kn_link);
6878 ret = 0;
6879 goto out_locked;
6880 } else {
6881 /* knote is fd based */
6882
6883 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6884 u_int size = 0;
6885
6886 /* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
6887 if (kn->kn_id >= (uint64_t)proc_limitgetcur_nofile(p)) {
6888 ret = EINVAL;
6889 goto out_locked;
6890 }
6891 /* have to grow the fd_knlist */
6892 size = fdp->fd_knlistsize;
6893 while (size <= kn->kn_id) {
6894 size += KQEXTENT;
6895 }
6896
6897 if (size >= (UINT_MAX / sizeof(struct klist))) {
6898 ret = EINVAL;
6899 goto out_locked;
6900 }
6901
6902 list = kalloc_type(struct klist, size, Z_WAITOK | Z_ZERO);
6903 if (list == NULL) {
6904 ret = ENOMEM;
6905 goto out_locked;
6906 }
6907
6908 bcopy(fdp->fd_knlist, list,
6909 fdp->fd_knlistsize * sizeof(struct klist));
6910 kfree_type(struct klist, fdp->fd_knlistsize, fdp->fd_knlist);
6911 fdp->fd_knlist = list;
6912 fdp->fd_knlistsize = size;
6913 }
6914
6915 list = &fdp->fd_knlist[kn->kn_id];
6916 SLIST_INSERT_HEAD(list, kn, kn_link);
6917 ret = 0;
6918 goto out_locked;
6919 }
6920
6921 out_locked:
6922 if (ret == 0) {
6923 kqlock(kq);
6924 assert((kn->kn_status & KN_LOCKED) == 0);
6925 (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
6926 kqueue_retain(kq); /* retain a kq ref */
6927 }
6928 if (is_fd) {
6929 proc_fdunlock(p);
6930 } else {
6931 knhash_unlock(fdp);
6932 }
6933
6934 return ret;
6935 }
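/*
 * Example of the fd_knlist growth above, assuming KQEXTENT is 256: with
 * fd_knlistsize == 256, registering a knote for fd 700 grows the table
 * 256 -> 512 -> 768 entries, copies the old 256 slots, and then links the
 * new knote at fd_knlist[700].
 */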
6936
6937 /*
6938 * kq_remove_knote - remove a knote from the fd table for process
6939 *
6940 * If the filter is file-based, remove based on fd index.
6941 * Otherwise remove from the hash based on the ident.
6942 *
6943 * fd_knhashlock and fdlock unheld on entry (and exit).
6944 */
6945 static void
6946 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
6947 struct knote_lock_ctx *knlc)
6948 {
6949 struct filedesc *fdp = &p->p_fd;
6950 struct klist *list = NULL;
6951 uint16_t kq_state;
6952 bool is_fd = kn->kn_is_fd;
6953
6954 if (is_fd) {
6955 proc_fdlock(p);
6956 } else {
6957 knhash_lock(fdp);
6958 }
6959
6960 if (is_fd) {
6961 assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6962 list = &fdp->fd_knlist[kn->kn_id];
6963 } else {
6964 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6965 }
6966 SLIST_REMOVE(list, kn, knote, kn_link);
6967
6968 kqlock(kq);
6969
6970 /* Update the servicer iotier override */
6971 kqueue_update_iotier_override(kq);
6972
6973 kq_state = kq->kq_state;
6974 if (knlc) {
6975 knote_unlock_cancel(kq, kn, knlc);
6976 } else {
6977 kqunlock(kq);
6978 }
6979 if (is_fd) {
6980 proc_fdunlock(p);
6981 } else {
6982 knhash_unlock(fdp);
6983 }
6984
6985 if (kq_state & KQ_DYNAMIC) {
6986 kqworkloop_release((struct kqworkloop *)kq);
6987 }
6988 }
6989
6990 /*
6991 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6992 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6993 *
6994 * fd_knhashlock or fdlock unheld on entry (and exit)
6995 */
6996
6997 static struct knote *
6998 kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
6999 bool is_fd, struct proc *p)
7000 {
7001 struct filedesc *fdp = &p->p_fd;
7002 struct knote *kn;
7003
7004 if (is_fd) {
7005 proc_fdlock(p);
7006 } else {
7007 knhash_lock(fdp);
7008 }
7009
7010 /*
7011 * Temporary horrible hack:
7012 * this cast is gross and will go away in a future change.
7013 * It is OK to do because we don't look at xflags/s_fflags,
7014 * and because when we cast down the kev this way,
7015 * the truncated filter field still works.
7016 */
7017 kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
7018
7019 if (kn) {
7020 kqlock(kq);
7021 assert(knote_get_kq(kn) == kq);
7022 }
7023
7024 if (is_fd) {
7025 proc_fdunlock(p);
7026 } else {
7027 knhash_unlock(fdp);
7028 }
7029
7030 return kn;
7031 }
7032
7033 static struct kqtailq *
7034 knote_get_tailq(kqueue_t kqu, struct knote *kn)
7035 {
7036 kq_index_t qos_index = kn->kn_qos_index;
7037
7038 if (kqu.kq->kq_state & KQ_WORKLOOP) {
7039 assert(qos_index > 0 && qos_index <= KQWL_NBUCKETS);
7040 return &kqu.kqwl->kqwl_queue[qos_index - 1];
7041 } else if (kqu.kq->kq_state & KQ_WORKQ) {
7042 assert(qos_index > 0 && qos_index <= KQWQ_NBUCKETS);
7043 return &kqu.kqwq->kqwq_queue[qos_index - 1];
7044 } else {
7045 assert(qos_index == QOS_INDEX_KQFILE);
7046 return &kqu.kqf->kqf_queue;
7047 }
7048 }
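/*
 * Example of the bucket selection above: a workloop knote with
 * kn_qos_index == 3 lands on kqwl_queue[2], a workq knote with the same
 * index lands on kqwq_queue[2], and a regular kqueue file always uses the
 * single kqf_queue.
 */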
7049
7050 static void
7051 knote_enqueue(kqueue_t kqu, struct knote *kn)
7052 {
7053 kqlock_held(kqu);
7054
7055 if ((kn->kn_status & KN_ACTIVE) == 0) {
7056 return;
7057 }
7058
7059 if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING | KN_QUEUED)) {
7060 return;
7061 }
7062
7063 struct kqtailq *queue = knote_get_tailq(kqu, kn);
7064 bool wakeup = TAILQ_EMPTY(queue);
7065
7066 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
7067 kn->kn_status |= KN_QUEUED;
7068 kqu.kq->kq_count++;
7069
7070 if (wakeup) {
7071 if (kqu.kq->kq_state & KQ_WORKLOOP) {
7072 kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
7073 } else if (kqu.kq->kq_state & KQ_WORKQ) {
7074 kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
7075 } else {
7076 kqfile_wakeup(kqu.kqf, 0, THREAD_AWAKENED);
7077 }
7078 }
7079 }
7080
7081 __attribute__((always_inline))
7082 static inline void
7083 knote_dequeue(kqueue_t kqu, struct knote *kn)
7084 {
7085 if (kn->kn_status & KN_QUEUED) {
7086 struct kqtailq *queue = knote_get_tailq(kqu, kn);
7087
7088 // attaching the knote calls knote_reset_priority() without
7089 // the kqlock which is fine, so we can't call kqlock_held()
7090 // if we're not queued.
7091 kqlock_held(kqu);
7092
7093 TAILQ_REMOVE(queue, kn, kn_tqe);
7094 kn->kn_status &= ~KN_QUEUED;
7095 kqu.kq->kq_count--;
7096 if ((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
7097 assert((kqu.kq->kq_count == 0) ==
7098 (bool)TAILQ_EMPTY(queue));
7099 }
7100 }
7101 }
7102
7103 /* called with kqueue lock held */
7104 static void
7105 knote_suppress(kqueue_t kqu, struct knote *kn)
7106 {
7107 struct kqtailq *suppressq;
7108
7109 kqlock_held(kqu);
7110
7111 assert((kn->kn_status & KN_SUPPRESSED) == 0);
7112 assert(kn->kn_status & KN_QUEUED);
7113
7114 knote_dequeue(kqu, kn);
7115 /* deactivate - so new activations indicate a wakeup */
7116 kn->kn_status &= ~KN_ACTIVE;
7117 kn->kn_status |= KN_SUPPRESSED;
7118 suppressq = kqueue_get_suppressed_queue(kqu, kn);
7119 TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
7120 }
7121
7122 __attribute__((always_inline))
7123 static inline void
7124 knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
7125 {
7126 struct kqtailq *suppressq;
7127
7128 kqlock_held(kqu);
7129
7130 assert(kn->kn_status & KN_SUPPRESSED);
7131
7132 kn->kn_status &= ~KN_SUPPRESSED;
7133 suppressq = kqueue_get_suppressed_queue(kqu, kn);
7134 TAILQ_REMOVE(suppressq, kn, kn_tqe);
7135
7136 /*
7137 * If the knote is no longer active, reset its push,
7138 * and resynchronize kn_qos_index with kn_qos_override
7139 * for knotes with a real qos.
7140 */
7141 if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
7142 kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
7143 }
7144 kn->kn_qos_index = kn->kn_qos_override;
7145 }
7146
7147 /* called with kqueue lock held */
7148 static void
7149 knote_unsuppress(kqueue_t kqu, struct knote *kn)
7150 {
7151 knote_unsuppress_noqueue(kqu, kn);
7152 knote_enqueue(kqu, kn);
7153 }
7154
7155 __attribute__((always_inline))
7156 static inline void
7157 knote_mark_active(struct knote *kn)
7158 {
7159 if ((kn->kn_status & KN_ACTIVE) == 0) {
7160 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
7161 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
7162 kn->kn_filtid);
7163 }
7164
7165 kn->kn_status |= KN_ACTIVE;
7166 }
7167
7168 /* called with kqueue lock held */
7169 static void
7170 knote_activate(kqueue_t kqu, struct knote *kn, int result)
7171 {
7172 assert(result & FILTER_ACTIVE);
7173 if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
7174 // may dequeue the knote
7175 knote_adjust_qos(kqu.kq, kn, result);
7176 }
7177 knote_mark_active(kn);
7178 knote_enqueue(kqu, kn);
7179 }
7180
7181 /*
7182 * This function applies changes requested by f_attach or f_touch for
7183 * a given filter. It proceeds in a carefully chosen order to help
7184 * every single transition do the minimal amount of work possible.
7185 */
7186 static void
7187 knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
7188 int result)
7189 {
7190 if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
7191 kn->kn_status &= ~KN_DISABLED;
7192
		/*
		 * It is possible for userland to have knotes registered for a given
		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
		 *
		 * In that case, rearming happens from the servicer thread of `wl_new`;
		 * if `wl_orig` is no longer being serviced, this knote would stay
		 * suppressed forever if we only relied on `wl_orig` calling
		 * kqworkloop_acknowledge_events().
		 *
		 * However, if the KQ_PROCESSING bit is set on `wl_orig`, we can't
		 * unsuppress here because that would interfere with the processing
		 * phase of `wl_orig`; it also means kqworkloop_acknowledge_events()
		 * will be called, so the knote won't stay suppressed.
		 */
7207 if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
7208 if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
7209 knote_unsuppress_noqueue(kqu, kn);
7210 }
7211 }
7212 }
7213
7214 if (result & FILTER_ADJUST_EVENT_IOTIER_BIT) {
7215 kqueue_update_iotier_override(kqu);
7216 }
7217
7218 if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
7219 // may dequeue the knote
7220 knote_reset_priority(kqu, kn, kev->qos);
7221 }
7222
	/*
	 * The unsuppress above, or knote_reset_priority(), may have dequeued
	 * the knote. Now that we're done applying changes, restore the
	 * invariant that an active knote is queued.
	 */
7229 if (result & FILTER_ACTIVE) {
7230 knote_activate(kqu, kn, result);
7231 } else {
7232 knote_enqueue(kqu, kn);
7233 }
7234
7235 if ((result & FILTER_THREADREQ_NODEFEER) &&
7236 act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
7237 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
7238 }
7239 }
7240
7241 /*
7242 * knote_drop - disconnect and drop the knote
7243 *
7244 * Called with the kqueue locked, returns with the kqueue unlocked.
7245 *
7246 * If a knote locking context is passed, it is canceled.
7247 *
7248 * The knote may have already been detached from
7249 * (or not yet attached to) its source object.
7250 */
7251 static void
7252 knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
7253 {
7254 struct proc *p = kq->kq_p;
7255
7256 kqlock_held(kq);
7257
7258 assert((kn->kn_status & KN_DROPPING) == 0);
7259 if (knlc == NULL) {
7260 assert((kn->kn_status & KN_LOCKED) == 0);
7261 }
7262 kn->kn_status |= KN_DROPPING;
7263
7264 if (kn->kn_status & KN_SUPPRESSED) {
7265 knote_unsuppress_noqueue(kq, kn);
7266 } else {
7267 knote_dequeue(kq, kn);
7268 }
7269 knote_wait_for_post(kq, kn);
7270
	/* Even if the knote was automatically detached, the filter may still need
	 * to clean up state stashed on the knote, so always make the call and let
	 * each filter handle the possibility of being already detached. */
7274 knote_fops(kn)->f_detach(kn);
7275
7276 /* kq may be freed when kq_remove_knote() returns */
7277 kq_remove_knote(kq, kn, p, knlc);
7278 if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
7279 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
7280 }
7281
7282 knote_free(kn);
7283 }
7284
7285 void
7286 knote_init(void)
7287 {
7288 #if CONFIG_MEMORYSTATUS
7289 /* Initialize the memorystatus list lock */
7290 memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
7291 #endif
7292 }
7293 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
7294
7295 const struct filterops *
7296 knote_fops(struct knote *kn)
7297 {
7298 return sysfilt_ops[kn->kn_filtid];
7299 }
7300
7301 static struct knote *
7302 knote_alloc(void)
7303 {
7304 return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
7305 }
7306
7307 static void
7308 knote_free(struct knote *kn)
7309 {
7310 assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
7311 zfree(knote_zone, kn);
7312 }
7313
7314 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
7315
7316 kevent_ctx_t
7317 kevent_get_context(thread_t thread)
7318 {
7319 uthread_t ut = get_bsdthread_info(thread);
7320 return &ut->uu_save.uus_kevent;
7321 }
7322
7323 static inline bool
7324 kevent_args_requesting_events(unsigned int flags, int nevents)
7325 {
7326 return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
7327 }
7328
7329 static inline int
7330 kevent_adjust_flags_for_proc(proc_t p, int flags)
7331 {
7332 __builtin_assume(p);
7333 return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
7334 }
7335
7336 /*!
7337 * @function kevent_get_kqfile
7338 *
7339 * @brief
7340 * Lookup a kqfile by fd.
7341 *
7342 * @discussion
7343 * Callers: kevent, kevent64, kevent_qos
7344 *
7345 * This is not assumed to be a fastpath (kqfile interfaces are legacy)
7346 */
7347 OS_NOINLINE
7348 static int
7349 kevent_get_kqfile(struct proc *p, int fd, int flags,
7350 struct fileproc **fpp, struct kqueue **kqp)
7351 {
7352 int error = 0;
7353 struct kqueue *kq;
7354
7355 error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
7356 if (__improbable(error)) {
7357 return error;
7358 }
7359 kq = (struct kqueue *)fp_get_data((*fpp));
7360
7361 uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
7362 if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
7363 kqlock(kq);
7364 kq_state = kq->kq_state;
7365 if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
7366 if (flags & KEVENT_FLAG_LEGACY32) {
7367 kq_state |= KQ_KEV32;
7368 } else if (flags & KEVENT_FLAG_LEGACY64) {
7369 kq_state |= KQ_KEV64;
7370 } else {
7371 kq_state |= KQ_KEV_QOS;
7372 }
7373 kq->kq_state = kq_state;
7374 }
7375 kqunlock(kq);
7376 }
7377
7378 /*
7379 * kqfiles can't be used through the legacy kevent()
7380 * and other interfaces at the same time.
7381 */
7382 if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
7383 (bool)(kq_state & KQ_KEV32))) {
7384 fp_drop(p, fd, *fpp, 0);
7385 return EINVAL;
7386 }
7387
7388 *kqp = kq;
7389 return 0;
7390 }
7391
7392 /*!
7393 * @function kevent_get_kqwq
7394 *
7395 * @brief
 * Lookup or create the process kqwq (fastpath).
7397 *
7398 * @discussion
7399 * Callers: kevent64, kevent_qos
7400 */
7401 OS_ALWAYS_INLINE
7402 static int
7403 kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
7404 {
7405 struct kqworkq *kqwq = p->p_fd.fd_wqkqueue;
7406
7407 if (__improbable(kevent_args_requesting_events(flags, nevents))) {
7408 return EINVAL;
7409 }
7410 if (__improbable(kqwq == NULL)) {
7411 kqwq = kqworkq_alloc(p, flags);
7412 if (__improbable(kqwq == NULL)) {
7413 return ENOMEM;
7414 }
7415 }
7416
7417 *kqp = &kqwq->kqwq_kqueue;
7418 return 0;
7419 }
7420
7421 #pragma mark kevent copyio
7422
7423 /*!
7424 * @function kevent_get_data_size
7425 *
7426 * @brief
7427 * Copies in the extra data size from user-space.
7428 */
7429 static int
7430 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
7431 kevent_ctx_t kectx)
7432 {
7433 if (!data_avail || !data_out) {
7434 kectx->kec_data_size = 0;
7435 kectx->kec_data_resid = 0;
7436 } else if (flags & KEVENT_FLAG_PROC64) {
7437 user64_size_t usize = 0;
7438 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7439 if (__improbable(error)) {
7440 return error;
7441 }
7442 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7443 } else {
7444 user32_size_t usize = 0;
7445 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
7446 if (__improbable(error)) {
7447 return error;
7448 }
7449 kectx->kec_data_avail = data_avail;
7450 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
7451 }
7452 kectx->kec_data_out = data_out;
7453 kectx->kec_data_avail = data_avail;
7454 return 0;
7455 }
7456
7457 /*!
7458 * @function kevent_put_data_size
7459 *
7460 * @brief
7461 * Copies out the residual data size to user-space if any has been used.
7462 */
7463 static int
7464 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
7465 {
7466 if (kectx->kec_data_resid == kectx->kec_data_size) {
7467 return 0;
7468 }
7469 if (flags & KEVENT_FLAG_KERNEL) {
7470 *(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
7471 return 0;
7472 }
7473 if (flags & KEVENT_FLAG_PROC64) {
7474 user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
7475 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7476 } else {
7477 user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
7478 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
7479 }
7480 }
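
/*
 * Illustrative sketch (not part of the original sources): how the two
 * helpers above round-trip the extra data size for a caller that passes
 * KEVENT_FLAG_STACK_DATA to kevent_qos(). The userspace prototype and
 * variable names below are assumptions for illustration only.
 *
 *	char buf[512];                  // extra data buffer (data_out)
 *	size_t avail = sizeof(buf);     // copied in by kevent_get_data_size()
 *
 *	int n = kevent_qos(kq, changes, nchanges, events, nevents,
 *	    buf, &avail, KEVENT_FLAG_STACK_DATA);
 *
 *	// On return, kevent_put_data_size() has copied the residual size back
 *	// into `avail`, so (sizeof(buf) - avail) bytes of buf were consumed.
 */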
7481
7482 /*!
7483 * @function kevent_legacy_copyin
7484 *
7485 * @brief
7486 * Handles the copyin of a kevent/kevent64 event.
7487 */
7488 static int
7489 kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
7490 {
7491 int error;
7492
7493 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7494
7495 if (flags & KEVENT_FLAG_LEGACY64) {
7496 struct kevent64_s kev64;
7497
7498 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7499 if (__improbable(error)) {
7500 return error;
7501 }
7502 *addrp += sizeof(kev64);
7503 *kevp = (struct kevent_qos_s){
7504 .ident = kev64.ident,
7505 .filter = kev64.filter,
7506 /* Make sure user doesn't pass in any system flags */
7507 .flags = kev64.flags & ~EV_SYSFLAGS,
7508 .udata = kev64.udata,
7509 .fflags = kev64.fflags,
7510 .data = kev64.data,
7511 .ext[0] = kev64.ext[0],
7512 .ext[1] = kev64.ext[1],
7513 };
7514 } else if (flags & KEVENT_FLAG_PROC64) {
7515 struct user64_kevent kev64;
7516
7517 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7518 if (__improbable(error)) {
7519 return error;
7520 }
7521 *addrp += sizeof(kev64);
7522 *kevp = (struct kevent_qos_s){
7523 .ident = kev64.ident,
7524 .filter = kev64.filter,
7525 /* Make sure user doesn't pass in any system flags */
7526 .flags = kev64.flags & ~EV_SYSFLAGS,
7527 .udata = kev64.udata,
7528 .fflags = kev64.fflags,
7529 .data = kev64.data,
7530 };
7531 } else {
7532 struct user32_kevent kev32;
7533
7534 error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
7535 if (__improbable(error)) {
7536 return error;
7537 }
7538 *addrp += sizeof(kev32);
7539 *kevp = (struct kevent_qos_s){
7540 .ident = (uintptr_t)kev32.ident,
7541 .filter = kev32.filter,
7542 /* Make sure user doesn't pass in any system flags */
7543 .flags = kev32.flags & ~EV_SYSFLAGS,
7544 .udata = CAST_USER_ADDR_T(kev32.udata),
7545 .fflags = kev32.fflags,
7546 .data = (intptr_t)kev32.data,
7547 };
7548 }
7549
7550 return 0;
7551 }
7552
7553 /*!
7554 * @function kevent_modern_copyin
7555 *
7556 * @brief
7557 * Handles the copyin of a kevent_qos/kevent_id event.
7558 */
7559 static int
7560 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
7561 {
7562 int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
7563 if (__probable(!error)) {
7564 /* Make sure user doesn't pass in any system flags */
7565 *addrp += sizeof(struct kevent_qos_s);
7566 kevp->flags &= ~EV_SYSFLAGS;
7567 }
7568 return error;
7569 }
7570
7571 /*!
7572 * @function kevent_legacy_copyout
7573 *
7574 * @brief
7575 * Handles the copyout of a kevent/kevent64 event.
7576 */
7577 static int
7578 kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
7579 {
7580 int advance;
7581 int error;
7582
7583 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7584
	/*
	 * Fully initialize the different output event structure
	 * types from the internal kevent (and some universal
	 * defaults for fields not represented in the internal
	 * form).
	 *
	 * Note: these structures have no padding, hence the C99
	 * initializers below do not leak kernel info.
	 */
7594 if (flags & KEVENT_FLAG_LEGACY64) {
7595 struct kevent64_s kev64 = {
7596 .ident = kevp->ident,
7597 .filter = kevp->filter,
7598 .flags = kevp->flags,
7599 .fflags = kevp->fflags,
7600 .data = (int64_t)kevp->data,
7601 .udata = kevp->udata,
7602 .ext[0] = kevp->ext[0],
7603 .ext[1] = kevp->ext[1],
7604 };
7605 advance = sizeof(struct kevent64_s);
7606 error = copyout((caddr_t)&kev64, *addrp, advance);
7607 } else if (flags & KEVENT_FLAG_PROC64) {
7608 /*
7609 * deal with the special case of a user-supplied
7610 * value of (uintptr_t)-1.
7611 */
7612 uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
7613 (uint64_t)-1LL : (uint64_t)kevp->ident;
7614 struct user64_kevent kev64 = {
7615 .ident = ident,
7616 .filter = kevp->filter,
7617 .flags = kevp->flags,
7618 .fflags = kevp->fflags,
7619 .data = (int64_t) kevp->data,
7620 .udata = (user_addr_t) kevp->udata,
7621 };
7622 advance = sizeof(kev64);
7623 error = copyout((caddr_t)&kev64, *addrp, advance);
7624 } else {
7625 struct user32_kevent kev32 = {
7626 .ident = (uint32_t)kevp->ident,
7627 .filter = kevp->filter,
7628 .flags = kevp->flags,
7629 .fflags = kevp->fflags,
7630 .data = (int32_t)kevp->data,
7631 .udata = (uint32_t)kevp->udata,
7632 };
7633 advance = sizeof(kev32);
7634 error = copyout((caddr_t)&kev32, *addrp, advance);
7635 }
7636 if (__probable(!error)) {
7637 *addrp += advance;
7638 }
7639 return error;
7640 }
7641
7642 /*!
7643 * @function kevent_modern_copyout
7644 *
7645 * @brief
7646 * Handles the copyout of a kevent_qos/kevent_id event.
7647 */
7648 OS_ALWAYS_INLINE
7649 static inline int
7650 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7651 {
7652 int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7653 if (__probable(!error)) {
7654 *addrp += sizeof(struct kevent_qos_s);
7655 }
7656 return error;
7657 }
7658
7659 #pragma mark kevent core implementation
7660
7661 /*!
7662 * @function kevent_callback_inline
7663 *
7664 * @brief
7665 * Callback for each individual event
7666 *
7667 * @discussion
7668 * This is meant to be inlined in kevent_modern_callback and
7669 * kevent_legacy_callback.
7670 */
7671 OS_ALWAYS_INLINE
7672 static inline int
7673 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7674 {
7675 int error;
7676
7677 assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7678
7679 /*
7680 * Copy out the appropriate amount of event data for this user.
7681 */
7682 if (legacy) {
7683 error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7684 kectx->kec_process_flags);
7685 } else {
7686 error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7687 }
7688
7689 /*
7690 * If there isn't space for additional events, return
7691 * a harmless error to stop the processing here
7692 */
7693 if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7694 error = EWOULDBLOCK;
7695 }
7696 return error;
7697 }
7698
7699 /*!
7700 * @function kevent_modern_callback
7701 *
7702 * @brief
7703 * Callback for each individual modern event.
7704 *
7705 * @discussion
7706 * This callback handles kevent_qos/kevent_id events.
7707 */
7708 static int
7709 kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7710 {
7711 return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
7712 }
7713
7714 /*!
7715 * @function kevent_legacy_callback
7716 *
7717 * @brief
7718 * Callback for each individual legacy event.
7719 *
7720 * @discussion
7721 * This callback handles kevent/kevent64 events.
7722 */
7723 static int
7724 kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7725 {
7726 return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
7727 }
7728
7729 /*!
7730 * @function kevent_cleanup
7731 *
7732 * @brief
7733 * Handles the cleanup returning from a kevent call.
7734 *
7735 * @discussion
7736 * kevent entry points will take a reference on workloops,
7737 * and a usecount on the fileglob of kqfiles.
7738 *
7739 * This function undoes this on the exit paths of kevents.
7740 *
7741 * @returns
7742 * The error to return to userspace.
7743 */
7744 static int
7745 kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
7746 {
7747 // poll should not call any codepath leading to this
7748 assert((flags & KEVENT_FLAG_POLL) == 0);
7749
7750 if (flags & KEVENT_FLAG_WORKLOOP) {
7751 kqworkloop_release(kqu.kqwl);
7752 } else if (flags & KEVENT_FLAG_WORKQ) {
7753 /* nothing held */
7754 } else {
7755 fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
7756 }
7757
7758 /* don't restart after signals... */
7759 if (error == ERESTART) {
7760 error = EINTR;
7761 } else if (error == 0) {
7762 /* don't abandon other output just because of residual copyout failures */
7763 (void)kevent_put_data_size(flags, kectx);
7764 }
7765
7766 if (flags & KEVENT_FLAG_PARKING) {
7767 thread_t th = current_thread();
7768 struct uthread *uth = get_bsdthread_info(th);
7769 workq_threadreq_t kqr = uth->uu_kqr_bound;
7770 if (kqr && !(kqr->tr_flags & WORKQ_TR_FLAG_PERMANENT_BIND)) {
7771 thread_unfreeze_base_pri(th);
7772 }
7773 }
7774 return error;
7775 }
7776
7777 /*!
7778 * @function kqueue_process
7779 *
7780 * @brief
7781 * Process the triggered events in a kqueue.
7782 *
7783 * @discussion
7784 * Walk the queued knotes and validate that they are really still triggered
7785 * events by calling the filter routines (if necessary).
7786 *
7787 * For each event that is still considered triggered, invoke the callback
7788 * routine provided.
7789 *
7790 * caller holds a reference on the kqueue.
7791 * kqueue locked on entry and exit - but may be dropped
7792 * kqueue list locked (held for duration of call)
7793 *
7794 * This is only called by kqueue_scan() so that the compiler can inline it.
7795 *
7796 * For kqworkloops that are permanently configured with a bound thread, this
7797 * function parks the bound thread (instead of returning) if there are no events
7798 * or errors to be returned and KEVENT_FLAG_PARKING was specified.
7799 *
7800 * @returns
 * - 0: no event was returned, no other error occurred
7802 * - EBADF: the kqueue is being destroyed (KQ_DRAIN is set)
7803 * - EWOULDBLOCK: (not an error) events have been found and we should return
7804 * - EFAULT: copyout failed
7805 * - filter specific errors
7806 */
7807 static int
7808 kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7809 kevent_callback_t callback)
7810 {
7811 workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
7812 struct knote *kn;
7813 int error = 0, rc = 0;
7814 struct kqtailq *base_queue, *queue;
7815 uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
7816 bool kqwl_permanently_bound = false;
7817
7818 if (kq_type & KQ_WORKQ) {
7819 rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
7820 } else if (kq_type & KQ_WORKLOOP) {
7821 kqwl_permanently_bound = kqr_thread_permanently_bound(kqr);
7822 rc = kqworkloop_begin_processing(kqu.kqwl, flags);
7823 } else {
7824 kqfile_retry:
7825 rc = kqfile_begin_processing(kqu.kqf);
7826 if (rc == EBADF) {
7827 return EBADF;
7828 }
7829 }
7830
7831 if (rc == -1) {
7832 /* Nothing to process */
7833 if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
7834 kqwl_permanently_bound) {
7835 goto kqwl_bound_thread_park;
7836 }
7837 return 0;
7838 }
7839
7840 /*
7841 * loop through the enqueued knotes associated with this request,
7842 * processing each one. Each request may have several queues
7843 * of knotes to process (depending on the type of kqueue) so we
7844 * have to loop through all the queues as long as we have additional
7845 * space.
7846 */
7847
7848 process_again:
7849 if (kq_type & KQ_WORKQ) {
7850 base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index - 1];
7851 } else if (kq_type & KQ_WORKLOOP) {
7852 base_queue = &kqu.kqwl->kqwl_queue[0];
7853 queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
7854 } else {
7855 base_queue = queue = &kqu.kqf->kqf_queue;
7856 }
7857
7858 do {
7859 while ((kn = TAILQ_FIRST(queue)) != NULL) {
7860 error = knote_process(kn, kectx, callback);
7861 if (error == EJUSTRETURN) {
7862 error = 0;
7863 } else if (__improbable(error)) {
7864 /* error is EWOULDBLOCK when the out event array is full */
7865 goto stop_processing;
7866 }
7867 }
7868 } while (queue-- > base_queue);
7869
7870 if (kectx->kec_process_noutputs) {
7871 /* callers will transform this into no error */
7872 error = EWOULDBLOCK;
7873 }
7874
7875 stop_processing:
	/*
	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
	 * we want to unbind the kqrequest from the thread.
	 *
	 * However, because the kq locks are dropped several times during
	 * processing, new knotes may have fired again; in that case we want
	 * end processing to fail so that we process again, until it converges.
	 *
	 * If we have an error or returned events, end processing never fails.
	 */
7886 if (error) {
7887 flags &= ~KEVENT_FLAG_PARKING;
7888 }
7889 if (kq_type & KQ_WORKQ) {
7890 rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
7891 } else if (kq_type & KQ_WORKLOOP) {
7892 rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
7893 } else {
7894 rc = kqfile_end_processing(kqu.kqf);
7895 }
7896
7897 if (__probable(error)) {
7898 return error;
7899 }
7900
7901 if (__probable(rc >= 0)) {
7902 assert(rc == 0 || rc == EBADF);
7903 if (rc == 0) {
7904 if ((kq_type & KQ_WORKLOOP) && (flags & KEVENT_FLAG_PARKING) &&
7905 kqwl_permanently_bound) {
7906 goto kqwl_bound_thread_park;
7907 }
7908 }
7909 return rc;
7910 }
7911
7912 if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
7913 assert(flags & KEVENT_FLAG_PARKING);
7914 goto process_again;
7915 } else {
7916 goto kqfile_retry;
7917 }
7918
7919 kqwl_bound_thread_park:
#if DEVELOPMENT || DEBUG
7921 assert(current_thread() == kqr_thread_fast(kqr));
7922 assert(workq_thread_is_permanently_bound(current_uthread()));
7923 #endif
7924 kqworkloop_bound_thread_park(kqu.kqwl, kqr_thread_fast(kqr));
7925 __builtin_unreachable();
7926 }
7927
7928 /*!
7929 * @function kqueue_scan_continue
7930 *
7931 * @brief
7932 * The continuation used by kqueue_scan for kevent entry points.
7933 *
7934 * @discussion
7935 * Assumes we inherit a use/ref count on the kq or its fileglob.
7936 *
7937 * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7938 * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7939 */
7940 OS_NORETURN OS_NOINLINE
7941 static void
7942 kqueue_scan_continue(void *data, wait_result_t wait_result)
7943 {
7944 uthread_t ut = current_uthread();
7945 kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
7946 int error = 0, flags = kectx->kec_process_flags;
7947 struct kqueue *kq = data;
7948
7949 /*
7950 * only kevent variants call in here, so we know the callback is
7951 * kevent_legacy_callback or kevent_modern_callback.
7952 */
7953 assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);
7954
7955 switch (wait_result) {
7956 case THREAD_AWAKENED:
7957 if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
7958 error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
7959 } else {
7960 error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
7961 }
7962 break;
7963 case THREAD_TIMED_OUT:
7964 error = 0;
7965 break;
7966 case THREAD_INTERRUPTED:
7967 error = EINTR;
7968 break;
7969 case THREAD_RESTART:
7970 error = EBADF;
7971 break;
7972 default:
7973 panic("%s: - invalid wait_result (%d)", __func__, wait_result);
7974 }
7975
7977 error = kevent_cleanup(kq, flags, error, kectx);
7978 *(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
7979 unix_syscall_return(error);
7980 }
7981
7982 /*!
7983 * @function kqueue_scan
7984 *
7985 * @brief
7986 * Scan and wait for events in a kqueue (used by poll & kevent).
7987 *
7988 * @discussion
7989 * Process the triggered events in a kqueue.
7990 *
7991 * If there are no events triggered arrange to wait for them:
7992 * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7993 * - possibly until kectx->kec_deadline expires
7994 *
 * When it waits, and neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
 * is set, it will wait in the kqueue_scan_continue continuation.
7997 *
7998 * poll() will block in place, and KEVENT_FLAG_KERNEL calls
7999 * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
8000 *
8001 * @param kqu
8002 * The kqueue being scanned.
8003 *
8004 * @param flags
8005 * The KEVENT_FLAG_* flags for this call.
8006 *
8007 * @param kectx
8008 * The context used for this scan.
8009 * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
8010 *
8011 * @param callback
 * The callback to be called on events successfully processed.
8013 * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
8014 */
8015 int
8016 kqueue_scan(kqueue_t kqu, int flags, kevent_ctx_t kectx,
8017 kevent_callback_t callback)
8018 {
8019 int error;
8020
8021 for (;;) {
8022 kqlock(kqu);
8023 error = kqueue_process(kqu, flags, kectx, callback);
8024
8025 /*
8026 * If we got an error, events returned (EWOULDBLOCK)
8027 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
8028 * just return.
8029 */
8030 if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
8031 kqunlock(kqu);
8032 return error == EWOULDBLOCK ? 0 : error;
8033 }
8034
8035 assert((kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
8036
8037 kqu.kqf->kqf_state |= KQ_SLEEP;
8038 assert_wait_deadline(&kqu.kqf->kqf_count, THREAD_ABORTSAFE,
8039 kectx->kec_deadline);
8040 kqunlock(kqu);
8041
8042 if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
8043 thread_block_parameter(kqueue_scan_continue, kqu.kqf);
8044 __builtin_unreachable();
8045 }
8046
8047 wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
8048 switch (wr) {
8049 case THREAD_AWAKENED:
8050 break;
8051 case THREAD_TIMED_OUT:
8052 return 0;
8053 case THREAD_INTERRUPTED:
8054 return EINTR;
8055 case THREAD_RESTART:
8056 return EBADF;
8057 default:
8058 panic("%s: - bad wait_result (%d)", __func__, wr);
8059 }
8060 }
8061 }
8062
8063 /*!
8064 * @function kevent_internal
8065 *
8066 * @brief
8067 * Common kevent code.
8068 *
8069 * @discussion
8070 * Needs to be inlined to specialize for legacy or modern and
8071 * eliminate dead code.
8072 *
8073 * This is the core logic of kevent entry points, that will:
8074 * - register kevents
8075 * - optionally scan the kqueue for events
8076 *
8077 * The caller is giving kevent_internal a reference on the kqueue
8078 * or its fileproc that needs to be cleaned up by kevent_cleanup().
8079 */
8080 OS_ALWAYS_INLINE
8081 static inline int
8082 kevent_internal(kqueue_t kqu,
8083 user_addr_t changelist, int nchanges,
8084 user_addr_t ueventlist, int nevents,
8085 int flags, kevent_ctx_t kectx, int32_t *retval,
8086 bool legacy)
8087 {
8088 int error = 0, noutputs = 0, register_rc;
8089
8090 /* only bound threads can receive events on workloops */
8091 if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
8092 #if CONFIG_WORKLOOP_DEBUG
8093 UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
8094 .uu_kqid = kqu.kqwl->kqwl_dynamicid,
8095 .uu_kq = error ? NULL : kqu.kq,
8096 .uu_error = error,
8097 .uu_nchanges = nchanges,
8098 .uu_nevents = nevents,
8099 .uu_flags = flags,
8100 });
8101 #endif // CONFIG_WORKLOOP_DEBUG
8102
8103 if (flags & KEVENT_FLAG_KERNEL) {
8104 /* see kevent_workq_internal */
8105 error = copyout(&kqu.kqwl->kqwl_dynamicid,
8106 ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
8107 kectx->kec_data_resid -= sizeof(kqueue_id_t);
8108 if (__improbable(error)) {
8109 goto out;
8110 }
8111 }
8112
8113 if (kevent_args_requesting_events(flags, nevents)) {
			/*
			 * Disable the R2K notification while doing a register: if the
			 * caller wants events too, we don't want the AST to be set,
			 * since we will process these events soon anyway.
			 */
8119 kqlock(kqu);
8120 kqu.kq->kq_state &= ~KQ_R2K_ARMED;
8121 kqunlock(kqu);
8122 flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
8123 }
8124 }
8125
8126 /* register all the change requests the user provided... */
8127 while (nchanges > 0 && error == 0) {
8128 struct kevent_qos_s kev;
8129 struct knote *kn = NULL;
8130
8131 if (legacy) {
8132 error = kevent_legacy_copyin(&changelist, &kev, flags);
8133 } else {
8134 error = kevent_modern_copyin(&changelist, &kev);
8135 }
8136 if (error) {
8137 break;
8138 }
8139
8140 register_rc = kevent_register(kqu.kq, &kev, &kn);
8141 if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
8142 thread_t thread = current_thread();
8143
8144 kqlock_held(kqu);
8145
8146 if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
8147 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
8148 }
8149
8150 // f_post_register_wait is meant to call a continuation and not to
8151 // return, which is why we don't support FILTER_REGISTER_WAIT if
8152 // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
8153 // waits isn't the last.
8154 //
8155 // It is implementable, but not used by any userspace code at the
8156 // moment, so for now return ENOTSUP if someone tries to do it.
8157 if (nchanges == 1 && noutputs < nevents &&
8158 (flags & KEVENT_FLAG_KERNEL) == 0 &&
8159 (flags & KEVENT_FLAG_PARKING) == 0 &&
8160 (flags & KEVENT_FLAG_ERROR_EVENTS) &&
8161 (flags & KEVENT_FLAG_WORKLOOP)) {
8162 uthread_t ut = get_bsdthread_info(thread);
8163
8164 /*
8165 * store the continuation/completion data in the uthread
8166 *
8167 * Note: the kectx aliases with this,
8168 * and is destroyed in the process.
8169 */
8170 ut->uu_save.uus_kevent_register = (struct _kevent_register){
8171 .kev = kev,
8172 .kqwl = kqu.kqwl,
8173 .eventout = noutputs,
8174 .ueventlist = ueventlist,
8175 };
8176 knote_fops(kn)->f_post_register_wait(ut, kn,
8177 &ut->uu_save.uus_kevent_register);
8178 __builtin_unreachable();
8179 }
8180 kqunlock(kqu);
8181
8182 kev.flags |= EV_ERROR;
8183 kev.data = ENOTSUP;
8184 } else {
8185 assert((register_rc & FILTER_REGISTER_WAIT) == 0);
8186 }
8187
8188 // keep in sync with kevent_register_wait_return()
8189 if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
8190 if ((kev.flags & EV_ERROR) == 0) {
8191 kev.flags |= EV_ERROR;
8192 kev.data = 0;
8193 }
8194 if (legacy) {
8195 error = kevent_legacy_copyout(&kev, &ueventlist, flags);
8196 } else {
8197 error = kevent_modern_copyout(&kev, &ueventlist);
8198 }
8199 if (error == 0) {
8200 noutputs++;
8201 }
8202 } else if (kev.flags & EV_ERROR) {
8203 error = (int)kev.data;
8204 }
8205 nchanges--;
8206 }
8207
8208 if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
8209 nevents > 0 && noutputs == 0 && error == 0) {
8210 kectx->kec_process_flags = flags;
8211 kectx->kec_process_nevents = nevents;
8212 kectx->kec_process_noutputs = 0;
8213 kectx->kec_process_eventlist = ueventlist;
8214
8215 if (legacy) {
8216 error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
8217 } else {
8218 error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
8219 }
8220
8221 noutputs = kectx->kec_process_noutputs;
8222 } else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
		/*
		 * If we didn't go through kqworkloop_end_processing(),
		 * we need to do it here.
		 *
		 * kqueue_scan() calls kqworkloop_end_processing(),
		 * so we only need to do it if we didn't scan.
		 */
8230 kqlock(kqu);
8231 kqworkloop_end_processing(kqu.kqwl, 0, 0);
8232 kqunlock(kqu);
8233 }
8234
8235 *retval = noutputs;
8236 out:
8237 return kevent_cleanup(kqu.kq, flags, error, kectx);
8238 }
8239
8240 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
8241
8242 /*!
8243 * @function kevent_modern_internal
8244 *
8245 * @brief
8246 * The backend of the kevent_id and kevent_workq_internal entry points.
8247 *
8248 * @discussion
 * Kept out of line (OS_NOINLINE) due to the number of arguments.
8250 */
8251 OS_NOINLINE
8252 static int
8253 kevent_modern_internal(kqueue_t kqu,
8254 user_addr_t changelist, int nchanges,
8255 user_addr_t ueventlist, int nevents,
8256 int flags, kevent_ctx_t kectx, int32_t *retval)
8257 {
8258 return kevent_internal(kqu.kq, changelist, nchanges,
8259 ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
8260 }
8261
8262 /*!
8263 * @function kevent_id
8264 *
8265 * @brief
8266 * The kevent_id() syscall.
8267 */
8268 int
8269 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
8270 {
8271 int error, flags = uap->flags & KEVENT_FLAG_USER;
8272 uthread_t uth = current_uthread();
8273 workq_threadreq_t kqr = uth->uu_kqr_bound;
8274 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8275 kqueue_t kqu;
8276
8277 flags = kevent_adjust_flags_for_proc(p, flags);
8278 flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;
8279
8280 if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
8281 KEVENT_FLAG_WORKLOOP)) {
8282 return EINVAL;
8283 }
8284
8285 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
8286 if (__improbable(error)) {
8287 return error;
8288 }
8289
8290 kectx->kec_deadline = 0;
8291 kectx->kec_fp = NULL;
8292 kectx->kec_fd = -1;
	/* the kec_process_* fields are only filled if kqueue_scan() is called */
8294
8295 /*
	 * Get the kq we are going to be working on.
8297 * As a fastpath, look at the currently bound workloop.
8298 */
8299 kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
8300 if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
8301 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
8302 return EEXIST;
8303 }
8304 kqworkloop_retain(kqu.kqwl);
8305 } else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
8306 return EXDEV;
8307 } else {
8308 error = kqworkloop_get_or_create(p, uap->id, NULL, NULL,
8309 flags, &kqu.kqwl);
8310 if (__improbable(error)) {
8311 return error;
8312 }
8313 }
8314
8315 return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
8316 uap->eventlist, uap->nevents, flags, kectx, retval);
8317 }
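
/*
 * Illustrative sketch (not part of the original sources): a minimal
 * kevent_id() call against a dynamic workloop, assuming the private
 * userspace wrapper mirrors struct kevent_id_args handled above. In
 * practice this interface is driven by libdispatch/libpthread rather
 * than called directly; `wl_id` and `fd` are hypothetical.
 *
 *	kqueue_id_t wl_id = 0x1234;
 *	struct kevent_qos_s kev = {
 *		.ident  = (uint64_t)fd,
 *		.filter = EVFILT_READ,
 *		.flags  = EV_ADD | EV_ENABLE,
 *	};
 *	int n = kevent_id(wl_id, &kev, 1, NULL, 0, NULL, NULL,
 *	    KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
 */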
8318
/*!
8320 * @function kevent_workq_internal
8321 *
8322 * @discussion
8323 * This function is exported for the sake of the workqueue subsystem.
8324 *
8325 * It is called in two ways:
 * - when a thread is about to go to userspace to ask for pending events
 * - when a thread is returning from userspace, bringing events back
 *
 * The workqueue subsystem will only use the following flags:
8330 * - KEVENT_FLAG_STACK_DATA (always)
8331 * - KEVENT_FLAG_IMMEDIATE (always)
8332 * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
8333 * userspace).
8334 *
8335 * It implicitly acts on the bound kqueue, and for the case of workloops
8336 * will copyout the kqueue ID before anything else.
8337 *
8338 *
8339 * Pthread will have setup the various arguments to fit this stack layout:
8340 *
8341 * +-------....----+--------------+-----------+--------------------+
8342 * | user stack | data avail | nevents | pthread_self() |
8343 * +-------....----+--------------+-----------+--------------------+
8344 * ^ ^
8345 * data_out eventlist
8346 *
8347 * When a workloop is used, the workloop ID is copied out right before
8348 * the eventlist and is taken from the data buffer.
8349 *
8350 * @warning
 * This function is carefully tailored to not make any call except the final tail
8352 * call into kevent_modern_internal. (LTO inlines current_uthread()).
8353 *
8354 * This function is performance sensitive due to the workq subsystem.
8355 */
8356 int
8357 kevent_workq_internal(struct proc *p,
8358 user_addr_t changelist, int nchanges,
8359 user_addr_t eventlist, int nevents,
8360 user_addr_t data_out, user_size_t *data_available,
8361 unsigned int flags, int32_t *retval)
8362 {
8363 uthread_t uth = current_uthread();
8364 workq_threadreq_t kqr = uth->uu_kqr_bound;
8365 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8366 kqueue_t kqu;
8367
8368 assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
8369 flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));
8370
8371 kectx->kec_data_out = data_out;
8372 kectx->kec_data_avail = (uint64_t)data_available;
8373 kectx->kec_data_size = *data_available;
8374 kectx->kec_data_resid = *data_available;
8375 kectx->kec_deadline = 0;
8376 kectx->kec_fp = NULL;
8377 kectx->kec_fd = -1;
	/* the kec_process_* fields are only filled if kqueue_scan() is called */
8379
8380 flags = kevent_adjust_flags_for_proc(p, flags);
8381
8382 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
8383 kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
8384 kqworkloop_retain(kqu.kqwl);
8385
8386 flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
8387 KEVENT_FLAG_KERNEL;
8388 } else {
8389 kqu.kqwq = p->p_fd.fd_wqkqueue;
8390
8391 flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
8392 }
8393
8394 return kevent_modern_internal(kqu, changelist, nchanges,
8395 eventlist, nevents, flags, kectx, retval);
8396 }
8397
8398 /*!
8399 * @function kevent_qos
8400 *
8401 * @brief
8402 * The kevent_qos() syscall.
8403 */
8404 int
8405 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
8406 {
8407 uthread_t uth = current_uthread();
8408 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8409 int error, flags = uap->flags & KEVENT_FLAG_USER;
8410 struct kqueue *kq;
8411
8412 if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
8413 return EINVAL;
8414 }
8415
8416 flags = kevent_adjust_flags_for_proc(p, flags);
8417
8418 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
8419 if (__improbable(error)) {
8420 return error;
8421 }
8422
8423 kectx->kec_deadline = 0;
8424 kectx->kec_fp = NULL;
8425 kectx->kec_fd = uap->fd;
	/* the kec_process_* fields are only filled if kqueue_scan() is called */
8427
8428 /* get the kq we are going to be working on */
8429 if (__probable(flags & KEVENT_FLAG_WORKQ)) {
8430 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8431 } else {
8432 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8433 }
8434 if (__improbable(error)) {
8435 return error;
8436 }
8437
8438 return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
8439 uap->eventlist, uap->nevents, flags, kectx, retval);
8440 }
8441
8442 #pragma mark legacy syscalls: kevent, kevent64
8443
8444 /*!
8445 * @function kevent_legacy_get_deadline
8446 *
8447 * @brief
8448 * Compute the deadline for the legacy kevent syscalls.
8449 *
8450 * @discussion
8451 * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
8452 * as this takes precedence over the deadline.
8453 *
8454 * This function will fail if utimeout is USER_ADDR_NULL
8455 * (the caller should check).
8456 */
8457 static int
8458 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
8459 {
8460 struct timespec ts;
8461
8462 if (flags & KEVENT_FLAG_PROC64) {
8463 struct user64_timespec ts64;
8464 int error = copyin(utimeout, &ts64, sizeof(ts64));
8465 if (__improbable(error)) {
8466 return error;
8467 }
8468 ts.tv_sec = (unsigned long)ts64.tv_sec;
8469 ts.tv_nsec = (long)ts64.tv_nsec;
8470 } else {
8471 struct user32_timespec ts32;
8472 int error = copyin(utimeout, &ts32, sizeof(ts32));
8473 if (__improbable(error)) {
8474 return error;
8475 }
8476 ts.tv_sec = ts32.tv_sec;
8477 ts.tv_nsec = ts32.tv_nsec;
8478 }
8479 if (!timespec_is_valid(&ts)) {
8480 return EINVAL;
8481 }
8482
8483 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
8484 return 0;
8485 }
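
/*
 * Worked example (illustrative): for a relative timeout of
 * { .tv_sec = 1, .tv_nsec = 500000000 }, tstoabstime() converts the 1.5s
 * interval into mach absolute time units, and
 * clock_absolutetime_interval_to_deadline() adds that interval to the
 * current time to produce the absolute deadline stored in
 * kectx->kec_deadline, which kqueue_scan() later passes to
 * assert_wait_deadline().
 */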
8486
8487 /*!
8488 * @function kevent_legacy_internal
8489 *
8490 * @brief
8491 * The core implementation for kevent and kevent64
8492 */
8493 OS_NOINLINE
8494 static int
8495 kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
8496 int32_t *retval, int flags)
8497 {
8498 uthread_t uth = current_uthread();
8499 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
8500 struct kqueue *kq;
8501 int error;
8502
8503 if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
8504 return EINVAL;
8505 }
8506
8507 flags = kevent_adjust_flags_for_proc(p, flags);
8508
8509 kectx->kec_data_out = 0;
8510 kectx->kec_data_avail = 0;
8511 kectx->kec_data_size = 0;
8512 kectx->kec_data_resid = 0;
8513 kectx->kec_deadline = 0;
8514 kectx->kec_fp = NULL;
8515 kectx->kec_fd = uap->fd;
	/* the kec_process_* fields are only filled if kqueue_scan() is called */
8517
8518 /* convert timeout to absolute - if we have one (and not immediate) */
8519 if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
8520 error = kevent_legacy_get_deadline(flags, uap->timeout,
8521 &kectx->kec_deadline);
8522 if (__improbable(error)) {
8523 return error;
8524 }
8525 }
8526
8527 /* get the kq we are going to be working on */
8528 if (flags & KEVENT_FLAG_WORKQ) {
8529 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8530 } else {
8531 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8532 }
8533 if (__improbable(error)) {
8534 return error;
8535 }
8536
8537 return kevent_internal(kq, uap->changelist, uap->nchanges,
8538 uap->eventlist, uap->nevents, flags, kectx, retval,
8539 /*legacy*/ true);
8540 }
8541
8542 /*!
8543 * @function kevent
8544 *
8545 * @brief
8546 * The legacy kevent() syscall.
8547 */
8548 int
8549 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
8550 {
8551 struct kevent64_args args = {
8552 .fd = uap->fd,
8553 .changelist = uap->changelist,
8554 .nchanges = uap->nchanges,
8555 .eventlist = uap->eventlist,
8556 .nevents = uap->nevents,
8557 .timeout = uap->timeout,
8558 };
8559
8560 return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
8561 }
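
/*
 * Illustrative userspace usage of the legacy interface serviced by the
 * syscall above (a sketch, not part of this file; `sock_fd` is a
 * hypothetical readable descriptor):
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent change, out;
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	EV_SET(&change, sock_fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	int n = kevent(kq, &change, 1, &out, 1, &ts);
 *	// n == 1: `out` describes the triggered event; n == 0: timed out;
 *	// n == -1: errno holds the error.
 */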
8562
8563 /*!
8564 * @function kevent64
8565 *
8566 * @brief
8567 * The legacy kevent64() syscall.
8568 */
8569 int
8570 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
8571 {
8572 int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
8573 return kevent_legacy_internal(p, uap, retval, flags);
8574 }
8575
8576 #pragma mark - socket interface
8577
8578 #if SOCKETS
8579 #include <sys/param.h>
8580 #include <sys/socket.h>
8581 #include <sys/protosw.h>
8582 #include <sys/domain.h>
8583 #include <sys/mbuf.h>
8584 #include <sys/kern_event.h>
8585 #include <sys/malloc.h>
8586 #include <sys/sys_domain.h>
8587 #include <sys/syslog.h>
8588
8589 #ifndef ROUNDUP64
8590 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8591 #endif
8592
8593 #ifndef ADVANCE64
8594 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8595 #endif
8596
8597 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8598 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8599
8600 static int kev_attach(struct socket *so, int proto, struct proc *p);
8601 static int kev_detach(struct socket *so);
8602 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8603 struct ifnet *ifp, struct proc *p);
8604 static lck_mtx_t * event_getlock(struct socket *, int);
8605 static int event_lock(struct socket *, int, void *);
8606 static int event_unlock(struct socket *, int, void *);
8607
8608 static int event_sofreelastref(struct socket *);
8609 static void kev_delete(struct kern_event_pcb *);
8610
8611 static struct pr_usrreqs event_usrreqs = {
8612 .pru_attach = kev_attach,
8613 .pru_control = kev_control,
8614 .pru_detach = kev_detach,
8615 .pru_soreceive = soreceive,
8616 };
8617
8618 static struct protosw eventsw[] = {
8619 {
8620 .pr_type = SOCK_RAW,
8621 .pr_protocol = SYSPROTO_EVENT,
8622 .pr_flags = PR_ATOMIC,
8623 .pr_usrreqs = &event_usrreqs,
8624 .pr_lock = event_lock,
8625 .pr_unlock = event_unlock,
8626 .pr_getlock = event_getlock,
8627 }
8628 };
8629
8630 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8631 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8632
8633 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8634 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8635
8636 struct kevtstat kevtstat;
8637 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8638 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8639 kevt_getstat, "S,kevtstat", "");
8640
8641 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8642 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8643 kevt_pcblist, "S,xkevtpcb", "");
8644
8645 SYSCTL_UINT(_net_systm_kevt, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
8646 (unsigned int *)&kevtstat.kes_pcbcount, 0, "");
8647
8648 static lck_mtx_t *
8649 event_getlock(struct socket *so, int flags)
8650 {
8651 #pragma unused(flags)
8652 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8653
8654 if (so->so_pcb != NULL) {
8655 if (so->so_usecount < 0) {
8656 panic("%s: so=%p usecount=%d lrh= %s", __func__,
8657 so, so->so_usecount, solockhistory_nr(so));
			/* NOTREACHED */
		}
8660 } else {
8661 panic("%s: so=%p NULL NO so_pcb %s", __func__,
8662 so, solockhistory_nr(so));
8663 /* NOTREACHED */
8664 }
8665 return &ev_pcb->evp_mtx;
8666 }
8667
8668 static int
8669 event_lock(struct socket *so, int refcount, void *lr)
8670 {
8671 void *lr_saved;
8672
8673 if (lr == NULL) {
8674 lr_saved = __builtin_return_address(0);
8675 } else {
8676 lr_saved = lr;
8677 }
8678
8679 if (so->so_pcb != NULL) {
8680 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8681 } else {
8682 panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
8683 so, lr_saved, solockhistory_nr(so));
8684 /* NOTREACHED */
8685 }
8686
8687 if (so->so_usecount < 0) {
8688 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s", __func__,
8689 so, so->so_pcb, lr_saved, so->so_usecount,
8690 solockhistory_nr(so));
8691 /* NOTREACHED */
8692 }
8693
8694 if (refcount) {
8695 so->so_usecount++;
8696 }
8697
8698 so->lock_lr[so->next_lock_lr] = lr_saved;
8699 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
8700 return 0;
8701 }
8702
8703 static int
8704 event_unlock(struct socket *so, int refcount, void *lr)
8705 {
8706 void *lr_saved;
8707 lck_mtx_t *mutex_held;
8708
8709 if (lr == NULL) {
8710 lr_saved = __builtin_return_address(0);
8711 } else {
8712 lr_saved = lr;
8713 }
8714
8715 if (refcount) {
8716 so->so_usecount--;
8717 }
8718 if (so->so_usecount < 0) {
8719 panic("%s: so=%p usecount=%d lrh= %s", __func__,
8720 so, so->so_usecount, solockhistory_nr(so));
8721 /* NOTREACHED */
8722 }
8723 if (so->so_pcb == NULL) {
8724 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s", __func__,
8725 so, so->so_usecount, (void *)lr_saved,
8726 solockhistory_nr(so));
8727 /* NOTREACHED */
8728 }
8729 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8730
8731 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8732 so->unlock_lr[so->next_unlock_lr] = lr_saved;
8733 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
8734
8735 if (so->so_usecount == 0) {
8736 VERIFY(so->so_flags & SOF_PCBCLEARING);
8737 event_sofreelastref(so);
8738 } else {
8739 lck_mtx_unlock(mutex_held);
8740 }
8741
8742 return 0;
8743 }
8744
8745 static int
8746 event_sofreelastref(struct socket *so)
8747 {
8748 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8749
8750 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8751
8752 so->so_pcb = NULL;
8753
8754 /*
8755 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending a record to the receive socket buffer, since sbwakeup()
8757 * may release the socket lock otherwise.
8758 */
8759 so->so_rcv.sb_flags &= ~SB_UPCALL;
8760 so->so_snd.sb_flags &= ~SB_UPCALL;
8761 so->so_event = sonullevent;
8762 lck_mtx_unlock(&(ev_pcb->evp_mtx));
8763
8764 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8765 lck_rw_lock_exclusive(&kev_rwlock);
8766 LIST_REMOVE(ev_pcb, evp_link);
8767 kevtstat.kes_pcbcount--;
8768 kevtstat.kes_gencnt++;
8769 lck_rw_done(&kev_rwlock);
8770 kev_delete(ev_pcb);
8771
8772 sofreelastref(so, 1);
8773 return 0;
8774 }
8775
8776 static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));
8777
8778 static
8779 struct kern_event_head kern_event_head;
8780
8781 static u_int32_t static_event_id = 0;
8782
8783 static KALLOC_TYPE_DEFINE(ev_pcb_zone, struct kern_event_pcb, NET_KT_DEFAULT);
8784
8785 /*
 * Install the protosw entries for the NKE manager. Invoked at extension load time.
8787 */
8788 void
8789 kern_event_init(struct domain *dp)
8790 {
8791 struct protosw *pr;
8792 int i;
8793
8794 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8795 VERIFY(dp == systemdomain);
8796
8797 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8798 net_add_proto(pr, dp, 1);
8799 }
8800 }
8801
8802 static int
8803 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8804 {
8805 int error = 0;
8806 struct kern_event_pcb *ev_pcb;
8807
8808 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8809 if (error != 0) {
8810 return error;
8811 }
8812
8813 ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
8814 lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);
8815
8816 ev_pcb->evp_socket = so;
8817 ev_pcb->evp_vendor_code_filter = 0xffffffff;
8818
8819 so->so_pcb = (caddr_t) ev_pcb;
8820 lck_rw_lock_exclusive(&kev_rwlock);
8821 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8822 kevtstat.kes_pcbcount++;
8823 kevtstat.kes_gencnt++;
8824 lck_rw_done(&kev_rwlock);
8825
8826 return error;
8827 }
8828
8829 static void
8830 kev_delete(struct kern_event_pcb *ev_pcb)
8831 {
8832 VERIFY(ev_pcb != NULL);
8833 lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
8834 zfree(ev_pcb_zone, ev_pcb);
8835 }
8836
8837 static int
8838 kev_detach(struct socket *so)
8839 {
8840 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8841
8842 if (ev_pcb != NULL) {
8843 soisdisconnected(so);
8844 so->so_flags |= SOF_PCBCLEARING;
8845 }
8846
8847 return 0;
8848 }
8849
8850 /*
8851 * For now, kev_vendor_code and mbuf_tags use the same
8852 * mechanism.
8853 */
8854 errno_t
8855 kev_vendor_code_find(
8856 const char *string,
8857 u_int32_t *out_vendor_code)
8858 {
8859 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8860 return EINVAL;
8861 }
8862 return net_str_id_find_internal(string, out_vendor_code,
8863 NSI_VENDOR_CODE, 1);
8864 }
8865
8866 errno_t
8867 kev_msg_post(struct kev_msg *event_msg)
8868 {
8869 mbuf_tag_id_t min_vendor, max_vendor;
8870
8871 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8872
8873 if (event_msg == NULL) {
8874 return EINVAL;
8875 }
8876
8877 /*
8878 * Limit third parties to posting events for registered vendor codes
8879 * only
8880 */
8881 if (event_msg->vendor_code < min_vendor ||
8882 event_msg->vendor_code > max_vendor) {
8883 os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8884 return EINVAL;
8885 }
8886 return kev_post_msg(event_msg);
8887 }
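
/*
 * Illustrative in-kernel usage (a sketch, not part of this file): a kext
 * posting an event for its registered vendor code. The vendor string,
 * class/subclass/event values and payload are hypothetical; only the
 * struct kev_msg fields consumed by kev_post_msg_internal() are shown.
 *
 *	u_int32_t vendor;
 *	uint32_t payload = 42;
 *
 *	if (kev_vendor_code_find("com.example.driver", &vendor) == 0) {
 *		struct kev_msg msg = {
 *			.vendor_code  = vendor,
 *			.kev_class    = 1,      // hypothetical class
 *			.kev_subclass = 1,      // hypothetical subclass
 *			.event_code   = 1,      // hypothetical event
 *		};
 *		msg.dv[0].data_ptr    = &payload;
 *		msg.dv[0].data_length = sizeof(payload);
 *		(void)kev_msg_post(&msg);
 *	}
 */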
8888
8889 static int
8890 kev_post_msg_internal(struct kev_msg *event_msg, int wait)
8891 {
8892 struct mbuf *m, *m2;
8893 struct kern_event_pcb *ev_pcb;
8894 struct kern_event_msg *ev;
8895 char *tmp;
8896 u_int32_t total_size;
8897 int i;
8898
8899 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
8900 /*
8901 * Special hook for ALF state updates
8902 */
8903 if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
8904 event_msg->kev_class == KEV_NKE_CLASS &&
8905 event_msg->kev_subclass == KEV_NKE_ALF_SUBCLASS &&
8906 event_msg->event_code == KEV_NKE_ALF_STATE_CHANGED) {
8907 #if MACH_ASSERT
8908 os_log_info(OS_LOG_DEFAULT, "KEV_NKE_ALF_STATE_CHANGED posted");
8909 #endif /* MACH_ASSERT */
8910 net_filter_event_mark(NET_FILTER_EVENT_ALF,
8911 net_check_compatible_alf());
8912 }
8913 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
8914
8915 /* Verify the message is small enough to fit in one mbuf w/o cluster */
8916 total_size = KEV_MSG_HEADER_SIZE;
8917
8918 for (i = 0; i < 5; i++) {
8919 if (event_msg->dv[i].data_length == 0) {
8920 break;
8921 }
8922 total_size += event_msg->dv[i].data_length;
8923 }
8924
8925 if (total_size > MLEN) {
8926 os_atomic_inc(&kevtstat.kes_toobig, relaxed);
8927 return EMSGSIZE;
8928 }
8929
8930 m = m_get(wait, MT_DATA);
	if (m == NULL) {
8932 os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8933 return ENOMEM;
8934 }
8935 ev = mtod(m, struct kern_event_msg *);
8936 total_size = KEV_MSG_HEADER_SIZE;
8937
8938 tmp = (char *) &ev->event_data[0];
8939 for (i = 0; i < 5; i++) {
8940 if (event_msg->dv[i].data_length == 0) {
8941 break;
8942 }
8943
8944 total_size += event_msg->dv[i].data_length;
8945 bcopy(event_msg->dv[i].data_ptr, tmp,
8946 event_msg->dv[i].data_length);
8947 tmp += event_msg->dv[i].data_length;
8948 }
8949
8950 ev->id = ++static_event_id;
8951 ev->total_size = total_size;
8952 ev->vendor_code = event_msg->vendor_code;
8953 ev->kev_class = event_msg->kev_class;
8954 ev->kev_subclass = event_msg->kev_subclass;
8955 ev->event_code = event_msg->event_code;
8956
8957 m->m_len = total_size;
8958 lck_rw_lock_shared(&kev_rwlock);
8959 for (ev_pcb = LIST_FIRST(&kern_event_head);
8960 ev_pcb;
8961 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8962 lck_mtx_lock(&ev_pcb->evp_mtx);
8963 if (ev_pcb->evp_socket->so_pcb == NULL) {
8964 lck_mtx_unlock(&ev_pcb->evp_mtx);
8965 continue;
8966 }
8967 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8968 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8969 lck_mtx_unlock(&ev_pcb->evp_mtx);
8970 continue;
8971 }
8972
8973 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
8974 if (ev_pcb->evp_class_filter != ev->kev_class) {
8975 lck_mtx_unlock(&ev_pcb->evp_mtx);
8976 continue;
8977 }
8978
8979 if ((ev_pcb->evp_subclass_filter !=
8980 KEV_ANY_SUBCLASS) &&
8981 (ev_pcb->evp_subclass_filter !=
8982 ev->kev_subclass)) {
8983 lck_mtx_unlock(&ev_pcb->evp_mtx);
8984 continue;
8985 }
8986 }
8987 }
8988
8989 m2 = m_copym(m, 0, m->m_len, wait);
		if (m2 == NULL) {
8991 os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8992 m_free(m);
8993 lck_mtx_unlock(&ev_pcb->evp_mtx);
8994 lck_rw_done(&kev_rwlock);
8995 return ENOMEM;
8996 }
8997 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
8998 /*
8999 * We use "m" for the socket stats as it would be
9000 * unsafe to use "m2"
9001 */
9002 so_inc_recv_data_stat(ev_pcb->evp_socket,
9003 1, m->m_len);
9004
9005 sorwakeup(ev_pcb->evp_socket);
9006 os_atomic_inc(&kevtstat.kes_posted, relaxed);
9007 } else {
9008 os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
9009 }
9010 lck_mtx_unlock(&ev_pcb->evp_mtx);
9011 }
9012 m_free(m);
9013 lck_rw_done(&kev_rwlock);
9014
9015 return 0;
9016 }
9017
9018 int
9019 kev_post_msg(struct kev_msg *event_msg)
9020 {
9021 return kev_post_msg_internal(event_msg, M_WAIT);
9022 }
9023
9024 int
9025 kev_post_msg_nowait(struct kev_msg *event_msg)
9026 {
9027 return kev_post_msg_internal(event_msg, M_NOWAIT);
9028 }
9029
9030 static int
9031 kev_control(struct socket *so,
9032 u_long cmd,
9033 caddr_t data,
9034 __unused struct ifnet *ifp,
9035 __unused struct proc *p)
9036 {
9037 struct kev_request *kev_req = (struct kev_request *) data;
9038 struct kern_event_pcb *ev_pcb;
9039 struct kev_vendor_code *kev_vendor;
9040 u_int32_t *id_value = (u_int32_t *) data;
9041
9042 switch (cmd) {
9043 case SIOCGKEVID:
9044 *id_value = static_event_id;
9045 break;
9046 case SIOCSKEVFILT:
9047 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9048 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
9049 ev_pcb->evp_class_filter = kev_req->kev_class;
9050 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
9051 break;
9052 case SIOCGKEVFILT:
9053 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9054 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
9055 kev_req->kev_class = ev_pcb->evp_class_filter;
9056 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
9057 break;
9058 case SIOCGKEVVENDOR:
9059 kev_vendor = (struct kev_vendor_code *)data;
		/* Make sure the string is NUL-terminated */
9061 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
9062 return net_str_id_find_internal(kev_vendor->vendor_string,
9063 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
9064 default:
9065 return ENOTSUP;
9066 }
9067
9068 return 0;
9069 }
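
/*
 * Illustrative userspace usage of the kernel event socket serviced by the
 * handlers above (a sketch; error handling omitted):
 *
 *	#include <sys/socket.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/kern_event.h>
 *
 *	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_VENDOR_APPLE,
 *		.kev_class    = KEV_ANY_CLASS,
 *		.kev_subclass = KEV_ANY_SUBCLASS,
 *	};
 *	ioctl(s, SIOCSKEVFILT, &req);     // install the filter (SIOCSKEVFILT above)
 *
 *	char buf[1024];
 *	ssize_t n = recv(s, buf, sizeof(buf), 0);
 *	struct kern_event_msg *ev = (struct kern_event_msg *)(void *)buf;
 *	// ev->vendor_code, ev->kev_class and ev->event_code identify the event;
 *	// ev->event_data holds the posted payload.
 */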
9070
9071 int
9072 kevt_getstat SYSCTL_HANDLER_ARGS
9073 {
9074 #pragma unused(oidp, arg1, arg2)
9075 int error = 0;
9076
9077 lck_rw_lock_shared(&kev_rwlock);
9078
9079 if (req->newptr != USER_ADDR_NULL) {
9080 error = EPERM;
9081 goto done;
9082 }
9083 if (req->oldptr == USER_ADDR_NULL) {
9084 req->oldidx = sizeof(struct kevtstat);
9085 goto done;
9086 }
9087
9088 error = SYSCTL_OUT(req, &kevtstat,
9089 MIN(sizeof(struct kevtstat), req->oldlen));
9090 done:
9091 lck_rw_done(&kev_rwlock);
9092
9093 return error;
9094 }
9095
9096 __private_extern__ int
9097 kevt_pcblist SYSCTL_HANDLER_ARGS
9098 {
9099 #pragma unused(oidp, arg1, arg2)
9100 int error = 0;
9101 uint64_t n, i;
9102 struct xsystmgen xsg;
9103 void *buf = NULL;
9104 size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
9105 ROUNDUP64(sizeof(struct xsocket_n)) +
9106 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
9107 ROUNDUP64(sizeof(struct xsockstat_n));
9108 struct kern_event_pcb *ev_pcb;
9109
9110 buf = kalloc_data(item_size, Z_WAITOK_ZERO_NOFAIL);
9111
9112 lck_rw_lock_shared(&kev_rwlock);
9113
9114 n = kevtstat.kes_pcbcount;
9115
9116 if (req->oldptr == USER_ADDR_NULL) {
9117 req->oldidx = (size_t) ((n + n / 8) * item_size);
9118 goto done;
9119 }
9120 if (req->newptr != USER_ADDR_NULL) {
9121 error = EPERM;
9122 goto done;
9123 }
9124 bzero(&xsg, sizeof(xsg));
9125 xsg.xg_len = sizeof(xsg);
9126 xsg.xg_count = n;
9127 xsg.xg_gen = kevtstat.kes_gencnt;
9128 xsg.xg_sogen = so_gencnt;
9129 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
9130 if (error) {
9131 goto done;
9132 }
9133 /*
9134 * We are done if there is no pcb
9135 */
9136 if (n == 0) {
9137 goto done;
9138 }
9139
9141 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
9142 i < n && ev_pcb != NULL;
9143 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
9144 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
9145 struct xsocket_n *xso = (struct xsocket_n *)
9146 ADVANCE64(xk, sizeof(*xk));
9147 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
9148 ADVANCE64(xso, sizeof(*xso));
9149 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
9150 ADVANCE64(xsbrcv, sizeof(*xsbrcv));
9151 struct xsockstat_n *xsostats = (struct xsockstat_n *)
9152 ADVANCE64(xsbsnd, sizeof(*xsbsnd));
9153
9154 bzero(buf, item_size);
9155
9156 lck_mtx_lock(&ev_pcb->evp_mtx);
9157
9158 xk->kep_len = sizeof(struct xkevtpcb);
9159 xk->kep_kind = XSO_EVT;
9160 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRHASH(ev_pcb);
9161 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
9162 xk->kep_class_filter = ev_pcb->evp_class_filter;
9163 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
9164
9165 sotoxsocket_n(ev_pcb->evp_socket, xso);
9166 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9167 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
9168 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9169 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
9170 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
9171
9172 lck_mtx_unlock(&ev_pcb->evp_mtx);
9173
9174 error = SYSCTL_OUT(req, buf, item_size);
9175 }
9176
9177 if (error == 0) {
		/*
		 * Give the user an updated idea of our state.  If the
		 * generation count differs from the one reported above,
		 * something changed while we were processing this
		 * request, and the caller may need to retry.
		 */
9185 bzero(&xsg, sizeof(xsg));
9186 xsg.xg_len = sizeof(xsg);
9187 xsg.xg_count = n;
9188 xsg.xg_gen = kevtstat.kes_gencnt;
9189 xsg.xg_sogen = so_gencnt;
9190 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
9191 if (error) {
9192 goto done;
9193 }
9194 }
9195
9196 done:
9197 lck_rw_done(&kev_rwlock);
9198
9199 kfree_data(buf, item_size);
9200 return error;
9201 }
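
/*
 * Example (sketch): the two-step convention the handler above implements.
 * A NULL oldptr asks only for a size estimate (with ~12% slack for PCBs
 * created in the meantime); the caller then allocates and fetches.  The
 * MIB name "net.systm.kevt.pcblist" is an assumption based on recent xnu.
 *
 *	size_t len = 0;
 *	sysctlbyname("net.systm.kevt.pcblist", NULL, &len, NULL, 0);
 *
 *	void *buf = malloc(len);
 *	if (buf != NULL &&
 *	    sysctlbyname("net.systm.kevt.pcblist", buf, &len, NULL, 0) == 0) {
 *		// buf starts with a struct xsystmgen, followed by one
 *		// fixed-size record per PCB, and ends with a second
 *		// xsystmgen whose xg_gen can be compared with the first
 *		// to detect a race with PCB creation or teardown.
 *	}
 *	free(buf);
 */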
9202
9203 #endif /* SOCKETS */
9204
9205
9206 int
9207 fill_kqueueinfo(kqueue_t kqu, struct kqueue_info * kinfo)
9208 {
9209 struct vinfo_stat * st;
9210
9211 st = &kinfo->kq_stat;
9212
9213 st->vst_size = kqu.kq->kq_count;
9214 if (kqu.kq->kq_state & KQ_KEV_QOS) {
9215 st->vst_blksize = sizeof(struct kevent_qos_s);
9216 } else if (kqu.kq->kq_state & KQ_KEV64) {
9217 st->vst_blksize = sizeof(struct kevent64_s);
9218 } else {
9219 st->vst_blksize = sizeof(struct kevent);
9220 }
9221 st->vst_mode = S_IFIFO;
9222 st->vst_ino = (kqu.kq->kq_state & KQ_DYNAMIC) ?
9223 kqu.kqwl->kqwl_dynamicid : 0;
9224
9225 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
9226 #define PROC_KQUEUE_MASK (KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
9227 static_assert(PROC_KQUEUE_SLEEP == KQ_SLEEP);
9228 static_assert(PROC_KQUEUE_32 == KQ_KEV32);
9229 static_assert(PROC_KQUEUE_64 == KQ_KEV64);
9230 static_assert(PROC_KQUEUE_QOS == KQ_KEV_QOS);
9231 static_assert(PROC_KQUEUE_WORKQ == KQ_WORKQ);
9232 static_assert(PROC_KQUEUE_WORKLOOP == KQ_WORKLOOP);
9233 kinfo->kq_state = kqu.kq->kq_state & PROC_KQUEUE_MASK;
9234 if ((kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0) {
9235 if (kqu.kqf->kqf_sel.si_flags & SI_RECORDED) {
9236 kinfo->kq_state |= PROC_KQUEUE_SELECT;
9237 }
9238 }
9239
9240 return 0;
9241 }
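
/*
 * Example (sketch): fill_kqueueinfo() ultimately backs the libproc
 * PROC_PIDFDKQUEUEINFO flavor for kqueues that sit behind a file
 * descriptor.  The flavor and structure names are taken from
 * <sys/proc_info.h>/<libproc.h> as an assumption about current SDKs;
 * pid and kq_fd stand for the target process id and kqueue descriptor.
 *
 *	#include <libproc.h>
 *	#include <sys/proc_info.h>
 *
 *	struct kqueue_fdinfo kqfi;
 *	int n = proc_pidfdinfo(pid, kq_fd, PROC_PIDFDKQUEUEINFO,
 *	    &kqfi, sizeof(kqfi));
 *	if (n == (int)sizeof(kqfi)) {
 *		// kqfi.kqueueinfo.kq_stat.vst_size is the pending event
 *		// count; kqfi.kqueueinfo.kq_state carries PROC_KQUEUE_*.
 *	}
 */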
9242
9243 static int
9244 fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
9245 {
9246 workq_threadreq_t kqr = &kqwl->kqwl_request;
9247 workq_threadreq_param_t trp = {};
9248 int err;
9249
9250 if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
9251 return EINVAL;
9252 }
9253
9254 if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
9255 return err;
9256 }
9257
9258 kqlock(kqwl);
9259
9260 kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
9261 kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
9262 kqdi->kqdi_request_state = kqr->tr_state;
9263 kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
9264 kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
9265 kqdi->kqdi_sync_waiters = 0;
9266 kqdi->kqdi_sync_waiter_qos = 0;
9267
9268 trp.trp_value = kqwl->kqwl_params;
9269 if (trp.trp_flags & TRP_PRIORITY) {
9270 kqdi->kqdi_pri = trp.trp_pri;
9271 } else {
9272 kqdi->kqdi_pri = 0;
9273 }
9274
9275 if (trp.trp_flags & TRP_POLICY) {
9276 kqdi->kqdi_pol = trp.trp_pol;
9277 } else {
9278 kqdi->kqdi_pol = 0;
9279 }
9280
9281 if (trp.trp_flags & TRP_CPUPERCENT) {
9282 kqdi->kqdi_cpupercent = trp.trp_cpupercent;
9283 } else {
9284 kqdi->kqdi_cpupercent = 0;
9285 }
9286
9287 kqunlock(kqwl);
9288
9289 return 0;
9290 }
9291
9292
9293 static unsigned long
9294 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
9295 unsigned long buflen, unsigned long nknotes)
9296 {
9297 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
9298 if (kq == knote_get_kq(kn)) {
9299 if (nknotes < buflen) {
9300 struct kevent_extinfo *info = &buf[nknotes];
9301
9302 kqlock(kq);
9303
9304 if (knote_fops(kn)->f_sanitized_copyout) {
9305 knote_fops(kn)->f_sanitized_copyout(kn, &info->kqext_kev);
9306 } else {
9307 info->kqext_kev = *(struct kevent_qos_s *)&kn->kn_kevent;
9308 }
9309
9310 if (knote_has_qos(kn)) {
9311 info->kqext_kev.qos =
9312 _pthread_priority_thread_qos_fast(kn->kn_qos);
9313 } else {
9314 info->kqext_kev.qos = kn->kn_qos_override;
9315 }
9316 info->kqext_kev.filter |= 0xff00; /* sign extend filter */
9317 info->kqext_kev.xflags = 0; /* this is where sfflags lives */
9318 info->kqext_kev.data = 0; /* this is where sdata lives */
9319 info->kqext_sdata = kn->kn_sdata;
9320 info->kqext_status = kn->kn_status;
9321 info->kqext_sfflags = kn->kn_sfflags;
9322
9323 kqunlock(kq);
9324 }
9325
9326 /* we return total number of knotes, which may be more than requested */
9327 nknotes++;
9328 }
9329 }
9330
9331 return nknotes;
9332 }
9333
9334 int
9335 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
9336 int32_t *nkqueues_out)
9337 {
9338 proc_t p = (proc_t)proc;
9339 struct filedesc *fdp = &p->p_fd;
9340 unsigned int nkqueues = 0;
9341 unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
9342 size_t buflen, bufsize;
9343 kqueue_id_t *kq_ids = NULL;
9344 int err = 0;
9345
9346 assert(p != NULL);
9347
9348 if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
9349 err = EINVAL;
9350 goto out;
9351 }
9352
9353 buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
9354
9355 if (ubuflen != 0) {
9356 if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
9357 err = ERANGE;
9358 goto out;
9359 }
9360 kq_ids = (kqueue_id_t *)kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
9361 if (!kq_ids) {
9362 err = ENOMEM;
9363 goto out;
9364 }
9365 }
9366
9367 kqhash_lock(fdp);
9368
9369 u_long kqhashmask = fdp->fd_kqhashmask;
9370 if (kqhashmask > 0) {
9371 for (uint32_t i = 0; i < kqhashmask + 1; i++) {
9372 struct kqworkloop *kqwl;
9373
9374 LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9375 /* report the number of kqueues, even if they don't all fit */
9376 if (nkqueues < buflen) {
9377 kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
9378 }
9379 nkqueues++;
9380 }
9381
			/*
			 * Periodically drop and re-take the kqhash lock so
			 * that we don't hold it across the entire scan.
			 */
9385 kqhash_unlock(fdp);
9386 kqhash_lock(fdp);
9387
			/*
			 * Re-check the hash mask in case it changed while the
			 * lock was dropped; if it did, bail out and report the
			 * set of info captured so far.
			 */
9392 if (fdp->fd_kqhashmask != kqhashmask) {
9393 break;
9394 }
9395 }
9396 }
9397
9398 kqhash_unlock(fdp);
9399
9400 if (kq_ids) {
9401 size_t copysize;
		if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
9403 err = ERANGE;
9404 goto out;
9405 }
9406
9407 assert(ubufsize >= copysize);
9408 err = copyout(kq_ids, ubuf, copysize);
9409 }
9410
9411 out:
9412 if (kq_ids) {
9413 kfree_data(kq_ids, bufsize);
9414 }
9415
9416 if (!err) {
9417 *nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
9418 }
9419 return err;
9420 }
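
/*
 * Example (sketch): this routine services the dynamic-kqueue list flavor of
 * proc_info.  The exact libproc spelling below (PROC_PIDDYNKQUEUES_LIST via
 * proc_pidinfo()) is an assumption about current SDKs; the sizing behaviour
 * (total count reported even when the buffer is short, capped at
 * PROC_PIDDYNKQUEUES_MAX) follows directly from the code above.
 *
 *	#include <libproc.h>
 *	#include <sys/proc_info.h>
 *
 *	kqueue_id_t ids[64];
 *	int nkqs = proc_pidinfo(pid, PROC_PIDDYNKQUEUES_LIST, 0,
 *	    ids, sizeof(ids));
 *	// nkqs reports how many dynamic kqueues were found (capped at
 *	// PROC_PIDDYNKQUEUES_MAX); it may exceed what fit in ids[], in
 *	// which case the caller can retry with a larger buffer.
 */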
9421
9422 int
9423 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9424 uint32_t ubufsize, int32_t *size_out)
9425 {
9426 proc_t p = (proc_t)proc;
9427 struct kqworkloop *kqwl;
9428 int err = 0;
9429 struct kqueue_dyninfo kqdi = { };
9430
9431 assert(p != NULL);
9432
9433 if (ubufsize < sizeof(struct kqueue_info)) {
9434 return ENOBUFS;
9435 }
9436
9437 kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9438 if (!kqwl) {
9439 return ESRCH;
9440 }
9441
	/*
	 * Backward compatibility: allow the caller to pass a buffer that
	 * holds only a struct kqueue_info.
	 */
9446 if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
9447 ubufsize = sizeof(struct kqueue_dyninfo);
9448 err = fill_kqueue_dyninfo(kqwl, &kqdi);
9449 } else {
9450 ubufsize = sizeof(struct kqueue_info);
9451 err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
9452 }
9453 if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
9454 *size_out = ubufsize;
9455 }
9456 kqworkloop_release(kqwl);
9457 return err;
9458 }
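
/*
 * Example (sketch): a hedged guess at the userspace entry point for the
 * routine above.  Recent SDKs appear to expose proc_piddynkqueueinfo() with
 * a PROC_PIDDYNKQUEUE_INFO flavor; treat both names as assumptions.
 * Passing a buffer the size of struct kqueue_info selects the
 * backward-compatible path; a struct kqueue_dyninfo buffer returns the
 * workloop details as well.
 *
 *	struct kqueue_dyninfo kqdi;
 *	int n = proc_piddynkqueueinfo(pid, PROC_PIDDYNKQUEUE_INFO,
 *	    kq_id, &kqdi, sizeof(kqdi));
 *	if (n >= (int)sizeof(struct kqueue_dyninfo)) {
 *		// kqdi.kqdi_servicer / kqdi.kqdi_owner are thread ids;
 *		// kqdi.kqdi_pri and kqdi.kqdi_pol reflect the TRP_* params.
 *	}
 */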
9459
9460 int
9461 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9462 uint32_t ubufsize, int32_t *nknotes_out)
9463 {
9464 proc_t p = (proc_t)proc;
9465 struct kqworkloop *kqwl;
9466 int err;
9467
9468 kqwl = kqworkloop_hash_lookup_and_retain(&p->p_fd, kq_id);
9469 if (!kqwl) {
9470 return ESRCH;
9471 }
9472
9473 err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
9474 kqworkloop_release(kqwl);
9475 return err;
9476 }
9477
9478 int
9479 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
9480 uint32_t bufsize, int32_t *retval)
9481 {
9482 struct knote *kn;
9483 int i;
9484 int err = 0;
9485 struct filedesc *fdp = &p->p_fd;
9486 unsigned long nknotes = 0;
9487 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
9488 struct kevent_extinfo *kqext = NULL;
9489
9490 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
9491 buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
9492
9493 kqext = (struct kevent_extinfo *)kalloc_data(buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
9494 if (kqext == NULL) {
9495 err = ENOMEM;
9496 goto out;
9497 }
9498
9499 proc_fdlock(p);
9500 u_long fd_knlistsize = fdp->fd_knlistsize;
9501 struct klist *fd_knlist = fdp->fd_knlist;
9502
9503 for (i = 0; i < fd_knlistsize; i++) {
9504 kn = SLIST_FIRST(&fd_knlist[i]);
9505 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9506
9507 proc_fdunlock(p);
9508 proc_fdlock(p);
		/*
		 * Re-check fd_knlistsize in case it changed while the lock was
		 * dropped.  If the list grew (kq_add_knote), keep scanning the
		 * size we started with but pick up the possibly-reallocated
		 * array; if it dropped to zero (knotes_dealloc), bail out with
		 * the set of info captured so far.
		 */
9515 if (fd_knlistsize != fdp->fd_knlistsize) {
9516 if (fdp->fd_knlistsize) {
9517 /* kq_add_knote might grow fdp->fd_knlist. */
9518 fd_knlist = fdp->fd_knlist;
9519 } else {
9520 break;
9521 }
9522 }
9523 }
9524 proc_fdunlock(p);
9525
9526 knhash_lock(fdp);
9527 u_long knhashmask = fdp->fd_knhashmask;
9528
9529 if (knhashmask != 0) {
9530 for (i = 0; i < (int)knhashmask + 1; i++) {
9531 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
9532 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9533
9534 knhash_unlock(fdp);
9535 knhash_lock(fdp);
9536
			/*
			 * Re-check the hash mask in case it changed while the
			 * lock was dropped; if it did, bail out and report the
			 * set of info captured so far.
			 */
9541 if (fdp->fd_knhashmask != knhashmask) {
9542 break;
9543 }
9544 }
9545 }
9546 knhash_unlock(fdp);
9547
9548 assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9549 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
9550
9551 out:
9552 kfree_data(kqext, buflen * sizeof(struct kevent_extinfo));
9553
9554 if (!err) {
9555 *retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
9556 }
9557 return err;
9558 }
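
/*
 * Example (sketch): the count-then-copy convention above means a caller may
 * learn that more knotes exist than its buffer could hold and retry.  The
 * PROC_PIDFDKQUEUE_EXTINFO flavor name is an assumption about the proc_info
 * interface that reaches this function; pid and kq_fd are placeholders.
 *
 *	struct kevent_extinfo info[64];
 *	int total = proc_pidfdinfo(pid, kq_fd, PROC_PIDFDKQUEUE_EXTINFO,
 *	    info, (int)sizeof(info));
 *	// Per *retval above, 'total' is the number of knotes attached to
 *	// the kqueue (capped at PROC_PIDFDKQUEUE_KNOTES_MAX); if it is
 *	// larger than 64, only the first 64 entries were copied out and
 *	// the caller can retry with a bigger buffer.
 */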
9559
9560 static unsigned int
9561 klist_copy_udata(struct klist *list, uint64_t *buf,
9562 unsigned int buflen, unsigned int nknotes)
9563 {
9564 struct knote *kn;
9565 SLIST_FOREACH(kn, list, kn_link) {
9566 if (nknotes < buflen) {
9567 /*
9568 * kevent_register will always set kn_udata atomically
9569 * so that we don't have to take any kqlock here.
9570 */
9571 buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9572 }
9573 /* we return total number of knotes, which may be more than requested */
9574 nknotes++;
9575 }
9576
9577 return nknotes;
9578 }
9579
9580 int
9581 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
9582 {
9583 proc_t p = (proc_t)proc;
9584 struct filedesc *fdp = &p->p_fd;
9585 unsigned int nuptrs = 0;
9586 unsigned int buflen = bufsize / sizeof(uint64_t);
9587 struct kqworkloop *kqwl;
9588 u_long size = 0;
9589 struct klist *fd_knlist = NULL;
9590
9591 if (buflen > 0) {
9592 assert(buf != NULL);
9593 }
9594
	/*
	 * Copy out as many uptrs as possible, but drop and re-take the
	 * respective locks periodically so that we don't blow through
	 * preemption-disabled timeouts.  After each re-lock, check whether we
	 * raced with somebody resizing the table: the fd_knlist scan keeps
	 * going against the size we started with (picking up a reallocated
	 * array) unless the list was deallocated, while the hash scans simply
	 * bail out with the set of info captured so far.
	 */
9603 proc_fdlock(p);
9604 size = fdp->fd_knlistsize;
9605 fd_knlist = fdp->fd_knlist;
9606
9607 for (int i = 0; i < size; i++) {
9608 nuptrs = klist_copy_udata(&fd_knlist[i], buf, buflen, nuptrs);
9609
9610 proc_fdunlock(p);
9611 proc_fdlock(p);
9612 if (size != fdp->fd_knlistsize) {
9613 if (fdp->fd_knlistsize) {
9614 /* kq_add_knote might grow fdp->fd_knlist. */
9615 fd_knlist = fdp->fd_knlist;
9616 } else {
9617 break;
9618 }
9619 }
9620 }
9621 proc_fdunlock(p);
9622
9623 knhash_lock(fdp);
9624 size = fdp->fd_knhashmask;
9625
9626 if (size != 0) {
9627 for (size_t i = 0; i < size + 1; i++) {
9628 nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
9629
9630 knhash_unlock(fdp);
9631 knhash_lock(fdp);
9632 /* The only path that can interleave with us today is knotes_dealloc. */
9633 if (size != fdp->fd_knhashmask) {
9634 break;
9635 }
9636 }
9637 }
9638 knhash_unlock(fdp);
9639
9640 kqhash_lock(fdp);
9641 size = fdp->fd_kqhashmask;
9642
9643 if (size != 0) {
9644 for (size_t i = 0; i < size + 1; i++) {
9645 LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9646 if (nuptrs < buflen) {
9647 buf[nuptrs] = kqwl->kqwl_dynamicid;
9648 }
9649 nuptrs++;
9650 }
9651
9652 kqhash_unlock(fdp);
9653 kqhash_lock(fdp);
9654 if (size != fdp->fd_kqhashmask) {
9655 break;
9656 }
9657 }
9658 }
9659 kqhash_unlock(fdp);
9660
9661 return (int)nuptrs;
9662 }
9663
9664 static void
9665 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9666 {
9667 uint64_t ast_addr;
9668 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9669 size_t user_addr_size = proc_is_64bit ? 8 : 4;
9670 uint32_t ast_flags32 = 0;
9671 uint64_t ast_flags64 = 0;
9672 struct uthread *ut = get_bsdthread_info(thread);
9673
9674 if (ut->uu_kqr_bound != NULL) {
9675 ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9676 }
9677
9678 if (ast_flags64 == 0) {
9679 return;
9680 }
9681
9682 if (!(p->p_flag & P_LP64)) {
9683 ast_flags32 = (uint32_t)ast_flags64;
9684 assert(ast_flags64 < 0x100000000ull);
9685 }
9686
9687 ast_addr = thread_rettokern_addr(thread);
9688 if (ast_addr == 0) {
9689 return;
9690 }
9691
9692 if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9693 (user_addr_t)ast_addr,
9694 user_addr_size) != 0) {
9695 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9696 "ast_addr = %llu\n", proc_getpid(p), thread_tid(current_thread()), ast_addr);
9697 }
9698 }
9699
/*
 * Semantics of the workq quantum TSD value:
 *
 * 1. It is written by the kernel and cleared by userspace.
 * 2. When userspace clears the TSD field, it takes responsibility for acting
 *    on the quantum expiry conveyed by the kernel.
 * 3. The TSD value is always cleared upon entry into userspace and upon return
 *    from userspace to the kernel, so that it is never leaked across thread
 *    requests.
 */
9710 void
9711 kevent_set_workq_quantum_expiry_user_tsd(proc_t p, thread_t thread,
9712 uint64_t flags)
9713 {
9714 uint64_t ast_addr;
9715 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9716 uint32_t ast_flags32 = 0;
9717 uint64_t ast_flags64 = flags;
9718
9719 if (ast_flags64 == 0) {
9720 return;
9721 }
9722
9723 if (!(p->p_flag & P_LP64)) {
9724 ast_flags32 = (uint32_t)ast_flags64;
9725 assert(ast_flags64 < 0x100000000ull);
9726 }
9727
9728 ast_addr = thread_wqquantum_addr(thread);
9729 assert(ast_addr != 0);
9730
9731 if (proc_is_64bit) {
9732 if (copyout_atomic64(ast_flags64, (user_addr_t) ast_addr)) {
9733 #if DEBUG || DEVELOPMENT
9734 printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9735 "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9736 #endif
9737 }
9738 } else {
9739 if (copyout_atomic32(ast_flags32, (user_addr_t) ast_addr)) {
9740 #if DEBUG || DEVELOPMENT
9741 printf("pid %d (tid:%llu): copyout of workq quantum ast flags failed with "
9742 "ast_addr = %llu\n", proc_getpid(p), thread_tid(thread), ast_addr);
9743 #endif
9744 }
9745 }
9746 }
9747
9748 void
9749 kevent_ast(thread_t thread, uint16_t bits)
9750 {
9751 proc_t p = current_proc();
9752
9753
9754 if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9755 workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
9756 }
9757 if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9758 kevent_set_return_to_kernel_user_tsd(p, thread);
9759 }
9760
9761 if (bits & AST_KEVENT_WORKQ_QUANTUM_EXPIRED) {
9762 workq_kern_quantum_expiry_reevaluate(p, thread);
9763 }
9764 }
9765
9766 #if DEVELOPMENT || DEBUG
9767
9768 #define KEVENT_SYSCTL_BOUND_ID 1
9769
9770 static int
9771 kevent_sysctl SYSCTL_HANDLER_ARGS
9772 {
9773 #pragma unused(oidp, arg2)
9774 uintptr_t type = (uintptr_t)arg1;
9775 uint64_t bound_id = 0;
9776
9777 if (type != KEVENT_SYSCTL_BOUND_ID) {
9778 return EINVAL;
9779 }
9780
9781 if (req->newptr) {
9782 return EINVAL;
9783 }
9784
9785 struct uthread *ut = current_uthread();
9786 if (!ut) {
9787 return EFAULT;
9788 }
9789
9790 workq_threadreq_t kqr = ut->uu_kqr_bound;
9791 if (kqr) {
9792 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9793 bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9794 } else {
9795 bound_id = -1;
9796 }
9797 }
9798
9799 return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9800 }
9801
9802 SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
9803 "kevent information");
9804
9805 SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9806 CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
9807 (void *)KEVENT_SYSCTL_BOUND_ID,
9808 sizeof(kqueue_id_t), kevent_sysctl, "Q",
9809 "get the ID of the bound kqueue");
9810
9811 #endif /* DEVELOPMENT || DEBUG */
9812