xref: /xnu-11215/osfmk/kern/lock_mtx.c (revision 4f1223e8)
1 /*
2  * Copyright (c) 2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #define LOCK_PRIVATE 1
30 
31 #include <mach_ldebug.h>
32 #include <kern/locks_internal.h>
33 #include <kern/lock_stat.h>
34 #include <kern/locks.h>
35 #include <kern/kalloc.h>
36 #include <kern/thread.h>
37 
38 #include <mach/machine/sdt.h>
39 
40 #include <machine/cpu_data.h>
41 #include <machine/machine_cpu.h>
42 
43 #if !LCK_MTX_USE_ARCH
44 
45 /*
46  * lck_mtx_t
47  * ~~~~~~~~~
48  *
49  * Kernel mutexes in this implementation are made of four 32 bits words:
50  *
51  *   - word 0: turnstile compact ID (24 bits) and the 0x22 lock tag
52  *   - word 1: padding (to be used for group compact IDs)
 *   - word 2: mutex state (lock owner + interlock, spin and waiters bits),
 *             referred to as "data" in the code.
55  *   - word 3: adaptive spin and interlock MCS queue tails.
56  *
 * The 64-bit word made of the last two words is referred to
 * as the "mutex state" in the code.
59  *
60  *
61  * Core serialization rules
62  * ~~~~~~~~~~~~~~~~~~~~~~~~
63  *
64  * The mutex has a bit (lck_mtx_t::lck_mtx.ilocked or bit LCK_MTX_ILOCK
65  * of the data word) that serves as a spinlock for the mutex state.
66  *
67  *
68  * Updating the lock fields must follow the following rules:
69  *
70  *   - It is ok to "steal" the mutex (updating its data field) if no one
71  *     holds the interlock.
72  *
73  *   - Holding the interlock allows its holder to update the first 3 words
74  *     of the kernel mutex without using RMW atomics (plain stores are OK).
75  *
76  *   - Holding the interlock is required for a thread to remove itself
77  *     from the adaptive spin queue.
78  *
79  *   - Threads can enqueue themselves onto the adaptive spin wait queue
80  *     or the interlock wait queue at any time.
81  *
82  *
83  * Waiters bit and turnstiles
84  * ~~~~~~~~~~~~~~~~~~~~~~~~~~
85  *
86  * The turnstile on a kernel mutex is set by waiters, and cleared
87  * once they have all been resumed and successfully acquired the lock.
88  *
89  * LCK_MTX_NEEDS_WAKEUP being set (always with an owner set too)
90  * forces threads to the lck_mtx_unlock slowpath,
91  * in order to evaluate whether lck_mtx_unlock_wakeup() must be called.
92  *
93  * As a result it means it really only needs to be set at select times:
94  *
95  *   - when a thread blocks and "snitches" on the current thread owner,
96  *     so that when that thread unlocks it calls wake up,
97  *
98  *   - when a thread that was woken up resumes its work and became
99  *     the inheritor.
100  */
101 
/* NOTE(review): not referenced in the part of the file reviewed here -- confirm it is still needed. */
#define ADAPTIVE_SPIN_ENABLE 0x1

/* Shorthand for the compiler attribute that forbids inlining a function. */
#define NOINLINE                __attribute__((noinline))

/* Wait event of a mutex: the address of its "data" word, cast to an event64. */
#define LCK_MTX_EVENT(lck)      CAST_EVENT64_T(&(lck)->lck_mtx.data)

/* Inverse of LCK_MTX_EVENT(): recover the mutex from the address of its "data" word. */
#define LCK_EVENT_TO_MUTEX(e)   __container_of((uint32_t *)(e), lck_mtx_t, lck_mtx.data)

/* Non-zero when LCK_MTX_NEEDS_WAKEUP is set in the "data" word of the mutex. */
#define LCK_MTX_HAS_WAITERS(l)  ((l)->lck_mtx.data & LCK_MTX_NEEDS_WAKEUP)

#if DEVELOPMENT || DEBUG
/* When set ("-disable_mtx_chk"), lck_mtx_check_preemption() never panics. */
TUNABLE(bool, LckDisablePreemptCheck, "-disable_mtx_chk", false);
#endif /* DEVELOPMENT || DEBUG */

extern unsigned int not_in_kdp;

#if CONFIG_SPTM
/* Read by lck_mtx_check_preemption() to skip the check once a panic was triggered. */
extern const bool * sptm_xnu_triggered_panic_ptr;
#endif /* CONFIG_SPTM */

/* Typed allocation zone backing lck_mtx_alloc_init() / lck_mtx_free(). */
KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);

/* Owner value of a mutex that nobody owns. */
#define LCK_MTX_NULL_CTID       0x00000000u
122 
123 __enum_decl(lck_mtx_mode_t, uint32_t, {
124 	LCK_MTX_MODE_SLEEPABLE,
125 	LCK_MTX_MODE_SPIN,
126 	LCK_MTX_MODE_SPIN_ALWAYS,
127 });
128 
129 __enum_decl(lck_ilk_mode_t, uint32_t, {
130 	LCK_ILK_MODE_UNLOCK,
131 	LCK_ILK_MODE_DIRECT,
132 	LCK_ILK_MODE_FROM_AS,
133 });
134 
135 static inline void
lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)136 lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)
137 {
138 	*mcs = (struct lck_mtx_mcs){ };
139 }
140 
141 static inline lck_mcs_id_t
lck_mtx_get_mcs_id(void)142 lck_mtx_get_mcs_id(void)
143 {
144 	return lck_mcs_id_current(LCK_MCS_SLOT_0);
145 }
146 
147 __pure2
148 static inline lck_mtx_mcs_t
lck_mtx_get_mcs(lck_mcs_id_t idx)149 lck_mtx_get_mcs(lck_mcs_id_t idx)
150 {
151 	return &lck_mcs_get_other(idx)->mcs_mtx;
152 }
153 
154 
155 #pragma mark lck_mtx_t: validation
156 
157 __abortlike
158 static void
__lck_mtx_invalid_panic(lck_mtx_t * lck)159 __lck_mtx_invalid_panic(lck_mtx_t *lck)
160 {
161 	panic("Invalid/destroyed mutex %p: "
162 	    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
163 	    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
164 	    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
165 	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
166 }
167 
168 __abortlike
169 static void
__lck_mtx_not_owned_panic(lck_mtx_t * lock,thread_t thread)170 __lck_mtx_not_owned_panic(lck_mtx_t *lock, thread_t thread)
171 {
172 	panic("Mutex %p is unexpectedly not owned by thread %p", lock, thread);
173 }
174 
175 __abortlike
176 static void
__lck_mtx_owned_panic(lck_mtx_t * lock,thread_t thread)177 __lck_mtx_owned_panic(lck_mtx_t *lock, thread_t thread)
178 {
179 	panic("Mutex %p is unexpectedly owned by thread %p", lock, thread);
180 }
181 
182 __abortlike
183 static void
__lck_mtx_lock_is_sleepable_panic(lck_mtx_t * lck)184 __lck_mtx_lock_is_sleepable_panic(lck_mtx_t *lck)
185 {
186 	// "Always" variants can never block. If the lock is held as a normal mutex
187 	// then someone is mixing always and non-always calls on the same lock, which is
188 	// forbidden.
189 	panic("Mutex %p is held as a full-mutex (spin-always lock attempted)", lck);
190 }
191 
192 #if DEVELOPMENT || DEBUG
193 __abortlike
194 static void
__lck_mtx_preemption_disabled_panic(lck_mtx_t * lck,int expected)195 __lck_mtx_preemption_disabled_panic(lck_mtx_t *lck, int expected)
196 {
197 	panic("Attempt to take mutex %p with preemption disabled (%d)",
198 	    lck, get_preemption_level() - expected);
199 }
200 
201 __abortlike
202 static void
__lck_mtx_at_irq_panic(lck_mtx_t * lck)203 __lck_mtx_at_irq_panic(lck_mtx_t *lck)
204 {
205 	panic("Attempt to take mutex %p in IRQ context", lck);
206 }
207 
208 /*
209  *	Routine:	lck_mtx_check_preemption
210  *
211  *	Verify preemption is enabled when attempting to acquire a mutex.
212  */
213 static inline void
lck_mtx_check_preemption(lck_mtx_t * lock,thread_t thread,int expected)214 lck_mtx_check_preemption(lck_mtx_t *lock, thread_t thread, int expected)
215 {
216 #pragma unused(thread)
217 	if (lock_preemption_level_for_thread(thread) == expected) {
218 		return;
219 	}
220 	if (LckDisablePreemptCheck) {
221 		return;
222 	}
223 	if (current_cpu_datap()->cpu_hibernate) {
224 		return;
225 	}
226 	if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
227 		return;
228 	}
229 #if CONFIG_SPTM
230 	/*
231 	 * If a panic has been initiated on SPTM devices, preemption was disabled by sleh,
232 	 * but platform callbacks could be acquiring mutexes
233 	 */
234 	if (*sptm_xnu_triggered_panic_ptr) {
235 		return;
236 	}
237 #endif
238 	__lck_mtx_preemption_disabled_panic(lock, expected);
239 }
240 
241 static inline void
lck_mtx_check_irq(lck_mtx_t * lock)242 lck_mtx_check_irq(lck_mtx_t *lock)
243 {
244 	if (ml_at_interrupt_context()) {
245 		__lck_mtx_at_irq_panic(lock);
246 	}
247 }
248 
249 #define LCK_MTX_SNIFF_PREEMPTION(thread)   lock_preemption_level_for_thread(thread)
250 #define LCK_MTX_CHECK_INVARIANTS           1
251 #else
252 #define lck_mtx_check_irq(lck)             ((void)0)
253 #define LCK_MTX_SNIFF_PREEMPTION(thread)   0
254 #define LCK_MTX_CHECK_INVARIANTS           0
255 #endif /* !DEVELOPMENT && !DEBUG */
256 
257 #if CONFIG_DTRACE
258 #define LCK_MTX_SNIFF_DTRACE()             lck_debug_state.lds_value
259 #else
260 #define LCK_MTX_SNIFF_DTRACE()             0
261 #endif
262 
263 
264 #pragma mark lck_mtx_t: alloc/init/destroy/free
265 
266 lck_mtx_t *
lck_mtx_alloc_init(lck_grp_t * grp,lck_attr_t * attr)267 lck_mtx_alloc_init(lck_grp_t *grp, lck_attr_t *attr)
268 {
269 	lck_mtx_t      *lck;
270 
271 	lck = zalloc(KT_LCK_MTX);
272 	lck_mtx_init(lck, grp, attr);
273 	return lck;
274 }
275 
276 void
lck_mtx_free(lck_mtx_t * lck,lck_grp_t * grp)277 lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp)
278 {
279 	lck_mtx_destroy(lck, grp);
280 	zfree(KT_LCK_MTX, lck);
281 }
282 
283 void
lck_mtx_init(lck_mtx_t * lck,lck_grp_t * grp,lck_attr_t * attr)284 lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr)
285 {
286 	if (attr == LCK_ATTR_NULL) {
287 		attr = &lck_attr_default;
288 	}
289 
290 	*lck = (lck_mtx_t){
291 		.lck_mtx_type = LCK_TYPE_MUTEX,
292 		.lck_mtx_grp  = grp->lck_grp_attr_id,
293 	};
294 	if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
295 		lck->lck_mtx.data |= LCK_MTX_PROFILE;
296 	}
297 
298 	lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
299 }
300 
301 void
lck_mtx_destroy(lck_mtx_t * lck,lck_grp_t * grp)302 lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp)
303 {
304 	if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) {
305 		panic("Mutex to destroy still has waiters: %p: "
306 		    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
307 		    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
308 		    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
309 		    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
310 	}
311 	if (lck->lck_mtx_type != LCK_TYPE_MUTEX ||
312 	    (lck->lck_mtx.data & ~LCK_MTX_PROFILE) ||
313 	    lck->lck_mtx.as_tail || lck->lck_mtx.ilk_tail) {
314 		__lck_mtx_invalid_panic(lck);
315 	}
316 	LCK_GRP_ASSERT_ID(grp, lck->lck_mtx_grp);
317 	lck->lck_mtx_type = LCK_TYPE_NONE;
318 	lck->lck_mtx.data = LCK_MTX_TAG_DESTROYED;
319 	lck->lck_mtx_grp      = 0;
320 	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
321 }
322 
323 
324 #pragma mark lck_mtx_t: lck_mtx_ilk*
325 
326 static hw_spin_timeout_status_t
lck_mtx_ilk_timeout_panic(void * _lock,hw_spin_timeout_t to,hw_spin_state_t st)327 lck_mtx_ilk_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
328 {
329 	lck_mtx_t *lck = _lock;
330 
331 	panic("Mutex interlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
332 	    "current owner: %p, "
333 	    "<0x%06x 0x%02x 0x%08x 0x%08x 0x%04x 0x%04x>, "
334 	    HW_SPIN_TIMEOUT_DETAILS_FMT,
335 	    lck, HW_SPIN_TIMEOUT_ARG(to, st),
336 	    ctid_get_thread_unsafe(lck->lck_mtx.owner),
337 	    lck->lck_mtx_tsid, lck->lck_mtx_type,
338 	    lck->lck_mtx_grp, lck->lck_mtx.data,
339 	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail,
340 	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
341 }
342 
343 static const struct hw_spin_policy lck_mtx_ilk_timeout_policy = {
344 	.hwsp_name              = "lck_mtx_t (ilk)",
345 	.hwsp_timeout_atomic    = &lock_panic_timeout,
346 	.hwsp_op_timeout        = lck_mtx_ilk_timeout_panic,
347 };
348 
349 static void
lck_mtx_ilk_lock_cleanup_as_mcs(lck_mtx_t * lock,lck_mcs_id_t idx,lck_mtx_mcs_t mcs,hw_spin_timeout_t to,hw_spin_state_t * ss)350 lck_mtx_ilk_lock_cleanup_as_mcs(
351 	lck_mtx_t               *lock,
352 	lck_mcs_id_t             idx,
353 	lck_mtx_mcs_t            mcs,
354 	hw_spin_timeout_t        to,
355 	hw_spin_state_t         *ss)
356 {
357 	lck_mtx_mcs_t nnode = NULL;
358 	lck_mcs_id_t  pidx  = (lck_mcs_id_t)mcs->lmm_as_prev;
359 	bool          was_last;
360 
361 	/*
362 	 *	This is called when the thread made use
363 	 *	of the adaptive spin queue and needs
364 	 *	to remove itself from it.
365 	 */
366 
367 	/*
368 	 *	If the thread is last, set the tail to the node before us.
369 	 */
370 	was_last = lock_cmpxchg(&lock->lck_mtx.as_tail, idx, pidx, release);
371 
372 	if (was_last) {
373 		/*
374 		 *	If @c mcs was last, we need to erase the previous
375 		 *	node link to it.
376 		 *
377 		 *	However, new nodes could have now taken our place
378 		 *	and set the previous node's @c lmm_as_next field
379 		 *	already, so we must CAS rather than blindly set.
380 		 *
381 		 *	We know the previous node is stable because
382 		 *	we hold the interlock (preventing concurrent
383 		 *	removals).
384 		 */
385 		if (pidx) {
386 			os_atomic_cmpxchg(&lck_mtx_get_mcs(pidx)->lmm_as_next,
387 			    mcs, nnode, relaxed);
388 		}
389 	} else {
390 		/*
391 		 *	If @c mcs wasn't last, then wait to make sure
392 		 *	we observe @c lmm_as_next. Once we do, we know
393 		 *	the field is stable since we hold the interlock
394 		 *	(preventing concurrent dequeues).
395 		 *
396 		 *	We can then update it to @c mcs next node index
397 		 *	(which is also stable for similar reasons).
398 		 *
399 		 *	Lastly update the previous node @c lmm_as_next
400 		 *	field as well to terminate the dequeue.
401 		 */
402 		while (!hw_spin_wait_until(&mcs->lmm_as_next, nnode, nnode)) {
403 			hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
404 			hw_spin_should_keep_spinning(lock, pol, to, ss);
405 		}
406 
407 		os_atomic_store(&nnode->lmm_as_prev, pidx, relaxed);
408 		if (pidx) {
409 			os_atomic_store(&lck_mtx_get_mcs(pidx)->lmm_as_next,
410 			    nnode, relaxed);
411 		}
412 	}
413 
414 	/*
415 	 *	@c mcs's fields are left dangling,
416 	 *	it is the responsibilty of the caller
417 	 *	to terminate the cleanup.
418 	 */
419 }
420 
421 static NOINLINE void
lck_mtx_ilk_lock_contended(lck_mtx_t * lock,lck_mtx_state_t state,lck_ilk_mode_t mode)422 lck_mtx_ilk_lock_contended(
423 	lck_mtx_t              *lock,
424 	lck_mtx_state_t         state,
425 	lck_ilk_mode_t          mode)
426 {
427 	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
428 	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
429 	hw_spin_state_t   ss  = { };
430 
431 	lck_mtx_mcs_t     mcs, nnode, pnode;
432 	lck_mcs_id_t      idx, pidx;
433 	lck_mtx_state_t   nstate;
434 	unsigned long     ready;
435 	uint64_t          spin_start;
436 
437 	/*
438 	 *	Take a spot in the interlock MCS queue,
439 	 *	and then spin until we're at the head of it.
440 	 */
441 
442 	idx  = lck_mtx_get_mcs_id();
443 	mcs  = &lck_mcs_get_current()->mcs_mtx;
444 	if (mode != LCK_MTX_MODE_SPIN) {
445 		spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
446 	}
447 
448 	mcs->lmm_ilk_current = lock;
449 	pidx = os_atomic_xchg(&lock->lck_mtx.ilk_tail, idx, release);
450 	if (pidx) {
451 		pnode = lck_mtx_get_mcs(pidx);
452 		os_atomic_store(&pnode->lmm_ilk_next, mcs, relaxed);
453 
454 		while (!hw_spin_wait_until(&mcs->lmm_ilk_ready, ready, ready)) {
455 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
456 		}
457 	}
458 
459 
460 	/*
461 	 *	We're now the first in line, wait for the interlock
462 	 *	to look ready and take it.
463 	 *
464 	 *	We can't just assume the lock is ours for the taking,
465 	 *	because the fastpath of lck_mtx_lock_spin{,_always}
466 	 *	only look at the mutex "data" and might steal it.
467 	 *
468 	 *	Also clear the interlock MCS tail if @c mcs is last.
469 	 */
470 	do {
471 		while (!hw_spin_wait_until(&lock->lck_mtx.val,
472 		    state.val, state.ilocked == 0)) {
473 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
474 		}
475 
476 		nstate = state;
477 		nstate.ilocked = 1;
478 		if (nstate.ilk_tail == idx) {
479 			nstate.ilk_tail = 0;
480 		}
481 	} while (!os_atomic_cmpxchg(&lock->lck_mtx, state, nstate, acquire));
482 
483 
484 	/*
485 	 *	We now have the interlock, let's cleanup the MCS state.
486 	 *
487 	 *	First, if there is a node after us, notify that it
488 	 *	is at the head of the interlock queue.
489 	 *
490 	 *	Second, perform the adaptive spin MCS cleanup if needed.
491 	 *
492 	 *	Lastly, clear the MCS node.
493 	 */
494 	if (state.ilk_tail != idx) {
495 		while (!hw_spin_wait_until(&mcs->lmm_ilk_next, nnode, nnode)) {
496 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
497 		}
498 
499 		os_atomic_store(&nnode->lmm_ilk_ready, 1, relaxed);
500 	}
501 
502 	if (mode == LCK_ILK_MODE_FROM_AS) {
503 		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
504 	}
505 	lck_mtx_mcs_clear(mcs);
506 
507 	if (mode != LCK_MTX_MODE_SPIN) {
508 		LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
509 	}
510 }
511 
512 static void
lck_mtx_ilk_lock_nopreempt(lck_mtx_t * lock,lck_ilk_mode_t mode)513 lck_mtx_ilk_lock_nopreempt(lck_mtx_t *lock, lck_ilk_mode_t mode)
514 {
515 	lck_mtx_state_t state, nstate;
516 
517 	os_atomic_rmw_loop(&lock->lck_mtx.val, state.val, nstate.val, acquire, {
518 		if (__improbable(state.ilocked || state.ilk_tail)) {
519 		        os_atomic_rmw_loop_give_up({
520 				return lck_mtx_ilk_lock_contended(lock, state, mode);
521 			});
522 		}
523 
524 		nstate = state;
525 		nstate.ilocked = true;
526 	});
527 }
528 
529 static void
lck_mtx_ilk_unlock_v(lck_mtx_t * lock,uint32_t data)530 lck_mtx_ilk_unlock_v(lck_mtx_t *lock, uint32_t data)
531 {
532 	os_atomic_store(&lock->lck_mtx.data, data, release);
533 	lock_enable_preemption();
534 }
535 
536 static void
lck_mtx_ilk_unlock(lck_mtx_t * lock)537 lck_mtx_ilk_unlock(lck_mtx_t *lock)
538 {
539 	lck_mtx_ilk_unlock_v(lock, lock->lck_mtx.data & ~LCK_MTX_ILOCK);
540 }
541 
542 
543 #pragma mark lck_mtx_t: turnstile integration
544 
545 /*
546  * Routine: lck_mtx_lock_wait
547  *
548  * Invoked in order to wait on contention.
549  *
550  * Called with the interlock locked and
551  * returns it unlocked.
552  *
553  * Always aggressively sets the owning thread to promoted,
554  * even if it's the same or higher priority
555  * This prevents it from lowering its own priority while holding a lock
556  *
557  * TODO: Come up with a more efficient way to handle same-priority promotions
558  *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
559  */
560 static struct turnstile *
lck_mtx_lock_wait(lck_mtx_t * lck,thread_t self,thread_t holder,struct turnstile * ts)561 lck_mtx_lock_wait(
562 	lck_mtx_t              *lck,
563 	thread_t                self,
564 	thread_t                holder,
565 	struct turnstile       *ts)
566 {
567 	uint64_t sleep_start = LCK_MTX_BLOCK_BEGIN();
568 
569 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
570 	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(self), 0, 0, 0);
571 
572 	if (ts == TURNSTILE_NULL) {
573 		ts = turnstile_prepare_compact_id((uintptr_t)lck,
574 		    lck->lck_mtx_tsid, TURNSTILE_KERNEL_MUTEX);
575 		if (lck->lck_mtx_tsid == 0) {
576 			lck->lck_mtx_tsid = ts->ts_compact_id;
577 		}
578 	}
579 	assert3u(ts->ts_compact_id, ==, lck->lck_mtx_tsid);
580 
581 	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
582 	turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
583 
584 	waitq_assert_wait64(&ts->ts_waitq, LCK_MTX_EVENT(lck),
585 	    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
586 
587 	lck_mtx_ilk_unlock(lck);
588 
589 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
590 
591 	thread_block(THREAD_CONTINUE_NULL);
592 
593 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
594 
595 	LCK_MTX_BLOCK_END(lck, lck->lck_mtx_grp, sleep_start);
596 
597 	return ts;
598 }
599 
600 static void
lck_mtx_lock_wait_done(lck_mtx_t * lck,struct turnstile * ts)601 lck_mtx_lock_wait_done(lck_mtx_t *lck, struct turnstile  *ts)
602 {
603 	if (turnstile_complete_compact_id((uintptr_t)lck, ts,
604 	    TURNSTILE_KERNEL_MUTEX)) {
605 		lck->lck_mtx_tsid = 0;
606 	}
607 }
608 
609 /*
610  * Routine:     lck_mtx_lock_will_need_wakeup
611  *
612  * Returns whether the thread is the current turnstile inheritor,
613  * which means it will have to call lck_mtx_unlock_wakeup()
614  * on unlock.
615  */
616 __attribute__((always_inline))
617 static bool
lck_mtx_lock_will_need_wakeup(lck_mtx_t * lck,thread_t self)618 lck_mtx_lock_will_need_wakeup(lck_mtx_t *lck, thread_t  self)
619 {
620 	uint32_t tsid = lck->lck_mtx_tsid;
621 
622 	return tsid && turnstile_get_by_id(tsid)->ts_inheritor == self;
623 }
624 
625 /*
626  * Routine:     lck_mtx_unlock_wakeup
627  *
628  * Invoked on unlock when there is contention.
629  *
630  * Called with the interlock locked.
631  *
632  * NOTE: callers should call turnstile_clenup after
633  * dropping the interlock.
634  */
635 static void
lck_mtx_unlock_wakeup(lck_mtx_t * lck,__kdebug_only thread_t thread)636 lck_mtx_unlock_wakeup(
637 	lck_mtx_t                       *lck,
638 	__kdebug_only thread_t          thread)
639 {
640 	struct turnstile *ts;
641 	kern_return_t did_wake;
642 
643 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
644 	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(thread), 0, 0, 0);
645 
646 	ts = turnstile_get_by_id(lck->lck_mtx_tsid);
647 
648 	/*
649 	 * We can skip turnstile_{prepare,cleanup} because
650 	 * we hold the interlock of the primitive,
651 	 * and enqueues/wakeups all happen under the interlock,
652 	 * which means the turnstile is stable.
653 	 */
654 	did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(lck),
655 	    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
656 	assert(did_wake == KERN_SUCCESS);
657 
658 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
659 
660 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
661 }
662 
663 
664 #pragma mark lck_mtx_t: lck_mtx_lock
665 
666 static inline bool
lck_mtx_ctid_on_core(uint32_t ctid)667 lck_mtx_ctid_on_core(uint32_t ctid)
668 {
669 	thread_t th = ctid_get_thread_unsafe(ctid);
670 
671 	return th && machine_thread_on_core_allow_invalid(th);
672 }
673 
/*
 * Owner thread of @c lock, unslid for tracing.
 *
 * Resolves the owner field only: the full "data" word also carries
 * flag bits (interlock, needs-wakeup, ...) and is not a valid
 * compact thread id whenever any of them is set.
 */
#define LCK_MTX_OWNER_FOR_TRACE(lock) \
	VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe((lock)->lck_mtx.owner))
676 
677 static void
lck_mtx_lock_adaptive_spin(lck_mtx_t * lock,lck_mtx_state_t state)678 lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state)
679 {
680 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
681 	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
682 	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
683 	hw_spin_state_t   ss  = { };
684 	uint64_t          deadline;
685 
686 	lck_mtx_mcs_t     mcs, node;
687 	lck_mcs_id_t      idx, pidx, clear_idx;
688 	unsigned long     prev;
689 	lck_mtx_state_t   nstate;
690 	ast_t      *const astp = ast_pending();
691 
692 	idx  = lck_mtx_get_mcs_id();
693 	mcs  = &lck_mcs_get_current()->mcs_mtx;
694 
695 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
696 	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
697 
698 	/*
699 	 *	Take a spot in the adaptive spin queue,
700 	 *	and then spin until we're at the head of it.
701 	 *
702 	 *	Until we're at the head, we do not need to monitor
703 	 *	for whether the current owner is on core or not:
704 	 *
705 	 *	1. the head of the queue is doing it already,
706 	 *
707 	 *	2. when the entire adaptive spin queue will "give up"
708 	 *	   as a result of the owner going off core, we want
709 	 *	   to avoid a thundering herd and let the AS queue
710 	 *	   pour into the interlock one slowly.
711 	 *
712 	 *	Do give up if the scheduler made noises something
713 	 *	more important has shown up.
714 	 *
715 	 *	Note: this function is optimized so that we do not touch
716 	 *	      our local mcs node when we're the head of the queue.
717 	 *
718 	 *	      This allows us in the case when the contention is
719 	 *	      between 2 cores only to not have to touch this
720 	 *	      cacheline at all.
721 	 */
722 	pidx = os_atomic_xchg(&lock->lck_mtx.as_tail, idx, release);
723 	if (pidx) {
724 		node = lck_mtx_get_mcs(pidx);
725 		mcs->lmm_as_prev = pidx;
726 		os_atomic_store(&node->lmm_as_next, mcs, release);
727 
728 		while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev,
729 		    prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT))) {
730 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
731 		}
732 
733 		if (__improbable(prev)) {
734 			goto adaptive_spin_fail;
735 		}
736 
737 		clear_idx = 0;
738 	} else {
739 		clear_idx = idx;
740 	}
741 
742 	/*
743 	 *	We're now first in line.
744 	 *
745 	 *	It's our responsbility to monitor the lock's state
746 	 *	for whether (1) the lock has become available,
747 	 *	(2) its owner has gone off core, (3) the scheduler
748 	 *	wants its CPU back, or (4) we've spun for too long.
749 	 */
750 	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed);
751 
752 	for (;;) {
753 		state.val = lock_load_exclusive(&lock->lck_mtx.val, acquire);
754 
755 		if (__probable(!state.ilocked && !state.ilk_tail && !state.owner)) {
756 			/*
757 			 * 2-core contention: if we can, try to dequeue
758 			 * ourselves from the adaptive spin queue
759 			 * as part of this CAS in order to avoid
760 			 * the cost of lck_mtx_ilk_lock_cleanup_as_mcs()
761 			 * and zeroing the mcs node at all.
762 			 *
763 			 * Because the queue is designed to limit contention,
764 			 * using store-exclusive over an armv8.1 LSE atomic
765 			 * is actually marginally better (presumably due to
766 			 * the better codegen).
767 			 */
768 			nstate = state;
769 			nstate.ilocked = true;
770 			if (state.as_tail == clear_idx) {
771 				nstate.as_tail = 0;
772 			}
773 			if (__probable(lock_store_exclusive(&lock->lck_mtx.val,
774 			    state.val, nstate.val, acquire))) {
775 				break;
776 			}
777 		} else {
778 			lock_wait_for_event();
779 		}
780 
781 		if (__improbable(ml_get_timebase() > deadline ||
782 		    (os_atomic_load(astp, relaxed) & AST_URGENT) ||
783 		    (!state.ilocked && !state.ilk_tail && state.owner &&
784 		    !lck_mtx_ctid_on_core(state.owner)))) {
785 			goto adaptive_spin_fail;
786 		}
787 	}
788 
789 	/*
790 	 *	If we're here, we got the lock, we just have to cleanup
791 	 *	the MCS nodes and return.
792 	 */
793 	if (state.as_tail != clear_idx) {
794 		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
795 		lck_mtx_mcs_clear(mcs);
796 	}
797 
798 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
799 	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(thread),
800 	    lock->lck_mtx_tsid, 0, 0);
801 	return;
802 
803 adaptive_spin_fail:
804 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
805 	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
806 	return lck_mtx_ilk_lock_contended(lock, state, LCK_ILK_MODE_FROM_AS);
807 }
808 
809 static NOINLINE void
lck_mtx_lock_contended(lck_mtx_t * lock,thread_t thread,lck_mtx_mode_t mode)810 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, lck_mtx_mode_t mode)
811 {
812 	struct turnstile *ts = TURNSTILE_NULL;
813 	lck_mtx_state_t   state;
814 	uint32_t          ctid = thread->ctid;
815 	uint32_t          data;
816 #if CONFIG_DTRACE
817 	int               first_miss = 0;
818 #endif /* CONFIG_DTRACE */
819 	bool              direct_wait = false;
820 	uint64_t          spin_start;
821 	uint32_t          profile;
822 
823 	lck_mtx_check_irq(lock);
824 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
825 		lock_disable_preemption_for_thread(thread);
826 	}
827 
828 	for (;;) {
829 		/*
830 		 *	Load the current state and perform sanity checks
831 		 *
832 		 *	Note that the various "corrupt" values are designed
833 		 *	so that the slowpath is taken when a mutex was used
834 		 *	after destruction, so that we do not have to do
835 		 *	sanity checks in the fast path.
836 		 */
837 		state = os_atomic_load(&lock->lck_mtx, relaxed);
838 		if (state.owner == ctid) {
839 			__lck_mtx_owned_panic(lock, thread);
840 		}
841 		if (lock->lck_mtx_type != LCK_TYPE_MUTEX ||
842 		    state.data == LCK_MTX_TAG_DESTROYED) {
843 			__lck_mtx_invalid_panic(lock);
844 		}
845 		profile = (state.data & LCK_MTX_PROFILE);
846 
847 		/*
848 		 *	Attempt steal
849 		 *
850 		 *	When the lock state is 0, then no thread can be queued
851 		 *	for adaptive spinning or for the interlock yet.
852 		 *
853 		 *	As such we can attempt to try to take the interlock.
854 		 *	(we can't take the mutex directly because we need
855 		 *	the interlock to do turnstile operations on the way out).
856 		 */
857 		if ((state.val & ~(uint64_t)LCK_MTX_PROFILE) == 0) {
858 			if (!os_atomic_cmpxchgv(&lock->lck_mtx.val,
859 			    state.val, state.val | LCK_MTX_ILOCK,
860 			    &state.val, acquire)) {
861 				continue;
862 			}
863 			break;
864 		}
865 
866 #if CONFIG_DTRACE
867 		if (profile) {
868 			LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
869 		}
870 #endif /* CONFIG_DTRACE */
871 
872 		if (mode == LCK_MTX_MODE_SLEEPABLE) {
873 			spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
874 		} else {
875 			spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
876 		}
877 
878 		/*
879 		 *	Adaptive spin or interlock
880 		 *
881 		 *	Evaluate if adaptive spinning should be attempted,
882 		 *	and if yes go to adaptive spin.
883 		 *
884 		 *	Otherwise (and this includes always-spin mutexes),
885 		 *	go for the interlock.
886 		 */
887 		if (mode != LCK_MTX_MODE_SPIN_ALWAYS &&
888 		    (state.ilocked || state.as_tail || !state.owner ||
889 		    lck_mtx_ctid_on_core(state.owner))) {
890 			lck_mtx_lock_adaptive_spin(lock, state);
891 		} else {
892 			direct_wait = true;
893 			lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
894 		}
895 
896 		if (mode == LCK_MTX_MODE_SLEEPABLE) {
897 			LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
898 		} else {
899 			LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
900 		}
901 
902 		/*
903 		 *	Take or sleep
904 		 *
905 		 *	We now have the interlock. Either the owner
906 		 *	isn't set, and the mutex is ours to claim,
907 		 *	or we must go to sleep.
908 		 *
909 		 *	If we go to sleep, we need to set LCK_MTX_NEEDS_WAKEUP
910 		 *	to force the current lock owner to call
911 		 *	lck_mtx_unlock_wakeup().
912 		 */
913 		state = os_atomic_load(&lock->lck_mtx, relaxed);
914 		if (state.owner == LCK_MTX_NULL_CTID) {
915 			break;
916 		}
917 
918 		if (mode == LCK_MTX_MODE_SPIN_ALWAYS) {
919 			__lck_mtx_lock_is_sleepable_panic(lock);
920 		}
921 
922 #if CONFIG_DTRACE
923 		if (profile) {
924 			LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
925 			    direct_wait, &first_miss);
926 		}
927 #endif /* CONFIG_DTRACE */
928 		os_atomic_store(&lock->lck_mtx.data,
929 		    state.data | LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP,
930 		    compiler_acq_rel);
931 		ts = lck_mtx_lock_wait(lock, thread,
932 		    ctid_get_thread(state.owner), ts);
933 
934 		/* returns interlock unlocked and preemption re-enabled */
935 		lock_disable_preemption_for_thread(thread);
936 	}
937 
938 	/*
939 	 *	We can take the lock!
940 	 *
941 	 *	We only have the interlock and the owner field is 0.
942 	 *
943 	 *	Perform various turnstile cleanups if needed,
944 	 *	claim the lock, and reenable preemption (if needed).
945 	 */
946 	if (ts) {
947 		lck_mtx_lock_wait_done(lock, ts);
948 	}
949 	data = ctid | profile;
950 	if (lck_mtx_lock_will_need_wakeup(lock, thread)) {
951 		data |= LCK_MTX_NEEDS_WAKEUP;
952 	}
953 	if (mode != LCK_MTX_MODE_SLEEPABLE) {
954 		data |= LCK_MTX_ILOCK | LCK_MTX_SPIN_MODE;
955 	}
956 	os_atomic_store(&lock->lck_mtx.data, data, release);
957 
958 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
959 		lock_enable_preemption();
960 	}
961 
962 	assert(thread->turnstile != NULL);
963 
964 	if (ts) {
965 		turnstile_cleanup();
966 	}
967 	LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
968 	    mode != LCK_MTX_MODE_SLEEPABLE, profile);
969 }
970 
/*
 *	Routine:	lck_mtx_lock_slow
 *
 *	Slow path entered from lck_mtx_lock_fastpath() when either its CAS
 *	did not observe an all-zero mutex state, or one of its "sniff"
 *	checks asked for the slow path.
 *
 *	Parameters:
 *	  lock   - the mutex being acquired
 *	  thread - the acquiring thread (its ctid is used as owner value)
 *	  state  - the state value handed over by the fast path
 *	           (only consulted when CONFIG_DTRACE is set)
 *	  mode   - sleepable / spin / spin-always acquisition mode
 *
 *	The function is kept out of line when invariants checking or DTrace
 *	is compiled in, and is force-inlined otherwise.
 */
#if LCK_MTX_CHECK_INVARIANTS || CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_lock_slow(
	lck_mtx_t              *lock,
	thread_t                thread,
	lck_mtx_state_t         state,
	lck_mtx_mode_t          mode)
{
#pragma unused(state)
#if CONFIG_DTRACE
	/* Expected state of an otherwise idle mutex that is being profiled. */
	lck_mtx_state_t ostate = {
		.data = LCK_MTX_PROFILE,
	};
#endif /* CONFIG_DTRACE */

#if LCK_MTX_CHECK_INVARIANTS
	/* Always-spin acquisitions are exempt from the preemption check. */
	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		lck_mtx_check_preemption(lock, thread,
		    (mode == LCK_MTX_MODE_SPIN));
	}
#endif /* LCK_MTX_CHECK_INVARIANTS */
#if CONFIG_DTRACE
	/*
	 *	The fast path expects a state of 0, so a mutex whose only
	 *	set bit is LCK_MTX_PROFILE always lands here. Retry the
	 *	acquisition with the profile bit as the expected value.
	 */
	if (state.val == ostate.val) {
		state.data = thread->ctid | LCK_MTX_PROFILE;
		if (mode != LCK_MTX_MODE_SLEEPABLE) {
			state.ilocked = true;
			state.spin_mode = true;
		}
		/*
		 *	NOTE(review): relies on os_atomic_cmpxchgv() leaving
		 *	the observed lock value in state.val - confirm against
		 *	the macro definition.
		 */
		os_atomic_cmpxchgv(&lock->lck_mtx.val,
		    ostate.val, state.val, &state.val, acquire);
	}
	/*
	 *	No bit other than (possibly) LCK_MTX_PROFILE in the observed
	 *	state: the lock has been taken, either by the fast path CAS
	 *	or by the retry above. Report it and return.
	 */
	if ((state.val & ~ostate.val) == 0) {
		LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
		    mode != LCK_MTX_MODE_SLEEPABLE,
		    state.data & LCK_MTX_PROFILE);
		return;
	}
#endif /* CONFIG_DTRACE */
	/* Anything else is handled by the contended path. */
	lck_mtx_lock_contended(lock, thread, mode);
}
1015 
/*
 *	Routine:	lck_mtx_lock_fastpath
 *
 *	Inlined fast path shared by lck_mtx_lock(), lck_mtx_lock_spin()
 *	and lck_mtx_lock_spin_always().
 *
 *	Tries to move the whole mutex state from 0 to "owned by the current
 *	thread" with one acquire CAS. lck_mtx_lock_slow() is called when the
 *	CAS observed a non-zero state, or when one of the sniff checks
 *	returned a non-zero value.
 *
 *	For the spin modes, preemption is disabled before the CAS and the
 *	desired state also carries the ilocked and spin_mode bits.
 */
static __attribute__((always_inline)) void
lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	/* Desired new state: owned by this thread, no other bit set. */
	lck_mtx_state_t state = {
		.data = thread->ctid,
	};
	/* Accumulates every reason to leave the fast path; 0 means "done". */
	uint64_t take_slowpath = 0;

	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		take_slowpath |= LCK_MTX_SNIFF_PREEMPTION(thread);
	}
	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
		state.ilocked = true;
		state.spin_mode = true;
	}

	/*
	 * Do the CAS on the entire mutex state,
	 * which hence requires for the ILK/AS queues
	 * to be empty (which is fairer).
	 */
	lock_cmpxchgv(&lock->lck_mtx.val,
	    0, state.val, &state.val, acquire);

	/*
	 * state.val is the CAS out-parameter: any non-zero value left in it
	 * forces the slow path (NOTE(review): assumes lock_cmpxchgv() stores
	 * the observed value there, i.e. 0 on success - confirm).
	 */
	take_slowpath |= state.val;
	if (__improbable(take_slowpath)) {
		return lck_mtx_lock_slow(lock, thread, state, mode);
	}
}
1049 
1050 void
lck_mtx_lock(lck_mtx_t * lock)1051 lck_mtx_lock(lck_mtx_t *lock)
1052 {
1053 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
1054 }
1055 
1056 void
lck_mtx_lock_spin(lck_mtx_t * lock)1057 lck_mtx_lock_spin(lck_mtx_t *lock)
1058 {
1059 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
1060 }
1061 
1062 void
lck_mtx_lock_spin_always(lck_mtx_t * lock)1063 lck_mtx_lock_spin_always(lck_mtx_t *lock)
1064 {
1065 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
1066 }
1067 
1068 
1069 #pragma mark lck_mtx_t: lck_mtx_try_lock
1070 
/*
 *	Routine:	lck_mtx_try_lock_slow_inline
 *
 *	Common body of lck_mtx_try_lock_slow() and
 *	lck_mtx_try_lock_slow_spin(), reached once the try-lock fast path
 *	decided it could not simply return success.
 *
 *	Parameters:
 *	  lock   - the mutex being acquired
 *	  thread - the acquiring thread (not used in this body)
 *	  odata  - lock data observed by the fast path CAS
 *	  ndata  - lock data the fast path tried to install
 *	  spin   - true when the caller disabled preemption for a
 *	           spin-mode attempt
 *
 *	Returns true when the mutex is held on return, false otherwise.
 *	On failure with `spin` set, preemption is re-enabled before
 *	returning. Without CONFIG_DTRACE this function always fails.
 */
static __attribute__((always_inline)) bool
lck_mtx_try_lock_slow_inline(
	lck_mtx_t              *lock,
	thread_t                thread,
	uint32_t                odata,
	uint32_t                ndata,
	bool                    spin)
{
#pragma unused(lock, thread, odata, ndata)
#if CONFIG_DTRACE
	/*
	 * The fast path CAS expects 0 and therefore cannot succeed on a
	 * profiled mutex: retry with LCK_MTX_PROFILE as the expected value,
	 * keeping the bit set in the new value.
	 */
	if (odata == LCK_MTX_PROFILE) {
		os_atomic_cmpxchgv(&lock->lck_mtx.data,
		    odata, ndata | LCK_MTX_PROFILE, &odata, acquire);
	}
	/*
	 * Nothing but (possibly) the profile bit was observed: either the
	 * fast path CAS or the retry above took the lock.
	 */
	if ((odata & ~LCK_MTX_PROFILE) == 0) {
		LCK_MTX_TRY_ACQUIRED(lock, lock->lck_mtx_grp,
		    spin, odata & LCK_MTX_PROFILE);
		return true;
	}
	/* The attempt failed on a profiled mutex: record the miss. */
	if (odata & LCK_MTX_PROFILE) {
		LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &(int){ 0 });
	}
#endif /* CONFIG_DTRACE */

	/* Failure: undo the preemption disable done for spin attempts. */
	if (spin) {
		lock_enable_preemption();
	}
	return false;
}
1100 
1101 #if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
1102 __attribute__((noinline))
1103 #else
1104 __attribute__((always_inline))
1105 #endif
1106 static bool
lck_mtx_try_lock_slow(lck_mtx_t * lock,thread_t thread,uint32_t odata,uint32_t ndata)1107 lck_mtx_try_lock_slow(
1108 	lck_mtx_t              *lock,
1109 	thread_t                thread,
1110 	uint32_t                odata,
1111 	uint32_t                ndata)
1112 {
1113 	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, false);
1114 }
1115 
1116 #if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
1117 __attribute__((noinline))
1118 #else
1119 __attribute__((always_inline))
1120 #endif
1121 static bool
lck_mtx_try_lock_slow_spin(lck_mtx_t * lock,thread_t thread,uint32_t odata,uint32_t ndata)1122 lck_mtx_try_lock_slow_spin(
1123 	lck_mtx_t              *lock,
1124 	thread_t                thread,
1125 	uint32_t                odata,
1126 	uint32_t                ndata)
1127 {
1128 	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, true);
1129 }
1130 
/*
 *	Routine:	lck_mtx_try_lock_fastpath
 *
 *	Inlined fast path shared by lck_mtx_try_lock(),
 *	lck_mtx_try_lock_spin() and lck_mtx_try_lock_spin_always().
 *
 *	Tries to move the 32-bit mutex data from 0 to "owned by the current
 *	thread" with one acquire CAS, and returns true immediately when that
 *	worked and nothing else requested the slow path. Every other case is
 *	resolved by lck_mtx_try_lock_slow() / lck_mtx_try_lock_slow_spin().
 *
 *	For the spin modes, preemption is disabled before the CAS and the
 *	new value also carries LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK.
 */
static __attribute__((always_inline)) bool
lck_mtx_try_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	/* odata: value observed by the CAS; ndata: value we try to install. */
	uint32_t odata, ndata = thread->ctid;
	/* Accumulates every reason to leave the fast path; 0 means "done". */
	uint32_t take_slowpath = 0;

#if CONFIG_DTRACE
	take_slowpath |= lck_debug_state.lds_value;
#endif
	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
		ndata |= LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK;
	}

	/*
	 * try_lock because it's likely to be used for cases
	 * like lock inversion resolutions tries a bit harder
	 * than lck_mtx_lock() to take the lock and ignores
	 * adaptive spin / interlock queues by doing the CAS
	 * on the 32bit mutex data only.
	 */
	lock_cmpxchgv(&lock->lck_mtx.data, 0, ndata, &odata, acquire);

	/*
	 * NOTE(review): assumes lock_cmpxchgv() stores the observed value
	 * in odata, i.e. 0 when the CAS succeeded - confirm.
	 */
	take_slowpath |= odata;
	if (__probable(!take_slowpath)) {
		return true;
	}

	/*
	 * An always-spin attempt that finds the mutex owned (ctid bits set)
	 * but not held in spin mode is treated as a fatal misuse.
	 */
	if (mode == LCK_MTX_MODE_SPIN_ALWAYS &&
	    (odata & LCK_MTX_CTID_MASK) &&
	    !(odata & LCK_MTX_SPIN_MODE)) {
		__lck_mtx_lock_is_sleepable_panic(lock);
	}

	/* The spin variant also re-enables preemption when it fails. */
	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		return lck_mtx_try_lock_slow(lock, thread, odata, ndata);
	} else {
		return lck_mtx_try_lock_slow_spin(lock, thread, odata, ndata);
	}
}
1172 
1173 boolean_t
lck_mtx_try_lock(lck_mtx_t * lock)1174 lck_mtx_try_lock(lck_mtx_t *lock)
1175 {
1176 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
1177 }
1178 
1179 boolean_t
lck_mtx_try_lock_spin(lck_mtx_t * lock)1180 lck_mtx_try_lock_spin(lck_mtx_t *lock)
1181 {
1182 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
1183 }
1184 
1185 boolean_t
lck_mtx_try_lock_spin_always(lck_mtx_t * lock)1186 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
1187 {
1188 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
1189 }
1190 
1191 
1192 #pragma mark lck_mtx_t: lck_mtx_unlock
1193 
/*
 *	Routine:	lck_mtx_unlock_contended
 *
 *	Out-of-line unlock path used when the unlock fast path could not
 *	release the mutex with a single CAS.
 *
 *	Parameters:
 *	  lock   - the mutex being released
 *	  thread - the releasing thread; must be the owner
 *	  data   - lock data observed by the caller
 *
 *	Takes the interlock unless the mutex is held in spin mode, calls
 *	lck_mtx_unlock_wakeup() when LCK_MTX_NEEDS_WAKEUP is set, then
 *	releases the interlock.
 */
static NOINLINE void
lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
	/* Set when a wakeup was issued and turnstile_cleanup() is owed. */
	bool cleanup = false;

#if !CONFIG_DTRACE
	/*
	 * This check is done by lck_mtx_unlock_slow() when it is enabled.
	 */
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
#endif /* !CONFIG_DTRACE */

	/* A mutex not held in spin mode needs its interlock taken here. */
	if ((data & LCK_MTX_SPIN_MODE) == 0) {
		lock_disable_preemption_for_thread(thread);
		lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_UNLOCK);
	}

	/*
	 * We must re-load the data: we might have taken
	 * the slowpath because another thread had taken
	 * the interlock and set the NEEDS_WAKEUP bit
	 * while we were spinning to get it.
	 */
	data = os_atomic_load(&lock->lck_mtx.data, compiler_acq_rel);
	if (data & LCK_MTX_NEEDS_WAKEUP) {
		lck_mtx_unlock_wakeup(lock, thread);
		cleanup = true;
	}
	/*
	 * Only the profile bit is carried over into the released state.
	 * NOTE(review): presumably lck_mtx_ilk_unlock_v() stores this value
	 * and re-enables preemption - confirm against its definition.
	 */
	lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);

	LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, data & LCK_MTX_PROFILE);

	/*
	 * Do not do any turnstile operations outside of this block.
	 *
	 * lock/unlock is called at early stage of boot while single
	 * threaded, without turnstiles being available yet.
	 * Even without contention we can come through the slow path
	 * if the mutex is acquired as a spin lock.
	 */
	if (cleanup) {
		turnstile_cleanup();
	}
}
1240 
/*
 *	Routine:	lck_mtx_unlock_slow
 *
 *	Slow path of lck_mtx_unlock().
 *
 *	Parameters:
 *	  lock   - the mutex being released
 *	  thread - the releasing thread
 *	  data   - lock data observed by the fast path CAS
 *
 *	With CONFIG_DTRACE, this handles profiled mutexes (whose fast path
 *	CAS cannot succeed) and fires the release probe; it returns early
 *	once the mutex is released. All remaining cases are forwarded to
 *	lck_mtx_unlock_contended(). Without CONFIG_DTRACE it is a plain
 *	inlined forwarder.
 */
#if CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
#if CONFIG_DTRACE
	/*
	 *	If Dtrace is enabled, locks can be profiled,
	 *	which causes the fastpath of unlock to fail.
	 */
	if ((data & LCK_MTX_BITS_MASK) == LCK_MTX_PROFILE) {
		/*
		 *	Release while keeping the profile bit set.
		 *	NOTE(review): relies on os_atomic_cmpxchgv() leaving
		 *	the observed value in `data` - confirm.
		 */
		os_atomic_cmpxchgv(&lock->lck_mtx.data, data, LCK_MTX_PROFILE,
		    &data, release);
	}
	/* The owner field of the observed data must be the caller's ctid. */
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
	/*
	 *	No flag bit other than (possibly) LCK_MTX_PROFILE was
	 *	observed: the mutex has been released, either by the
	 *	fast path or by the CAS above.
	 */
	if ((data & (LCK_MTX_BITS_MASK & ~LCK_MTX_PROFILE)) == 0) {
		LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, false);
		return;
	}
#endif /* CONFIG_DTRACE */

	lck_mtx_unlock_contended(lock, thread, data);
}
1269 
/*
 *	Routine:	lck_mtx_unlock
 *
 *	Release a mutex held by the current thread.
 *
 *	The fast path is a single release CAS of the 32-bit lock data from
 *	"the caller's ctid and no flag bits" to 0. When that CAS fails, or
 *	when LCK_MTX_SNIFF_DTRACE() returns non-zero, the work is finished
 *	by lck_mtx_unlock_slow().
 */
void
lck_mtx_unlock(lck_mtx_t *lock)
{
	thread_t thread = current_thread();
	/* Non-zero forces a visit to the slow path even if the CAS worked. */
	uint32_t take_slowpath = 0;
	/* Receives the lock data observed by the CAS. */
	uint32_t data;

	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	/*
	 * The fast path ignores the ILK/AS queues on purpose,
	 * those really are a "lock" concept, not unlock.
	 */
	if (__probable(lock_cmpxchgv(&lock->lck_mtx.data,
	    thread->ctid, 0, &data, release))) {
		if (__probable(!take_slowpath)) {
			return;
		}
	}

	lck_mtx_unlock_slow(lock, thread, data);
}
1292 
1293 
1294 #pragma mark lck_mtx_t: misc
1295 
1296 void
lck_mtx_assert(lck_mtx_t * lock,unsigned int type)1297 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
1298 {
1299 	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
1300 	thread_t        thread = current_thread();
1301 
1302 	if (type == LCK_MTX_ASSERT_OWNED) {
1303 		if (state.owner != thread->ctid) {
1304 			__lck_mtx_not_owned_panic(lock, thread);
1305 		}
1306 	} else if (type == LCK_MTX_ASSERT_NOTOWNED) {
1307 		if (state.owner == thread->ctid) {
1308 			__lck_mtx_owned_panic(lock, thread);
1309 		}
1310 	} else {
1311 		panic("lck_mtx_assert(): invalid arg (%u)", type);
1312 	}
1313 }
1314 
/*
 *	Routine:	lck_mtx_convert_spin
 *
 *	Convert a mutex held for spin into a held full mutex
 *
 *	Panics when the current thread is not the owner.
 *	Does nothing when the mutex is not currently held in spin mode.
 */
void
lck_mtx_convert_spin(lck_mtx_t *lock)
{
	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
	thread_t        thread = current_thread();
	uint32_t        data   = thread->ctid;

	/* Only the owner may convert the mutex. */
	if (state.owner != data) {
		__lck_mtx_not_owned_panic(lock, thread);
	}

	if (state.spin_mode) {
		/*
		 * Note: we can acquire the lock in spin mode
		 *       _and_ be the inheritor if we waited.
		 *
		 *       We must only clear ilocked and spin_mode,
		 *       but preserve owner and needs_wakeup.
		 */
		state.ilocked = false;
		state.spin_mode = false;
		/*
		 * NOTE(review): presumably lck_mtx_ilk_unlock_v() publishes
		 * state.data as the new lock data while dropping the
		 * interlock - confirm against its definition.
		 */
		lck_mtx_ilk_unlock_v(lock, state.data);
		turnstile_cleanup();
	}
}
1345 
1346 /*
1347  * Routine: kdp_lck_mtx_lock_spin_is_acquired
1348  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1349  */
1350 boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t * lck)1351 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
1352 {
1353 	lck_mtx_state_t state = os_atomic_load(&lck->lck_mtx, relaxed);
1354 
1355 	if (not_in_kdp) {
1356 		panic("panic: spinlock acquired check done outside of kernel debugger");
1357 	}
1358 	if (state.data == LCK_MTX_TAG_DESTROYED) {
1359 		return false;
1360 	}
1361 	return state.owner || state.ilocked;
1362 }
1363 
1364 void
kdp_lck_mtx_find_owner(struct waitq * waitq __unused,event64_t event,thread_waitinfo_t * waitinfo)1365 kdp_lck_mtx_find_owner(
1366 	struct waitq           *waitq __unused,
1367 	event64_t               event,
1368 	thread_waitinfo_t      *waitinfo)
1369 {
1370 	lck_mtx_t      *mutex  = LCK_EVENT_TO_MUTEX(event);
1371 	lck_mtx_state_t state  = os_atomic_load(&mutex->lck_mtx, relaxed);
1372 
1373 	assert3u(state.data, !=, LCK_MTX_TAG_DESTROYED);
1374 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1375 	waitinfo->owner   = thread_tid(ctid_get_thread(state.owner));
1376 }
1377 
1378 #endif /* !LCK_MTX_USE_ARCH */
1379 
1380 /*
1381  * Routine:     mutex_pause
1382  *
1383  * Called by former callers of simple_lock_pause().
1384  */
1385 #define MAX_COLLISION_COUNTS    32
1386 #define MAX_COLLISION   8
1387 
1388 unsigned int max_collision_count[MAX_COLLISION_COUNTS];
1389 
1390 uint32_t collision_backoffs[MAX_COLLISION] = {
1391 	10, 50, 100, 200, 400, 600, 800, 1000
1392 };
1393 
1394 
1395 void
mutex_pause(uint32_t collisions)1396 mutex_pause(uint32_t collisions)
1397 {
1398 	wait_result_t wait_result;
1399 	uint32_t        back_off;
1400 
1401 	if (collisions >= MAX_COLLISION_COUNTS) {
1402 		collisions = MAX_COLLISION_COUNTS - 1;
1403 	}
1404 	max_collision_count[collisions]++;
1405 
1406 	if (collisions >= MAX_COLLISION) {
1407 		collisions = MAX_COLLISION - 1;
1408 	}
1409 	back_off = collision_backoffs[collisions];
1410 
1411 	wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
1412 	assert(wait_result == THREAD_WAITING);
1413 
1414 	wait_result = thread_block(THREAD_CONTINUE_NULL);
1415 	assert(wait_result == THREAD_TIMED_OUT);
1416 }
1417 
1418 
1419 unsigned int mutex_yield_wait = 0;
1420 unsigned int mutex_yield_no_wait = 0;
1421 
1422 boolean_t
lck_mtx_yield(lck_mtx_t * lck)1423 lck_mtx_yield(
1424 	lck_mtx_t   *lck)
1425 {
1426 	bool has_waiters = LCK_MTX_HAS_WAITERS(lck);
1427 
1428 #if DEBUG
1429 	lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
1430 #endif /* DEBUG */
1431 
1432 	if (!has_waiters) {
1433 		mutex_yield_no_wait++;
1434 	} else {
1435 		mutex_yield_wait++;
1436 		lck_mtx_unlock(lck);
1437 		mutex_pause(0);
1438 		lck_mtx_lock(lck);
1439 	}
1440 	return has_waiters;
1441 }
1442