xref: /xnu-11215/osfmk/kern/timer_call.c (revision 8d741a5d)
1 /*
2  * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Timer interrupt callout module.
30  */
31 
32 #include <mach/mach_types.h>
33 
34 #include <kern/clock.h>
35 #include <kern/counter.h>
36 #include <kern/smp.h>
37 #include <kern/processor.h>
38 #include <kern/timer_call.h>
39 #include <kern/timer_queue.h>
40 #include <kern/thread.h>
41 #include <kern/thread_group.h>
42 #include <kern/policy_internal.h>
43 
44 #include <sys/kdebug.h>
45 
46 #if CONFIG_DTRACE
47 #include <mach/sdt.h>
48 #endif
49 
50 
51 #if DEBUG
52 #define TIMER_ASSERT    1
53 #endif
54 
55 //#define TIMER_ASSERT	1
56 //#define TIMER_DBG	1
57 
58 #if TIMER_DBG
59 #define DBG(x...) kprintf("DBG: " x);
60 #else
61 #define DBG(x...)
62 #endif
63 
64 #if TIMER_TRACE
65 #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
66 #else
67 #define TIMER_KDEBUG_TRACE(x...)
68 #endif
69 
70 LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
71 LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
72 LCK_GRP_DECLARE(timer_queue_lck_grp, "timer_queue");
73 
74 /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
75 #define timer_queue_lock_spin(queue) lck_ticket_lock(&(queue)->lock_data, &timer_queue_lck_grp)
76 #define timer_queue_unlock(queue)    lck_ticket_unlock(&(queue)->lock_data)
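/*
 * Minimal usage sketch (illustrative only): callers raise the interrupt
 * level with splclock() before taking a timer queue lock and restore it
 * afterwards, mirroring the pattern used throughout this file.
 *
 *	spl_t s = splclock();
 *	timer_queue_lock_spin(queue);
 *	... examine or modify the mpqueue ...
 *	timer_queue_unlock(queue);
 *	splx(s);
 */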
77 
78 /*
79  * The longterm timer object is a global structure holding all timers
80  * beyond the short-term, local timer queue threshold. The boot processor
81  * is responsible for moving each timer to its local timer queue
82  * if and when that timer becomes due within the threshold.
83  */
84 
85 /* Sentinel for "no time set": */
86 #define TIMER_LONGTERM_NONE             EndOfAllTime
87 /* The default threshold is the delta above which a timer is "long-term" */
88 #if defined(__x86_64__)
89 #define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)   /* 1 sec */
90 #else
91 #define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE     /* disabled */
92 #endif
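/*
 * Classification sketch (condensed from the check in
 * timer_longterm_enqueue_unlocked() below): a callout is placed on the
 * long-term queue only when the mechanism is enabled, the timer is not
 * CPU-local, and its deadline lies beyond the threshold from "now".
 *
 *	boolean_t is_longterm =
 *	    (threshold_interval != TIMER_LONGTERM_NONE) &&
 *	    ((callout_flags & TIMER_CALL_LOCAL) == 0) &&
 *	    (deadline > now + threshold_interval);
 */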
93 
94 /*
95  * The scan_limit throttles processing of the longterm queue.
96  * If the scan time exceeds this limit, we terminate, unlock
97  * and defer for scan_interval. This prevents unbounded holding of
98  * timer queue locks with interrupts masked.
99  */
100 #define TIMER_LONGTERM_SCAN_LIMIT       (100ULL * NSEC_PER_USEC)        /* 100 us */
101 #define TIMER_LONGTERM_SCAN_INTERVAL    (100ULL * NSEC_PER_USEC)        /* 100 us */
102 /* Sentinel for "scan limit exceeded": */
103 #define TIMER_LONGTERM_SCAN_AGAIN       0
104 
105 /*
106  * In a similar way to the longterm queue's scan limit, the following bounds the
107  * amount of time spent processing regular timers.
108  */
109 TUNABLE_WRITEABLE(uint64_t, timer_scan_limit_us, "timer_scan_limit_us", 400);
110 TUNABLE_WRITEABLE(uint64_t, timer_scan_interval_us, "timer_scan_interval_us", 40);
111 static uint64_t timer_scan_limit_abs = 0;
112 static uint64_t timer_scan_interval_abs = 0;
113 
114 /*
115  * Count of times scanning the queue was aborted early (to avoid long
116  * scan times).
117  */
118 SCALABLE_COUNTER_DEFINE(timer_scan_pauses_cnt);
119 
120 /*
121  * Count of times scanning the queue was aborted early resulting in a
122  * postponed hard deadline.
123  */
124 SCALABLE_COUNTER_DEFINE(timer_scan_postpones_cnt);
125 
126 #define MAX_TIMER_SCAN_LIMIT    (30000ULL * NSEC_PER_USEC)  /* 30 ms */
127 #define MIN_TIMER_SCAN_LIMIT    (   50ULL * NSEC_PER_USEC)  /* 50 us */
128 #define MAX_TIMER_SCAN_INTERVAL ( 2000ULL * NSEC_PER_USEC)  /*  2 ms */
129 #define MIN_TIMER_SCAN_INTERVAL (   20ULL * NSEC_PER_USEC)  /* 20 us */
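/*
 * Boot-args sketch: the TUNABLE_WRITEABLE names above are the boot-arg
 * names, so the 400us/40us defaults can be overridden at boot, e.g. to
 * tolerate longer scans on a debug kernel:
 *
 *	timer_scan_limit_us=1000 timer_scan_interval_us=100
 */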
130 
131 typedef struct {
132 	uint64_t        interval;       /* longterm timer interval */
133 	uint64_t        margin;         /* fudge factor (10% of interval) */
134 	uint64_t        deadline;       /* first/soonest longterm deadline */
135 	uint64_t        preempted;      /* sooner timer has pre-empted */
136 	timer_call_t    call;           /* first/soonest longterm timer call */
137 	uint64_t        deadline_set;   /* next timer set */
138 	timer_call_data_t timer;        /* timer used by threshold management */
139 	                                /* Stats: */
140 	uint64_t        scans;          /*   num threshold timer scans */
141 	uint64_t        preempts;       /*   num threshold reductions */
142 	uint64_t        latency;        /*   average threshold latency */
143 	uint64_t        latency_min;    /*   minimum threshold latency */
144 	uint64_t        latency_max;    /*   maximum threshold latency */
145 } threshold_t;
146 
147 typedef struct {
148 	mpqueue_head_t  queue;          /* longterm timer list */
149 	uint64_t        enqueues;       /* num timers queued */
150 	uint64_t        dequeues;       /* num timers dequeued */
151 	uint64_t        escalates;      /* num timers becoming shortterm */
152 	uint64_t        scan_time;      /* last time the list was scanned */
153 	threshold_t     threshold;      /* longterm timer threshold */
154 	uint64_t        scan_limit;     /* maximum scan time */
155 	uint64_t        scan_interval;  /* interval between LT "escalation" scans */
156 	uint64_t        scan_pauses;    /* num scans exceeding time limit */
157 } timer_longterm_t;
158 
159 timer_longterm_t                timer_longterm = {
160 	.scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
161 	.scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
162 };
163 
164 static mpqueue_head_t           *timer_longterm_queue = NULL;
165 
166 static void                     timer_longterm_init(void);
167 static void                     timer_longterm_callout(
168 	timer_call_param_t      p0,
169 	timer_call_param_t      p1);
170 extern void                     timer_longterm_scan(
171 	timer_longterm_t        *tlp,
172 	uint64_t                now);
173 static void                     timer_longterm_update(
174 	timer_longterm_t *tlp);
175 static void                     timer_longterm_update_locked(
176 	timer_longterm_t *tlp);
177 static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
178 	timer_call_t            call,
179 	uint64_t                now,
180 	uint64_t                deadline,
181 	mpqueue_head_t **       old_queue,
182 	uint64_t                soft_deadline,
183 	uint64_t                ttd,
184 	timer_call_param_t      param1,
185 	uint32_t                callout_flags);
186 static void                     timer_longterm_dequeued_locked(
187 	timer_call_t            call);
188 
189 uint64_t past_deadline_timers;
190 uint64_t past_deadline_deltas;
191 uint64_t past_deadline_longest;
192 uint64_t past_deadline_shortest = ~0ULL;
193 enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
194 
195 uint64_t past_deadline_timer_adjustment;
196 
197 static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
198 boolean_t       mach_timer_coalescing_enabled = TRUE;
199 
200 mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
201 	timer_call_t            call,
202 	mpqueue_head_t          *queue,
203 	uint64_t                deadline,
204 	uint64_t                soft_deadline,
205 	uint64_t                ttd,
206 	timer_call_param_t      param1,
207 	uint32_t                flags);
208 
209 mpqueue_head_t  *timer_call_dequeue_unlocked(
210 	timer_call_t            call);
211 
212 timer_coalescing_priority_params_t tcoal_prio_params;
213 
214 #if TCOAL_PRIO_STATS
215 int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
216 #define TCOAL_PRIO_STAT(x) (x++)
217 #else
218 #define TCOAL_PRIO_STAT(x)
219 #endif
220 
221 static void
222 timer_call_init_abstime(void)
223 {
224 	int i;
225 	uint64_t result;
226 	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
227 	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
228 	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
229 	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
230 	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
231 	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
232 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
233 	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
234 	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
235 	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
236 	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
237 	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
238 	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
239 
240 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
241 	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
242 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
243 	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
244 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
245 	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
246 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
247 	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
248 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
249 	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);
250 
251 	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
252 		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
253 		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
254 		    &tcoal_prio_params.latency_qos_abstime_max[i]);
255 		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
256 	}
257 
258 	nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC, &timer_scan_limit_abs);
259 	nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC, &timer_scan_interval_abs);
260 }
261 
262 
263 void
264 timer_call_init(void)
265 {
266 	timer_longterm_init();
267 	timer_call_init_abstime();
268 }
269 
270 
271 void
272 timer_call_queue_init(mpqueue_head_t *queue)
273 {
274 	DBG("timer_call_queue_init(%p)\n", queue);
275 	mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
276 }
277 
278 
279 void
280 timer_call_setup(
281 	timer_call_t                    call,
282 	timer_call_func_t               func,
283 	timer_call_param_t              param0)
284 {
285 	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
286 
287 	*call = (struct timer_call) {
288 		.tc_func = func,
289 		.tc_param0 = param0,
290 		.tc_async_dequeue = false,
291 	};
292 
293 	simple_lock_init(&(call)->tc_lock, 0);
294 }
295 
296 timer_call_t
297 timer_call_alloc(
298 	timer_call_func_t       func,
299 	timer_call_param_t      param0)
300 {
301 	timer_call_t call;
302 
303 	call = kalloc_type(struct timer_call, Z_ZERO | Z_WAITOK | Z_NOFAIL);
304 	timer_call_setup(call, func, param0);
305 	return call;
306 }
307 
308 void
309 timer_call_free(
310 	timer_call_t            call)
311 {
312 	kfree_type(struct timer_call, call);
313 }
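/*
 * Lifecycle sketch (illustrative only, not part of the build): allocate a
 * callout, arm it for an absolute deadline, then cancel and free it.
 * 'my_timer_fired', 'my_context' and the 10ms timeout are hypothetical;
 * the urgency flag is assumed to come from <kern/timer_call.h>.
 *
 *	static void my_timer_fired(timer_call_param_t p0, timer_call_param_t p1);
 *
 *	timer_call_t tc = timer_call_alloc(my_timer_fired, my_context);
 *	uint64_t interval, deadline;
 *
 *	nanoseconds_to_absolutetime(10 * NSEC_PER_MSEC, &interval);
 *	deadline = mach_absolute_time() + interval;
 *	(void) timer_call_enter(tc, deadline, TIMER_CALL_SYS_NORMAL);
 *	...
 *	boolean_t was_pending = timer_call_cancel(tc);
 *	timer_call_free(tc);
 */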
314 
315 static mpqueue_head_t*
316 mpqueue_for_timer_call(timer_call_t entry)
317 {
318 	queue_t queue_entry_is_on = entry->tc_queue;
319 	/* 'cast' the queue back to the original mpqueue */
320 	return __container_of(queue_entry_is_on, struct mpqueue_head, head);
321 }
322 
323 
324 static __inline__ mpqueue_head_t *
325 timer_call_entry_dequeue(
326 	timer_call_t            entry)
327 {
328 	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
329 
330 	/* The entry was always on a queue */
331 	assert(old_mpqueue != NULL);
332 
333 #if TIMER_ASSERT
334 	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
335 		panic("_call_entry_dequeue() "
336 		    "entry %p is not locked\n", entry);
337 	}
338 
339 	/*
340 	 * XXX The queue lock is actually a mutex in spin mode
341 	 *     but there's no way to test for it being held
342 	 *     so we pretend it's a spinlock!
343 	 */
344 	if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
345 		panic("_call_entry_dequeue() "
346 		    "queue %p is not locked\n", old_mpqueue);
347 	}
348 #endif /* TIMER_ASSERT */
349 
350 	if (old_mpqueue != timer_longterm_queue) {
351 		priority_queue_remove(&old_mpqueue->mpq_pqhead,
352 		    &entry->tc_pqlink);
353 	}
354 
355 	remqueue(&entry->tc_qlink);
356 
357 	entry->tc_queue = NULL;
358 
359 	old_mpqueue->count--;
360 
361 	return old_mpqueue;
362 }
363 
364 static __inline__ mpqueue_head_t *
365 timer_call_entry_enqueue_deadline(
366 	timer_call_t                    entry,
367 	mpqueue_head_t                  *new_mpqueue,
368 	uint64_t                        deadline)
369 {
370 	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
371 
372 #if TIMER_ASSERT
373 	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
374 		panic("_call_entry_enqueue_deadline() "
375 		    "entry %p is not locked\n", entry);
376 	}
377 
378 	/* XXX More lock pretense:  */
379 	if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
380 		panic("_call_entry_enqueue_deadline() "
381 		    "queue %p is not locked\n", new_mpqueue);
382 	}
383 
384 	if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
385 		panic("_call_entry_enqueue_deadline() "
386 		    "old_mpqueue %p != new_mpqueue", old_mpqueue);
387 	}
388 #endif /* TIMER_ASSERT */
389 
390 	/* no longterm queue involved */
391 	assert(new_mpqueue != timer_longterm_queue);
392 	assert(old_mpqueue != timer_longterm_queue);
393 
394 	if (old_mpqueue == new_mpqueue) {
395 		/* optimize the same-queue case to avoid a full re-insert */
396 		uint64_t old_deadline = entry->tc_pqlink.deadline;
397 		entry->tc_pqlink.deadline = deadline;
398 
399 		if (old_deadline < deadline) {
400 			priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
401 			    &entry->tc_pqlink);
402 		} else {
403 			priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
404 			    &entry->tc_pqlink);
405 		}
406 	} else {
407 		if (old_mpqueue != NULL) {
408 			priority_queue_remove(&old_mpqueue->mpq_pqhead,
409 			    &entry->tc_pqlink);
410 
411 			re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
412 		} else {
413 			enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
414 		}
415 
416 		entry->tc_queue = &new_mpqueue->head;
417 		entry->tc_pqlink.deadline = deadline;
418 
419 		priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
420 	}
421 
422 
423 	/* For efficiency, track the earliest soft deadline on the queue,
424 	 * so that fuzzy decisions can be made without lock acquisitions.
425 	 */
426 
427 	timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);
428 
429 	new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ? thead->tc_pqlink.deadline : thead->tc_soft_deadline;
430 
431 	if (old_mpqueue) {
432 		old_mpqueue->count--;
433 	}
434 	new_mpqueue->count++;
435 
436 	return old_mpqueue;
437 }
438 
439 static __inline__ void
440 timer_call_entry_enqueue_tail(
441 	timer_call_t                    entry,
442 	mpqueue_head_t                  *queue)
443 {
444 	/* entry is always dequeued before this call */
445 	assert(entry->tc_queue == NULL);
446 
447 	/*
448 	 * this is only used for timer_longterm_queue, which is unordered
449 	 * and thus needs no priority queueing
450 	 */
451 	assert(queue == timer_longterm_queue);
452 
453 	enqueue_tail(&queue->head, &entry->tc_qlink);
454 
455 	entry->tc_queue = &queue->head;
456 
457 	queue->count++;
458 	return;
459 }
460 
461 /*
462  * Remove timer entry from its queue but don't change the queue pointer
463  * and set the async_dequeue flag. This is locking case 2b.
464  */
465 static __inline__ void
466 timer_call_entry_dequeue_async(
467 	timer_call_t            entry)
468 {
469 	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
470 	if (old_mpqueue) {
471 		old_mpqueue->count--;
472 
473 		if (old_mpqueue != timer_longterm_queue) {
474 			priority_queue_remove(&old_mpqueue->mpq_pqhead,
475 			    &entry->tc_pqlink);
476 		}
477 
478 		remqueue(&entry->tc_qlink);
479 		entry->tc_async_dequeue = true;
480 	}
481 	return;
482 }
483 
484 #if TIMER_ASSERT
485 unsigned timer_call_enqueue_deadline_unlocked_async1;
486 unsigned timer_call_enqueue_deadline_unlocked_async2;
487 #endif
488 /*
489  * Assumes call_entry and queues unlocked, interrupts disabled.
490  */
491 __inline__ mpqueue_head_t *
492 timer_call_enqueue_deadline_unlocked(
493 	timer_call_t                    call,
494 	mpqueue_head_t                  *queue,
495 	uint64_t                        deadline,
496 	uint64_t                        soft_deadline,
497 	uint64_t                        ttd,
498 	timer_call_param_t              param1,
499 	uint32_t                        callout_flags)
500 {
501 	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
502 
503 	simple_lock(&call->tc_lock, LCK_GRP_NULL);
504 
505 	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);
506 
507 	if (old_queue != NULL) {
508 		timer_queue_lock_spin(old_queue);
509 		if (call->tc_async_dequeue) {
510 			/* collision (1c): timer already dequeued, clear flag */
511 #if TIMER_ASSERT
512 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
513 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
514 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
515 			    call->tc_async_dequeue,
516 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
517 			    0x1c, 0);
518 			timer_call_enqueue_deadline_unlocked_async1++;
519 #endif
520 			call->tc_async_dequeue = false;
521 			call->tc_queue = NULL;
522 		} else if (old_queue != queue) {
523 			timer_call_entry_dequeue(call);
524 #if TIMER_ASSERT
525 			timer_call_enqueue_deadline_unlocked_async2++;
526 #endif
527 		}
528 		if (old_queue == timer_longterm_queue) {
529 			timer_longterm_dequeued_locked(call);
530 		}
531 		if (old_queue != queue) {
532 			timer_queue_unlock(old_queue);
533 			timer_queue_lock_spin(queue);
534 		}
535 	} else {
536 		timer_queue_lock_spin(queue);
537 	}
538 
539 	call->tc_soft_deadline = soft_deadline;
540 	call->tc_flags = callout_flags;
541 	call->tc_param1 = param1;
542 	call->tc_ttd = ttd;
543 
544 	timer_call_entry_enqueue_deadline(call, queue, deadline);
545 	timer_queue_unlock(queue);
546 	simple_unlock(&call->tc_lock);
547 
548 	return old_queue;
549 }
550 
551 #if TIMER_ASSERT
552 unsigned timer_call_dequeue_unlocked_async1;
553 unsigned timer_call_dequeue_unlocked_async2;
554 #endif
555 mpqueue_head_t *
556 timer_call_dequeue_unlocked(
557 	timer_call_t            call)
558 {
559 	DBG("timer_call_dequeue_unlocked(%p)\n", call);
560 
561 	simple_lock(&call->tc_lock, LCK_GRP_NULL);
562 
563 	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);
564 
565 #if TIMER_ASSERT
566 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
567 	    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
568 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
569 	    call->tc_async_dequeue,
570 	    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
571 	    0, 0);
572 #endif
573 	if (old_queue != NULL) {
574 		timer_queue_lock_spin(old_queue);
575 		if (call->tc_async_dequeue) {
576 			/* collision (1c): timer already dequeued, clear flag */
577 #if TIMER_ASSERT
578 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
579 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
580 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
581 			    call->tc_async_dequeue,
582 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
583 			    0x1c, 0);
584 			timer_call_dequeue_unlocked_async1++;
585 #endif
586 			call->tc_async_dequeue = false;
587 			call->tc_queue = NULL;
588 		} else {
589 			timer_call_entry_dequeue(call);
590 		}
591 		if (old_queue == timer_longterm_queue) {
592 			timer_longterm_dequeued_locked(call);
593 		}
594 		timer_queue_unlock(old_queue);
595 	}
596 	simple_unlock(&call->tc_lock);
597 	return old_queue;
598 }
599 
600 uint64_t
601 timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
602 {
603 	uint64_t delta = (ctime - deadline);
604 
605 	past_deadline_timers++;
606 	past_deadline_deltas += delta;
607 	if (delta > past_deadline_longest) {
608 		past_deadline_longest = delta;
609 	}
610 	if (delta < past_deadline_shortest) {
611 		past_deadline_shortest = delta;
612 	}
613 
614 	return ctime + past_deadline_timer_adjustment;
615 }
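/*
 * Worked example: a callout requested with a deadline 50us in the past
 * records a 50us delta in the stats above and is re-targeted to
 * ctime + PAST_DEADLINE_TIMER_ADJUSTMENT_NS (10us, in absolute time), so
 * it still fires, just slightly in the future rather than immediately:
 *
 *	deadline = ctime - 50us   ->   returned deadline = ctime + 10us
 */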
616 
617 /*
618  * Timer call entry locking model
619  * ==============================
620  *
621  * Timer call entries are linked on per-cpu timer queues which are protected
622  * by the queue lock and the call entry lock. The locking protocol is:
623  *
624  *  0) The canonical locking order is timer call entry followed by queue.
625  *
626  *  1) With only the entry lock held, entry.queue is valid:
627  *    1a) NULL: the entry is not queued, or
628  *    1b) non-NULL: this queue must be locked before the entry is modified.
629  *        After locking the queue, the call.async_dequeue flag must be checked:
630  *    1c) TRUE: the entry was removed from the queue by another thread
631  *	        and we must NULL the entry.queue and reset this flag, or
632  *    1d) FALSE: (ie. queued), the entry can be manipulated.
633  *
634  *  2) If a queue lock is obtained first, the queue is stable:
635  *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
636  *	  and dequeued.
637  *    2b) If a try-lock fails, it indicates that another thread is attempting
638  *        to change the entry and move it to a different position in this queue
639  *        or to different queue. The entry can be dequeued but it should not be
640  *        operated upon since it is being changed. Furthermore, we don't null
641  *	  the entry.queue pointer (protected by the entry lock we don't own).
642  *	  Instead, we set the async_dequeue flag -- see (1c).
643  *    2c) Same as 2b but occurring when a longterm timer is matured.
644  *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
645  *     should be manipulated with the appropriate timer queue lock held,
646  *     to prevent queue traversal observations from observing inconsistent
647  *     updates to an in-flight callout.
648  */
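/*
 * Protocol sketch (illustrative condensation of the functions in this file):
 *
 * Entry-lock-first path (case 1):
 *
 *	simple_lock(&call->tc_lock, LCK_GRP_NULL);
 *	if (call->tc_queue != NULL) {                   // case 1b
 *		timer_queue_lock_spin(queue);
 *		if (call->tc_async_dequeue) {           // case 1c
 *			call->tc_async_dequeue = false;
 *			call->tc_queue = NULL;
 *		} else {                                // case 1d
 *			... manipulate the queued entry ...
 *		}
 *		timer_queue_unlock(queue);
 *	}
 *	simple_unlock(&call->tc_lock);
 *
 * Queue-lock-first path (case 2):
 *
 *	timer_queue_lock_spin(queue);
 *	if (simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {    // case 2a
 *		timer_call_entry_dequeue(call);
 *		simple_unlock(&call->tc_lock);
 *	} else {                                                // case 2b
 *		timer_call_entry_dequeue_async(call);
 *	}
 *	timer_queue_unlock(queue);
 */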
649 
650 /*
651  * In the debug case, we assert that the timer call locking protocol
652  * is being obeyed.
653  */
654 
655 static boolean_t
656 timer_call_enter_internal(
657 	timer_call_t            call,
658 	timer_call_param_t      param1,
659 	uint64_t                deadline,
660 	uint64_t                leeway,
661 	uint32_t                flags,
662 	boolean_t               ratelimited)
663 {
664 	mpqueue_head_t          *queue = NULL;
665 	mpqueue_head_t          *old_queue;
666 	spl_t                   s;
667 	uint64_t                slop;
668 	uint32_t                urgency;
669 	uint64_t                sdeadline, ttd;
670 
671 	assert(call->tc_func != NULL);
672 	s = splclock();
673 
674 	sdeadline = deadline;
675 	uint64_t ctime = mach_absolute_time();
676 
677 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
678 	    DECR_TIMER_ENTER | DBG_FUNC_START,
679 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
680 	    VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);
681 
682 	urgency = (flags & TIMER_CALL_URGENCY_MASK);
683 
684 	boolean_t slop_ratelimited = FALSE;
685 	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
686 
687 	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
688 		slop = leeway;
689 	}
690 
691 	if (UINT64_MAX - deadline <= slop) {
692 		deadline = UINT64_MAX;
693 	} else {
694 		deadline += slop;
695 	}
696 
697 	if (__improbable(deadline < ctime)) {
698 		deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
699 		sdeadline = deadline;
700 	}
701 
702 	if (ratelimited || slop_ratelimited) {
703 		flags |= TIMER_CALL_RATELIMITED;
704 	} else {
705 		flags &= ~TIMER_CALL_RATELIMITED;
706 	}
707 
708 	ttd =  sdeadline - ctime;
709 #if CONFIG_DTRACE
710 	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
711 	    timer_call_param_t, call->tc_param0, uint32_t, flags,
712 	    (deadline - sdeadline),
713 	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
714 #endif
715 
716 	/* Program timer callout parameters under the appropriate per-CPU or
717 	 * longterm queue lock. The callout may have been previously enqueued
718 	 * and in-flight on this or another timer queue.
719 	 */
720 	if (!ratelimited && !slop_ratelimited) {
721 		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
722 	}
723 
724 	if (queue == NULL) {
725 		queue = timer_queue_assign(deadline);
726 		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
727 	}
728 
729 #if TIMER_TRACE
730 	call->tc_entry_time = ctime;
731 #endif
732 
733 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
734 	    DECR_TIMER_ENTER | DBG_FUNC_END,
735 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
736 	    (old_queue != NULL), deadline, queue->count, 0);
737 
738 	splx(s);
739 
740 	return old_queue != NULL;
741 }
742 
743 /*
744  * timer_call_*()
745  *	return boolean indicating whether the call was previously queued.
746  */
747 boolean_t
748 timer_call_enter(
749 	timer_call_t            call,
750 	uint64_t                deadline,
751 	uint32_t                flags)
752 {
753 	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
754 }
755 
756 boolean_t
757 timer_call_enter1(
758 	timer_call_t            call,
759 	timer_call_param_t      param1,
760 	uint64_t                deadline,
761 	uint32_t                flags)
762 {
763 	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
764 }
765 
766 boolean_t
767 timer_call_enter_with_leeway(
768 	timer_call_t            call,
769 	timer_call_param_t      param1,
770 	uint64_t                deadline,
771 	uint64_t                leeway,
772 	uint32_t                flags,
773 	boolean_t               ratelimited)
774 {
775 	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
776 }
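/*
 * Usage sketch: arm a coalescable callout that tolerates up to 1ms of
 * leeway. TIMER_CALL_LEEWAY makes timer_call_enter_internal() honor the
 * caller-supplied leeway when it exceeds the computed slop; the return
 * value reports whether the callout was already queued. 'tc', 'deadline'
 * and the urgency flag are assumed for illustration.
 *
 *	uint64_t leeway;
 *	nanoseconds_to_absolutetime(1 * NSEC_PER_MSEC, &leeway);
 *	boolean_t was_queued = timer_call_enter_with_leeway(tc, NULL,
 *	    deadline, leeway, TIMER_CALL_SYS_NORMAL | TIMER_CALL_LEEWAY,
 *	    FALSE);
 */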
777 
778 boolean_t
779 timer_call_cancel(
780 	timer_call_t            call)
781 {
782 	mpqueue_head_t          *old_queue;
783 	spl_t                   s;
784 
785 	s = splclock();
786 
787 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
788 	    DECR_TIMER_CANCEL | DBG_FUNC_START,
789 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
790 	    call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);
791 
792 	old_queue = timer_call_dequeue_unlocked(call);
793 
794 	if (old_queue != NULL) {
795 		timer_queue_lock_spin(old_queue);
796 
797 		timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);
798 
799 		if (new_head) {
800 			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
801 			old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ? new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
802 		} else {
803 			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
804 			old_queue->earliest_soft_deadline = UINT64_MAX;
805 		}
806 
807 		timer_queue_unlock(old_queue);
808 	}
809 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
810 	    DECR_TIMER_CANCEL | DBG_FUNC_END,
811 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
812 	    VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
813 	    call->tc_pqlink.deadline - mach_absolute_time(),
814 	    call->tc_pqlink.deadline - call->tc_entry_time, 0);
815 	splx(s);
816 
817 #if CONFIG_DTRACE
818 	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
819 	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
820 	    (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
821 #endif /* CONFIG_DTRACE */
822 
823 	return old_queue != NULL;
824 }
825 
826 static uint32_t timer_queue_shutdown_lock_skips;
827 static uint32_t timer_queue_shutdown_discarded;
828 
829 void
830 timer_queue_shutdown(
831 	__kdebug_only int target_cpu,
832 	mpqueue_head_t          *queue,
833 	mpqueue_head_t          *new_queue)
834 {
835 	DBG("timer_queue_shutdown(%p)\n", queue);
836 
837 	__kdebug_only int ntimers_moved = 0, lock_skips = 0, shutdown_discarded = 0;
838 
839 	spl_t s = splclock();
840 
841 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
842 	    DECR_TIMER_SHUTDOWN | DBG_FUNC_START,
843 	    target_cpu,
844 	    queue->earliest_soft_deadline, 0,
845 	    0, 0);
846 
847 	while (TRUE) {
848 		timer_queue_lock_spin(queue);
849 
850 		timer_call_t call = qe_queue_first(&queue->head,
851 		    struct timer_call, tc_qlink);
852 
853 		if (call == NULL) {
854 			break;
855 		}
856 
857 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
858 			/*
859 			 * case (2b) lock order inversion, dequeue and skip
860 			 * Don't change the call_entry queue back-pointer
861 			 * but set the async_dequeue field.
862 			 */
863 			lock_skips++;
864 			timer_queue_shutdown_lock_skips++;
865 			timer_call_entry_dequeue_async(call);
866 #if TIMER_ASSERT
867 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
868 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
869 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
870 			    call->tc_async_dequeue,
871 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
872 			    0x2b, 0);
873 #endif
874 			timer_queue_unlock(queue);
875 			continue;
876 		}
877 
878 		boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);
879 
880 		/* remove entry from old queue */
881 		timer_call_entry_dequeue(call);
882 		timer_queue_unlock(queue);
883 
884 		if (call_local == FALSE) {
885 			/* and queue it on new, discarding LOCAL timers */
886 			timer_queue_lock_spin(new_queue);
887 			timer_call_entry_enqueue_deadline(
888 				call, new_queue, call->tc_pqlink.deadline);
889 			timer_queue_unlock(new_queue);
890 			ntimers_moved++;
891 		} else {
892 			shutdown_discarded++;
893 			timer_queue_shutdown_discarded++;
894 		}
895 
896 		assert(call_local == FALSE);
897 		simple_unlock(&call->tc_lock);
898 	}
899 
900 	timer_queue_unlock(queue);
901 
902 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
903 	    DECR_TIMER_SHUTDOWN | DBG_FUNC_END,
904 	    target_cpu, ntimers_moved, lock_skips, shutdown_discarded, 0);
905 
906 	splx(s);
907 }
908 
909 
910 static uint32_t timer_queue_expire_lock_skips;
911 uint64_t
912 timer_queue_expire_with_options(
913 	mpqueue_head_t          *queue,
914 	uint64_t                deadline,
915 	boolean_t               rescan)
916 {
917 	timer_call_t    call = NULL;
918 	uint32_t tc_iterations = 0;
919 	DBG("timer_queue_expire(%p,)\n", queue);
920 
921 	/* 'rescan' means look at every timer in the list, instead of
922 	 * early-exiting when the head of the list expires in the future.
923 	 * when 'rescan' is true, iterate by linked list instead of priority queue.
924 	 *
925 	 * TODO: if we keep a deadline ordered and soft-deadline ordered
926 	 * priority queue, then it's no longer necessary to do that
927 	 */
928 
929 	uint64_t cur_deadline = deadline;
930 
931 	/* Force an early return if this time limit is hit. */
932 	const uint64_t time_limit = deadline + timer_scan_limit_abs;
933 
934 	/* Next deadline if the time limit is hit */
935 	uint64_t time_limit_deadline = 0;
936 
937 	timer_queue_lock_spin(queue);
938 
939 	while (!queue_empty(&queue->head)) {
940 		if (++tc_iterations > 1) {
941 			const uint64_t now = mach_absolute_time();
942 
943 			/*
944 			 * Abort the scan if it's taking too long to avoid long
945 			 * periods with interrupts disabled.
946 			 * Scanning will restart after a short pause
947 			 * (timer_scan_interval_abs) if there's a hard deadline
948 			 * pending.
949 			 */
950 			if (rescan == FALSE && now > time_limit) {
951 				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
952 				    DECR_TIMER_PAUSE | DBG_FUNC_NONE,
953 				    queue->count, tc_iterations - 1,
954 				    0, 0, 0);
955 
956 				counter_inc(&timer_scan_pauses_cnt);
957 				time_limit_deadline = now + timer_scan_interval_abs;
958 				break;
959 			}
960 
961 			/*
962 			 * Upon processing one or more timer calls, refresh the
963 			 * deadline to account for time elapsed in the callout
964 			 */
965 			cur_deadline = now;
966 		}
967 
968 		if (call == NULL) {
969 			if (rescan == FALSE) {
970 				call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
971 			} else {
972 				call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
973 			}
974 		}
975 
976 		if (call->tc_soft_deadline <= cur_deadline) {
977 			timer_call_func_t               func;
978 			timer_call_param_t              param0, param1;
979 
980 			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
981 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
982 			    DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
983 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
984 			    call->tc_soft_deadline,
985 			    call->tc_pqlink.deadline,
986 			    call->tc_entry_time, 0);
987 
988 			if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
989 			    (call->tc_pqlink.deadline > cur_deadline)) {
990 				if (rescan == FALSE) {
991 					break;
992 				}
993 			}
994 
995 			if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
996 				/* case (2b) lock inversion, dequeue and skip */
997 				timer_queue_expire_lock_skips++;
998 				timer_call_entry_dequeue_async(call);
999 				call = NULL;
1000 				continue;
1001 			}
1002 
1003 			timer_call_entry_dequeue(call);
1004 
1005 			func = call->tc_func;
1006 			param0 = call->tc_param0;
1007 			param1 = call->tc_param1;
1008 
1009 			simple_unlock(&call->tc_lock);
1010 			timer_queue_unlock(queue);
1011 
1012 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1013 			    DECR_TIMER_CALLOUT | DBG_FUNC_START,
1014 			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1015 			    VM_KERNEL_ADDRHIDE(param0),
1016 			    VM_KERNEL_ADDRHIDE(param1),
1017 			    0);
1018 
1019 #if CONFIG_DTRACE
1020 			DTRACE_TMR7(callout__start, timer_call_func_t, func,
1021 			    timer_call_param_t, param0, unsigned, call->tc_flags,
1022 			    0, (call->tc_ttd >> 32),
1023 			    (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
1024 #endif
1025 			/* Maintain time-to-deadline in per-processor data
1026 			 * structure for thread wakeup deadline statistics.
1027 			 */
1028 			uint64_t *ttdp = &current_processor()->timer_call_ttd;
1029 			*ttdp = call->tc_ttd;
1030 			(*func)(param0, param1);
1031 			*ttdp = 0;
1032 #if CONFIG_DTRACE
1033 			DTRACE_TMR4(callout__end, timer_call_func_t, func,
1034 			    param0, param1, call);
1035 #endif
1036 
1037 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1038 			    DECR_TIMER_CALLOUT | DBG_FUNC_END,
1039 			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1040 			    VM_KERNEL_ADDRHIDE(param0),
1041 			    VM_KERNEL_ADDRHIDE(param1),
1042 			    0);
1043 			call = NULL;
1044 			timer_queue_lock_spin(queue);
1045 		} else {
1046 			if (__probable(rescan == FALSE)) {
1047 				break;
1048 			} else {
1049 				int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
1050 				assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);
1051 
1052 				/* DRK: On a latency quality-of-service level change,
1053 				 * re-sort potentially rate-limited timers. The platform
1054 				 * layer determines which timers require
1055 				 * this. In the absence of the per-callout
1056 				 * synchronization requirement, a global resort could
1057 				 * be more efficient. The re-sort effectively
1058 				 * annuls all timer adjustments, i.e. the "soft
1059 				 * deadline" is the sort key.
1060 				 */
1061 
1062 				if (timer_resort_threshold(skew)) {
1063 					if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
1064 						/* TODO: don't need to dequeue before enqueue */
1065 						timer_call_entry_dequeue(call);
1066 						timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
1067 						simple_unlock(&call->tc_lock);
1068 						call = NULL;
1069 					}
1070 				}
1071 				if (call) {
1072 					call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);
1073 
1074 					if (call == NULL) {
1075 						break;
1076 					}
1077 				}
1078 			}
1079 		}
1080 	}
1081 
1082 	call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
1083 
1084 	if (call) {
1085 		/*
1086 		 * Even if the time limit has been hit, it doesn't mean a hard
1087 		 * deadline will be missed - the next hard deadline may be in
1088 		 * the future.
1089 		 */
1090 		if (time_limit_deadline > call->tc_pqlink.deadline) {
1091 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1092 			    DECR_TIMER_POSTPONE | DBG_FUNC_NONE,
1093 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1094 			    call->tc_pqlink.deadline,
1095 			    time_limit_deadline,
1096 			    0, 0);
1097 			counter_inc(&timer_scan_postpones_cnt);
1098 			cur_deadline = time_limit_deadline;
1099 		} else {
1100 			cur_deadline = call->tc_pqlink.deadline;
1101 		}
1102 		queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline: call->tc_soft_deadline;
1103 	} else {
1104 		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
1105 	}
1106 
1107 	timer_queue_unlock(queue);
1108 
1109 	return cur_deadline;
1110 }
1111 
1112 uint64_t
1113 timer_queue_expire(
1114 	mpqueue_head_t          *queue,
1115 	uint64_t                deadline)
1116 {
1117 	return timer_queue_expire_with_options(queue, deadline, FALSE);
1118 }
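/*
 * Sketch of the expected calling pattern (the actual callers live in the
 * per-platform timer expiration path, outside this file): the return
 * value is the next deadline the caller should arm the hardware timer
 * for, or UINT64_MAX when the queue is empty. When a scan was paused, the
 * returned deadline is the pause deadline so scanning resumes shortly.
 *
 *	uint64_t next = timer_queue_expire(queue, mach_absolute_time());
 *	if (next != UINT64_MAX) {
 *		... program the decrementer/timer hardware for 'next' ...
 *	}
 */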
1119 
1120 static uint32_t timer_queue_migrate_lock_skips;
1121 /*
1122  * timer_queue_migrate() is called by timer_queue_migrate_cpu()
1123  * to move timer requests from the local processor (queue_from)
1124  * to a target processor's (queue_to).
1125  */
1126 int
1127 timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
1128 {
1129 	timer_call_t    call;
1130 	timer_call_t    head_to;
1131 	int             timers_migrated = 0;
1132 
1133 	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
1134 
1135 	assert(!ml_get_interrupts_enabled());
1136 	assert(queue_from != queue_to);
1137 
1138 	if (serverperfmode) {
1139 		/*
1140 		 * if we're running a high end server
1141 		 * avoid migrations... they add latency
1142 		 * and don't save us power under typical
1143 		 * server workloads
1144 		 */
1145 		return -4;
1146 	}
1147 
1148 	/*
1149 	 * Take both local (from) and target (to) timer queue locks while
1150 	 * moving the timers from the local queue to the target processor.
1151 	 * We assume that the target is always the boot processor.
1152 	 * But only move if all of the following is true:
1153 	 *  - the target queue is non-empty
1154 	 *  - the local queue is non-empty
1155 	 *  - the local queue's first deadline is later than the target's
1156 	 *  - the local queue contains no non-migrateable "local" call
1157 	 * so that we need not have the target resync.
1158 	 */
1159 
1160 	timer_queue_lock_spin(queue_to);
1161 
1162 	head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);
1163 
1164 	if (head_to == NULL) {
1165 		timers_migrated = -1;
1166 		goto abort1;
1167 	}
1168 
1169 	timer_queue_lock_spin(queue_from);
1170 
1171 	call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);
1172 
1173 	if (call == NULL) {
1174 		timers_migrated = -2;
1175 		goto abort2;
1176 	}
1177 
1178 	if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
1179 		timers_migrated = 0;
1180 		goto abort2;
1181 	}
1182 
1183 	/* perform scan for non-migratable timers */
1184 	qe_foreach_element(call, &queue_from->head, tc_qlink) {
1185 		if (call->tc_flags & TIMER_CALL_LOCAL) {
1186 			timers_migrated = -3;
1187 			goto abort2;
1188 		}
1189 	}
1190 
1191 	/* migration loop itself -- both queues are locked */
1192 	qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
1193 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1194 			/* case (2b) lock order inversion, dequeue only */
1195 #ifdef TIMER_ASSERT
1196 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1197 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1198 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1199 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1200 			    0,
1201 			    0x2b, 0);
1202 #endif
1203 			timer_queue_migrate_lock_skips++;
1204 			timer_call_entry_dequeue_async(call);
1205 			continue;
1206 		}
1207 		timer_call_entry_dequeue(call);
1208 		timer_call_entry_enqueue_deadline(
1209 			call, queue_to, call->tc_pqlink.deadline);
1210 		timers_migrated++;
1211 		simple_unlock(&call->tc_lock);
1212 	}
1213 	queue_from->earliest_soft_deadline = UINT64_MAX;
1214 abort2:
1215 	timer_queue_unlock(queue_from);
1216 abort1:
1217 	timer_queue_unlock(queue_to);
1218 
1219 	return timers_migrated;
1220 }
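/*
 * Return value summary (derived from the code above) and a usage sketch
 * for callers deciding whether anything actually moved:
 *
 *	>= 0 : number of timers migrated (0 when the local queue's first
 *	       deadline is earlier than the target's, so nothing moved)
 *	  -1 : target queue empty
 *	  -2 : local queue empty
 *	  -3 : local queue holds a non-migratable TIMER_CALL_LOCAL timer
 *	  -4 : migrations disabled under serverperfmode
 *
 *	int moved = timer_queue_migrate(local_queue, boot_queue);
 *	if (moved > 0) {
 *		... the target queue now owns 'moved' additional timers ...
 *	}
 */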
1221 
1222 void
1223 timer_queue_trace_cpu(int ncpu)
1224 {
1225 	timer_call_nosync_cpu(
1226 		ncpu,
1227 		(void (*)(void *))timer_queue_trace,
1228 		(void*) timer_queue_cpu(ncpu));
1229 }
1230 
1231 void
1232 timer_queue_trace(
1233 	mpqueue_head_t                  *queue)
1234 {
1235 	timer_call_t    call;
1236 	spl_t           s;
1237 
1238 	if (!kdebug_enable) {
1239 		return;
1240 	}
1241 
1242 	s = splclock();
1243 	timer_queue_lock_spin(queue);
1244 
1245 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1246 	    DECR_TIMER_QUEUE | DBG_FUNC_START,
1247 	    queue->count, mach_absolute_time(), 0, 0, 0);
1248 
1249 	qe_foreach_element(call, &queue->head, tc_qlink) {
1250 		TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1251 		    DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1252 		    call->tc_soft_deadline,
1253 		    call->tc_pqlink.deadline,
1254 		    call->tc_entry_time,
1255 		    VM_KERNEL_UNSLIDE(call->tc_func),
1256 		    0);
1257 	}
1258 
1259 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1260 	    DECR_TIMER_QUEUE | DBG_FUNC_END,
1261 	    queue->count, mach_absolute_time(), 0, 0, 0);
1262 
1263 	timer_queue_unlock(queue);
1264 	splx(s);
1265 }
1266 
1267 void
1268 timer_longterm_dequeued_locked(timer_call_t call)
1269 {
1270 	timer_longterm_t        *tlp = &timer_longterm;
1271 
1272 	tlp->dequeues++;
1273 	if (call == tlp->threshold.call) {
1274 		tlp->threshold.call = NULL;
1275 	}
1276 }
1277 
1278 /*
1279  * Place a timer call in the longterm list
1280  * and adjust the next timer callout deadline if the new timer is first.
1281  */
1282 mpqueue_head_t *
1283 timer_longterm_enqueue_unlocked(timer_call_t    call,
1284     uint64_t        now,
1285     uint64_t        deadline,
1286     mpqueue_head_t  **old_queue,
1287     uint64_t        soft_deadline,
1288     uint64_t        ttd,
1289     timer_call_param_t      param1,
1290     uint32_t        callout_flags)
1291 {
1292 	timer_longterm_t        *tlp = &timer_longterm;
1293 	boolean_t               update_required = FALSE;
1294 	uint64_t                longterm_threshold;
1295 
1296 	longterm_threshold = now + tlp->threshold.interval;
1297 
1298 	/*
1299 	 * Return NULL without doing anything if:
1300 	 *  - this timer is local, or
1301 	 *  - the longterm mechanism is disabled, or
1302 	 *  - this deadline is too short.
1303 	 */
1304 	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1305 	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1306 	    (deadline <= longterm_threshold)) {
1307 		return NULL;
1308 	}
1309 
1310 	/*
1311 	 * Remove timer from its current queue, if any.
1312 	 */
1313 	*old_queue = timer_call_dequeue_unlocked(call);
1314 
1315 	/*
1316 	 * Lock the longterm queue, queue timer and determine
1317 	 * whether an update is necessary.
1318 	 */
1319 	assert(!ml_get_interrupts_enabled());
1320 	simple_lock(&call->tc_lock, LCK_GRP_NULL);
1321 	timer_queue_lock_spin(timer_longterm_queue);
1322 	call->tc_pqlink.deadline = deadline;
1323 	call->tc_param1 = param1;
1324 	call->tc_ttd = ttd;
1325 	call->tc_soft_deadline = soft_deadline;
1326 	call->tc_flags = callout_flags;
1327 	timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1328 
1329 	tlp->enqueues++;
1330 
1331 	/*
1332 	 * We'll need to update the currently set threshold timer
1333 	 * if the new deadline is sooner and no sooner update is in flight.
1334 	 */
1335 	if (deadline < tlp->threshold.deadline &&
1336 	    deadline < tlp->threshold.preempted) {
1337 		tlp->threshold.preempted = deadline;
1338 		tlp->threshold.call = call;
1339 		update_required = TRUE;
1340 	}
1341 	timer_queue_unlock(timer_longterm_queue);
1342 	simple_unlock(&call->tc_lock);
1343 
1344 	if (update_required) {
1345 		/*
1346 		 * Note: this call expects that calling the master cpu
1347 		 * alone does not involve locking the topo lock.
1348 		 */
1349 		timer_call_nosync_cpu(
1350 			master_cpu,
1351 			(void (*)(void *))timer_longterm_update,
1352 			(void *)tlp);
1353 	}
1354 
1355 	return timer_longterm_queue;
1356 }
1357 
1358 /*
1359  * Scan for timers below the longterm threshold.
1360  * Move these to the local timer queue (of the boot processor on which the
1361  * calling thread is running).
1362  * Both the local (boot) queue and the longterm queue are locked.
1363  * The scan is similar to the timer migrate sequence but is performed by
1364  * successively examining each timer on the longterm queue:
1365  *  - if within the short-term threshold
1366  *    - enter on the local queue (unless being deleted),
1367  *  - otherwise:
1368  *    - if sooner, deadline becomes the next threshold deadline.
1369  * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
1370  * exceeded, we abort and reschedule again so that we don't shut others from
1371  * the timer queues. Longterm timers firing late is not critical.
1372  */
1373 void
1374 timer_longterm_scan(timer_longterm_t    *tlp,
1375     uint64_t            time_start)
1376 {
1377 	timer_call_t    call;
1378 	uint64_t        threshold = TIMER_LONGTERM_NONE;
1379 	uint64_t        deadline;
1380 	uint64_t        time_limit = time_start + tlp->scan_limit;
1381 	mpqueue_head_t  *timer_master_queue;
1382 
1383 	assert(!ml_get_interrupts_enabled());
1384 	assert(cpu_number() == master_cpu);
1385 
1386 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1387 		threshold = time_start + tlp->threshold.interval;
1388 	}
1389 
1390 	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1391 	tlp->threshold.call = NULL;
1392 
1393 	if (queue_empty(&timer_longterm_queue->head)) {
1394 		return;
1395 	}
1396 
1397 	timer_master_queue = timer_queue_cpu(master_cpu);
1398 	timer_queue_lock_spin(timer_master_queue);
1399 
1400 	qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
1401 		deadline = call->tc_soft_deadline;
1402 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1403 			/* case (2c) lock order inversion, dequeue only */
1404 #ifdef TIMER_ASSERT
1405 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1406 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1407 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1408 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1409 			    0,
1410 			    0x2c, 0);
1411 #endif
1412 			timer_call_entry_dequeue_async(call);
1413 			continue;
1414 		}
1415 		if (deadline < threshold) {
1416 			/*
1417 			 * This timer needs moving (escalating)
1418 			 * to the local (boot) processor's queue.
1419 			 */
1420 #ifdef TIMER_ASSERT
1421 			if (deadline < time_start) {
1422 				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1423 				    DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1424 				    VM_KERNEL_UNSLIDE_OR_PERM(call),
1425 				    deadline,
1426 				    time_start,
1427 				    threshold,
1428 				    0);
1429 			}
1430 #endif
1431 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1432 			    DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1433 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1434 			    call->tc_pqlink.deadline,
1435 			    call->tc_entry_time,
1436 			    VM_KERNEL_UNSLIDE(call->tc_func),
1437 			    0);
1438 			tlp->escalates++;
1439 			timer_call_entry_dequeue(call);
1440 			timer_call_entry_enqueue_deadline(
1441 				call, timer_master_queue, call->tc_pqlink.deadline);
1442 			/*
1443 			 * A side-effect of the following call is to update
1444 			 * the actual hardware deadline if required.
1445 			 */
1446 			(void) timer_queue_assign(deadline);
1447 		} else {
1448 			if (deadline < tlp->threshold.deadline) {
1449 				tlp->threshold.deadline = deadline;
1450 				tlp->threshold.call = call;
1451 			}
1452 		}
1453 		simple_unlock(&call->tc_lock);
1454 
1455 		/* Abort scan if we're taking too long. */
1456 		if (mach_absolute_time() > time_limit) {
1457 			tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
1458 			tlp->scan_pauses++;
1459 			DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
1460 			    time_limit, tlp->queue.count);
1461 			break;
1462 		}
1463 	}
1464 
1465 	timer_queue_unlock(timer_master_queue);
1466 }
1467 
1468 void
1469 timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1470 {
1471 	timer_longterm_t        *tlp = (timer_longterm_t *) p0;
1472 
1473 	timer_longterm_update(tlp);
1474 }
1475 
1476 void
1477 timer_longterm_update_locked(timer_longterm_t *tlp)
1478 {
1479 	uint64_t        latency;
1480 
1481 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1482 	    DECR_TIMER_UPDATE | DBG_FUNC_START,
1483 	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1484 	    tlp->threshold.deadline,
1485 	    tlp->threshold.preempted,
1486 	    tlp->queue.count, 0);
1487 
1488 	tlp->scan_time = mach_absolute_time();
1489 	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1490 		tlp->threshold.preempts++;
1491 		tlp->threshold.deadline = tlp->threshold.preempted;
1492 		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1493 		/*
1494 		 * Note: in the unlikely event that a pre-empted timer has
1495 		 * itself been cancelled, we'll simply re-scan later at the
1496 		 * time of the preempted/cancelled timer.
1497 		 */
1498 	} else {
1499 		tlp->threshold.scans++;
1500 
1501 		/*
1502 		 * Maintain a moving average of our wakeup latency.
1503 		 * Clamp latency to 0 and ignore above threshold interval.
1504 		 */
1505 		if (tlp->scan_time > tlp->threshold.deadline_set) {
1506 			latency = tlp->scan_time - tlp->threshold.deadline_set;
1507 		} else {
1508 			latency = 0;
1509 		}
1510 		if (latency < tlp->threshold.interval) {
1511 			tlp->threshold.latency_min =
1512 			    MIN(tlp->threshold.latency_min, latency);
1513 			tlp->threshold.latency_max =
1514 			    MAX(tlp->threshold.latency_max, latency);
1515 			tlp->threshold.latency =
1516 			    (tlp->threshold.latency * 99 + latency) / 100;
1517 		}
1518 
1519 		timer_longterm_scan(tlp, tlp->scan_time);
1520 	}
1521 
1522 	tlp->threshold.deadline_set = tlp->threshold.deadline;
1523 	/* The next deadline timer to be set is adjusted */
1524 	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
1525 	    tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
1526 		tlp->threshold.deadline_set -= tlp->threshold.margin;
1527 		tlp->threshold.deadline_set -= tlp->threshold.latency;
1528 	}
1529 
1530 	/* Throttle next scan time */
1531 	uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
1532 	if (tlp->threshold.deadline_set < scan_clamp) {
1533 		tlp->threshold.deadline_set = scan_clamp;
1534 	}
1535 
1536 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1537 	    DECR_TIMER_UPDATE | DBG_FUNC_END,
1538 	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1539 	    tlp->threshold.deadline,
1540 	    tlp->threshold.scans,
1541 	    tlp->queue.count, 0);
1542 }
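/*
 * Worked example of the latency average above: with a running average of
 * 1000 abstime units and a new sample of 2000, the estimate becomes
 * (1000 * 99 + 2000) / 100 = 1010, i.e. each scan contributes 1% of its
 * observed wakeup latency to the exponentially weighted moving average.
 */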
1543 
1544 void
1545 timer_longterm_update(timer_longterm_t *tlp)
1546 {
1547 	spl_t   s = splclock();
1548 
1549 	timer_queue_lock_spin(timer_longterm_queue);
1550 
1551 	if (cpu_number() != master_cpu) {
1552 		panic("timer_longterm_update() on non-boot cpu");
1553 	}
1554 
1555 	timer_longterm_update_locked(tlp);
1556 
1557 	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1558 		timer_call_enter(
1559 			&tlp->threshold.timer,
1560 			tlp->threshold.deadline_set,
1561 			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1562 	}
1563 
1564 	timer_queue_unlock(timer_longterm_queue);
1565 	splx(s);
1566 }
1567 
1568 void
1569 timer_longterm_init(void)
1570 {
1571 	uint32_t                longterm;
1572 	timer_longterm_t        *tlp = &timer_longterm;
1573 
1574 	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1575 
1576 	/*
1577 	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1578 	 * or TIMER_LONGTERM_NONE (disabled) in server performance mode;
1579 	 * overridden by the "longterm" boot-arg.
1580 	 */
1581 	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1582 	    : TIMER_LONGTERM_THRESHOLD;
1583 	if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
1584 		tlp->threshold.interval = (longterm == 0) ?
1585 		    TIMER_LONGTERM_NONE :
1586 		    longterm * NSEC_PER_MSEC;
1587 	}
1588 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1589 		printf("Longterm timer threshold: %llu ms\n",
1590 		    tlp->threshold.interval / NSEC_PER_MSEC);
1591 		kprintf("Longterm timer threshold: %llu ms\n",
1592 		    tlp->threshold.interval / NSEC_PER_MSEC);
1593 		nanoseconds_to_absolutetime(tlp->threshold.interval,
1594 		    &tlp->threshold.interval);
1595 		tlp->threshold.margin = tlp->threshold.interval / 10;
1596 		tlp->threshold.latency_min = EndOfAllTime;
1597 		tlp->threshold.latency_max = 0;
1598 	}
1599 
1600 	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1601 	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1602 
1603 	mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);
1604 
1605 	timer_call_setup(&tlp->threshold.timer,
1606 	    timer_longterm_callout, (timer_call_param_t) tlp);
1607 
1608 	timer_longterm_queue = &tlp->queue;
1609 }
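/*
 * Illustrative sketch, not from the original source: the "longterm"
 * boot-arg is read in milliseconds and 0 disables long-term queueing.
 * For example, longterm=5000 would yield interval = 5000 * NSEC_PER_MSEC
 * (5 s), a margin of interval / 10, and a final conversion to mach
 * absolute-time units via nanoseconds_to_absolutetime(). The helper
 * below (hypothetical name) mirrors only the boot-arg interpretation.
 */
static uint64_t
longterm_bootarg_to_ns(uint32_t longterm_ms)
{
	return (longterm_ms == 0) ? TIMER_LONGTERM_NONE
	       : (uint64_t)longterm_ms * NSEC_PER_MSEC;
}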
1610 
1611 enum {
1612 	THRESHOLD, QCOUNT,
1613 	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
1614 	LATENCY, LATENCY_MIN, LATENCY_MAX, LONG_TERM_SCAN_LIMIT,
1615 	LONG_TERM_SCAN_INTERVAL, LONG_TERM_SCAN_PAUSES,
1616 	SCAN_LIMIT, SCAN_INTERVAL, SCAN_PAUSES, SCAN_POSTPONES,
1617 };
1618 uint64_t
1619 timer_sysctl_get(int oid)
1620 {
1621 	timer_longterm_t        *tlp = &timer_longterm;
1622 
1623 	switch (oid) {
1624 	case THRESHOLD:
1625 		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1626 		       0 : tlp->threshold.interval / NSEC_PER_MSEC;
1627 	case QCOUNT:
1628 		return tlp->queue.count;
1629 	case ENQUEUES:
1630 		return tlp->enqueues;
1631 	case DEQUEUES:
1632 		return tlp->dequeues;
1633 	case ESCALATES:
1634 		return tlp->escalates;
1635 	case SCANS:
1636 		return tlp->threshold.scans;
1637 	case PREEMPTS:
1638 		return tlp->threshold.preempts;
1639 	case LATENCY:
1640 		return tlp->threshold.latency;
1641 	case LATENCY_MIN:
1642 		return tlp->threshold.latency_min;
1643 	case LATENCY_MAX:
1644 		return tlp->threshold.latency_max;
1645 	case LONG_TERM_SCAN_LIMIT:
1646 		return tlp->scan_limit;
1647 	case LONG_TERM_SCAN_INTERVAL:
1648 		return tlp->scan_interval;
1649 	case LONG_TERM_SCAN_PAUSES:
1650 		return tlp->scan_pauses;
1651 	case SCAN_LIMIT:
1652 		return timer_scan_limit_us * NSEC_PER_USEC;
1653 	case SCAN_INTERVAL:
1654 		return timer_scan_interval_us * NSEC_PER_USEC;
1655 	case SCAN_PAUSES:
1656 		return counter_load(&timer_scan_pauses_cnt);
1657 	case SCAN_POSTPONES:
1658 		return counter_load(&timer_scan_postpones_cnt);
1659 
1660 	default:
1661 		return 0;
1662 	}
1663 }
1664 
1665 /*
1666  * timer_master_scan() is the inverse of timer_longterm_scan()
1667  * since it un-escalates timers to the longterm queue.
1668  */
1669 static void
1670 timer_master_scan(timer_longterm_t      *tlp,
1671     uint64_t              now)
1672 {
1673 	timer_call_t    call;
1674 	uint64_t        threshold;
1675 	uint64_t        deadline;
1676 	mpqueue_head_t  *timer_master_queue;
1677 
1678 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1679 		threshold = now + tlp->threshold.interval;
1680 	} else {
1681 		threshold = TIMER_LONGTERM_NONE;
1682 	}
1683 
1684 	timer_master_queue = timer_queue_cpu(master_cpu);
1685 	timer_queue_lock_spin(timer_master_queue);
1686 
1687 	qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
1688 		deadline = call->tc_pqlink.deadline;
1689 		if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
1690 			continue;
1691 		}
1692 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1693 			/* case (2c) lock order inversion, dequeue only */
1694 			timer_call_entry_dequeue_async(call);
1695 			continue;
1696 		}
1697 		if (deadline > threshold) {
1698 			/* move from master to longterm */
1699 			timer_call_entry_dequeue(call);
1700 			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1701 			if (deadline < tlp->threshold.deadline) {
1702 				tlp->threshold.deadline = deadline;
1703 				tlp->threshold.call = call;
1704 			}
1705 		}
1706 		simple_unlock(&call->tc_lock);
1707 	}
1708 	timer_queue_unlock(timer_master_queue);
1709 }
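/*
 * Illustrative sketch, hypothetical helper: timer_master_scan() above and
 * timer_longterm_scan() apply the same classification in opposite
 * directions. A timer belongs on the long-term queue only when long-term
 * handling is enabled and its deadline lies beyond now + the threshold
 * interval; otherwise it stays on (or is escalated back to) a local queue.
 */
static inline bool
deadline_is_longterm(uint64_t deadline, uint64_t now, uint64_t interval)
{
	if (interval == TIMER_LONGTERM_NONE) {
		return false;	/* long-term queueing disabled */
	}
	return deadline > now + interval;
}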
1710 
1711 static void
1712 timer_sysctl_set_threshold(void* valp)
1713 {
1714 	uint64_t value =        (uint64_t)valp;
1715 	timer_longterm_t        *tlp = &timer_longterm;
1716 	spl_t                   s = splclock();
1717 	boolean_t               threshold_increase;
1718 
1719 	timer_queue_lock_spin(timer_longterm_queue);
1720 
1721 	timer_call_cancel(&tlp->threshold.timer);
1722 
1723 	/*
1724 	 * Set the new threshold and note whether it's increasing.
1725 	 */
1726 	if (value == 0) {
1727 		tlp->threshold.interval = TIMER_LONGTERM_NONE;
1728 		threshold_increase = TRUE;
1729 		timer_call_cancel(&tlp->threshold.timer);
1730 	} else {
1731 		uint64_t        old_interval = tlp->threshold.interval;
1732 		tlp->threshold.interval = value * NSEC_PER_MSEC;
1733 		nanoseconds_to_absolutetime(tlp->threshold.interval,
1734 		    &tlp->threshold.interval);
1735 		tlp->threshold.margin = tlp->threshold.interval / 10;
1736 		if (old_interval == TIMER_LONGTERM_NONE) {
1737 			threshold_increase = FALSE;
1738 		} else {
1739 			threshold_increase = (tlp->threshold.interval > old_interval);
1740 		}
1741 	}
1742 
1743 	if (threshold_increase /* or removal */) {
1744 		/* Escalate timers from the longterm queue */
1745 		timer_longterm_scan(tlp, mach_absolute_time());
1746 	} else { /* decrease or addition  */
1747 		/*
1748 		 * We scan the local/master queue for timers now longterm.
1749 		 * To be strictly correct, we should scan all processor queues
1750 		 * but timer migration results in most timers gravitating to the
1751 		 * master processor in any case.
1752 		 */
1753 		timer_master_scan(tlp, mach_absolute_time());
1754 	}
1755 
1756 	/* Set new timer accordingly */
1757 	tlp->threshold.deadline_set = tlp->threshold.deadline;
1758 	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1759 		tlp->threshold.deadline_set -= tlp->threshold.margin;
1760 		tlp->threshold.deadline_set -= tlp->threshold.latency;
1761 		timer_call_enter(
1762 			&tlp->threshold.timer,
1763 			tlp->threshold.deadline_set,
1764 			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1765 	}
1766 
1767 	/* Reset stats */
1768 	tlp->enqueues = 0;
1769 	tlp->dequeues = 0;
1770 	tlp->escalates = 0;
1771 	tlp->scan_pauses = 0;
1772 	tlp->threshold.scans = 0;
1773 	tlp->threshold.preempts = 0;
1774 	tlp->threshold.latency = 0;
1775 	tlp->threshold.latency_min = EndOfAllTime;
1776 	tlp->threshold.latency_max = 0;
1777 
1778 	timer_queue_unlock(timer_longterm_queue);
1779 	splx(s);
1780 }
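/*
 * Illustrative sketch, hypothetical helper: the direction test above asks
 * whether the set of "short-term" deadlines grew or shrank. Raising or
 * removing the threshold makes formerly long-term timers short-term, so
 * the long-term queue is scanned and escalated; lowering or introducing
 * it makes some master-queue timers long-term, so the master queue is
 * scanned instead.
 */
static boolean_t
threshold_change_escalates(uint64_t old_interval, uint64_t new_interval)
{
	if (new_interval == TIMER_LONGTERM_NONE) {
		return TRUE;	/* threshold removed: everything is short-term */
	}
	if (old_interval == TIMER_LONGTERM_NONE) {
		return FALSE;	/* threshold introduced: some timers become long-term */
	}
	return new_interval > old_interval;
}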
1781 
1782 int
1783 timer_sysctl_set(__unused int oid, __unused uint64_t value)
1784 {
1785 	if (support_bootcpu_shutdown) {
1786 		return KERN_NOT_SUPPORTED;
1787 	}
1788 
1789 	switch (oid) {
1790 	case THRESHOLD:
1791 		timer_call_cpu(
1792 			master_cpu,
1793 			timer_sysctl_set_threshold,
1794 			(void *) value);
1795 		return KERN_SUCCESS;
1796 	case LONG_TERM_SCAN_LIMIT:
1797 		timer_longterm.scan_limit = value;
1798 		return KERN_SUCCESS;
1799 	case LONG_TERM_SCAN_INTERVAL:
1800 		timer_longterm.scan_interval = value;
1801 		return KERN_SUCCESS;
1802 	case SCAN_LIMIT:
1803 		if (value > MAX_TIMER_SCAN_LIMIT ||
1804 		    value < MIN_TIMER_SCAN_LIMIT) {
1805 			return KERN_INVALID_ARGUMENT;
1806 		}
1807 		timer_scan_limit_us = value / NSEC_PER_USEC;
1808 		nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC,
1809 		    &timer_scan_limit_abs);
1810 		return KERN_SUCCESS;
1811 	case SCAN_INTERVAL:
1812 		if (value > MAX_TIMER_SCAN_INTERVAL ||
1813 		    value < MIN_TIMER_SCAN_INTERVAL) {
1814 			return KERN_INVALID_ARGUMENT;
1815 		}
1816 		timer_scan_interval_us = value / NSEC_PER_USEC;
1817 		nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC,
1818 		    &timer_scan_interval_abs);
1819 		return KERN_SUCCESS;
1820 	default:
1821 		return KERN_INVALID_ARGUMENT;
1822 	}
1823 }
1824 
1825 
1826 /* Select timer coalescing window based on per-task quality-of-service hints */
1827 static boolean_t
1828 tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1829 {
1830 	uint32_t latency_qos;
1831 	boolean_t adjusted = FALSE;
1832 	task_t ctask = get_threadtask(t);
1833 
1834 	if (ctask) {
1835 		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1836 
1837 		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1838 
1839 		if (latency_qos) {
1840 			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1841 			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1842 			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1843 			adjusted = TRUE;
1844 		}
1845 	}
1846 	return adjusted;
1847 }
1848 
1849 
1850 /* Adjust timer deadlines based on priority of the thread and the
1851  * urgency value provided at timeout establishment. With this mechanism,
1852  * timers are no longer necessarily sorted in order of soft deadline
1853  * on a given timer queue, i.e. they may be differentially skewed.
1854  * In the current scheme, this could lead to fewer pending timers
1855  * processed than is technically possible when the HW deadline arrives.
1856  */
1857 static void
1858 timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1859 {
1860 	int16_t tpri = cthread->sched_pri;
1861 	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1862 		bool tg_critical = false;
1863 #if CONFIG_THREAD_GROUPS
1864 		uint32_t tg_flags = thread_group_get_flags(thread_group_get(cthread));
1865 		tg_critical = tg_flags & (THREAD_GROUP_FLAGS_CRITICAL | THREAD_GROUP_FLAGS_STRICT_TIMERS);
1866 #endif /* CONFIG_THREAD_GROUPS */
1867 		bool timer_critical = (tpri >= BASEPRI_RTQUEUES) ||
1868 		    (urgency == TIMER_CALL_USER_CRITICAL) ||
1869 		    tg_critical;
1870 		if (timer_critical) {
1871 			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1872 			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1873 			TCOAL_PRIO_STAT(rt_tcl);
1874 		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
1875 		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
1876 			/* Determine if timer should be subjected to a lower QoS */
1877 			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1878 				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1879 					return;
1880 				} else {
1881 					*pratelimited = FALSE;
1882 				}
1883 			}
1884 			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1885 			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1886 			TCOAL_PRIO_STAT(bg_tcl);
1887 		} else if (tpri >= MINPRI_KERNEL) {
1888 			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1889 			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1890 			TCOAL_PRIO_STAT(kt_tcl);
1891 		} else if (cthread->sched_mode == TH_MODE_FIXED) {
1892 			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1893 			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1894 			TCOAL_PRIO_STAT(fp_tcl);
1895 		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1896 			TCOAL_PRIO_STAT(qos_tcl);
1897 		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1898 			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1899 			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1900 			TCOAL_PRIO_STAT(ts_tcl);
1901 		} else {
1902 			TCOAL_PRIO_STAT(nc_tcl);
1903 		}
1904 	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1905 		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1906 		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1907 		TCOAL_PRIO_STAT(bg_tcl);
1908 	} else {
1909 		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1910 		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1911 		TCOAL_PRIO_STAT(kt_tcl);
1912 	}
1913 }
1914 
1915 
1916 int timer_user_idle_level;
1917 
1918 uint64_t
1919 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1920 {
1921 	int32_t tcs_shift = 0;
1922 	uint64_t tcs_max_abstime = 0;
1923 	uint64_t adjval;
1924 	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1925 
1926 	if (mach_timer_coalescing_enabled &&
1927 	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1928 		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1929 
1930 		if (tcs_shift >= 0) {
1931 			adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1932 		} else {
1933 			adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1934 		}
1935 		/* Apply adjustments derived from "user idle level" heuristic */
1936 		adjval += (adjval * timer_user_idle_level) >> 7;
1937 		return adjval;
1938 	} else {
1939 		return 0;
1940 	}
1941 }
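/*
 * Worked example, illustrative only and with made-up numbers: suppose a
 * timeshare thread arms a user timer 10 ms out and timer_compute_leeway()
 * selects tcs_shift = 3 with a generous tcs_max_abstime. The base slop is
 * (deadline - now) >> 3, i.e. 1.25 ms. If timer_user_idle_level is 64,
 * the heuristic adds (slop * 64) >> 7 = slop / 2, for about 1.875 ms
 * total; at the maximum level of 128 the slop doubles. The stand-alone
 * helper below (hypothetical name) reproduces just that arithmetic,
 * treating abstime as if it were nanoseconds.
 */
static uint64_t
example_coalescing_slop(uint64_t time_to_deadline, int32_t shift,
    uint64_t max_slop, int idle_level)
{
	uint64_t slop = (shift >= 0) ?
	    (time_to_deadline >> shift) :
	    (time_to_deadline << (-shift));
	if (slop > max_slop) {
		slop = max_slop;
	}
	/* idle_level ranges 0..128; 128 doubles the window ((x * 128) >> 7 == x). */
	slop += (slop * (uint64_t)idle_level) >> 7;
	return slop;
}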
1942 
1943 int
1944 timer_get_user_idle_level(void)
1945 {
1946 	return timer_user_idle_level;
1947 }
1948 
1949 kern_return_t
1950 timer_set_user_idle_level(int ilevel)
1951 {
1952 	boolean_t do_reeval = FALSE;
1953 
1954 	if ((ilevel < 0) || (ilevel > 128)) {
1955 		return KERN_INVALID_ARGUMENT;
1956 	}
1957 
1958 	if (ilevel < timer_user_idle_level) {
1959 		do_reeval = TRUE;
1960 	}
1961 
1962 	timer_user_idle_level = ilevel;
1963 
1964 	if (do_reeval) {
1965 		ml_timer_evaluate();
1966 	}
1967 
1968 	return KERN_SUCCESS;
1969 }
1970 
1971 #pragma mark - running timers
1972 
1973 #define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
1974     TIMER_CALL_LOCAL)
1975 
1976 /*
1977  * timer_call_trace_* functions mimic the tracing behavior from the normal
1978  * timer_call subsystem, so tools continue to function.
1979  */
1980 
1981 static void
1982 timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
1983     uint32_t flags, uint64_t now)
1984 {
1985 #pragma unused(call, deadline, flags, now)
1986 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
1987 	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
1988 	    deadline, flags, 0);
1989 #if CONFIG_DTRACE
1990 	uint64_t ttd = deadline - now;
1991 	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
1992 	    timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
1993 	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
1994 #endif /* CONFIG_DTRACE */
1995 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1996 	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1997 }
1998 
1999 static void
2000 timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
2001 {
2002 #pragma unused(call, deadline)
2003 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
2004 	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
2005 }
2006 
2007 static void
2008 timer_call_trace_cancel(struct timer_call *call)
2009 {
2010 #pragma unused(call)
2011 	__unused uint64_t deadline = call->tc_pqlink.deadline;
2012 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
2013 	    VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
2014 	    call->tc_flags, 0);
2015 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
2016 	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
2017 	    deadline - call->tc_entry_time, 0);
2018 #if CONFIG_DTRACE
2019 #if TIMER_TRACE
2020 	uint64_t ttd = deadline - call->tc_entry_time;
2021 #else
2022 	uint64_t ttd = UINT64_MAX;
2023 #endif /* TIMER_TRACE */
2024 	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
2025 	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
2026 	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
2027 #endif /* CONFIG_DTRACE */
2028 }
2029 
2030 static void
2031 timer_call_trace_expire_entry(struct timer_call *call)
2032 {
2033 #pragma unused(call)
2034 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
2035 	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
2036 	    VM_KERNEL_ADDRHIDE(call->tc_param0),
2037 	    VM_KERNEL_ADDRHIDE(call->tc_param1),
2038 	    0);
2039 #if CONFIG_DTRACE
2040 #if TIMER_TRACE
2041 	uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
2042 #else /* TIMER_TRACE */
2043 	uint64_t ttd = UINT64_MAX;
2044 #endif /* TIMER_TRACE */
2045 	DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
2046 	    timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
2047 	    0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
2048 #endif /* CONFIG_DTRACE */
2049 }
2050 
2051 static void
2052 timer_call_trace_expire_return(struct timer_call *call)
2053 {
2054 #pragma unused(call)
2055 #if CONFIG_DTRACE
2056 	DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
2057 	    call->tc_param0, call->tc_param1, NULL);
2058 #endif /* CONFIG_DTRACE */
2059 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
2060 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
2061 	    VM_KERNEL_UNSLIDE(call->tc_func),
2062 	    VM_KERNEL_ADDRHIDE(call->tc_param0),
2063 	    VM_KERNEL_ADDRHIDE(call->tc_param1),
2064 	    0);
2065 }
2066 
2067 /*
2068  * Set a new deadline for a running timer on this processor.
2069  */
2070 void
2071 running_timer_setup(processor_t processor, enum running_timer timer,
2072     void *param, uint64_t deadline, uint64_t now)
2073 {
2074 	assert(timer < RUNNING_TIMER_MAX);
2075 	assert(ml_get_interrupts_enabled() == FALSE);
2076 
2077 	struct timer_call *call = &processor->running_timers[timer];
2078 
2079 	timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
2080 	    now);
2081 
2082 	if (__improbable(deadline < now)) {
2083 		deadline = timer_call_past_deadline_timer_handle(deadline, now);
2084 	}
2085 
2086 	call->tc_pqlink.deadline = deadline;
2087 #if TIMER_TRACE
2088 	call->tc_entry_time = now;
2089 #endif /* TIMER_TRACE */
2090 	call->tc_param1 = param;
2091 
2092 	timer_call_trace_enter_after(call, deadline);
2093 }
2094 
2095 void
2096 running_timers_sync(void)
2097 {
2098 	timer_resync_deadlines();
2099 }
2100 
2101 void
2102 running_timer_enter(processor_t processor, unsigned int timer,
2103     void *param, uint64_t deadline, uint64_t now)
2104 {
2105 	running_timer_setup(processor, timer, param, deadline, now);
2106 	running_timers_sync();
2107 }
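/*
 * Illustrative usage sketch, hypothetical caller not part of this file: a
 * scheduler path arms a per-processor running timer, the timer-interrupt
 * path later fires it, and a context switch disarms it again.
 * RUNNING_TIMER_QUANTUM is assumed here to be one of the enum
 * running_timer values from processor.h, and interrupts are assumed to be
 * disabled around each call.
 */
static void
running_timer_usage_sketch(processor_t processor, uint64_t quantum_abs)
{
	uint64_t now = mach_absolute_time();

	/* Arm the timer and resync the hardware decrementer deadline. */
	running_timer_enter(processor, RUNNING_TIMER_QUANTUM,
	    NULL, now + quantum_abs, now);

	/* ...later, from the timer interrupt path... */
	(void)running_timers_expire(processor, mach_absolute_time());

	/* ...and on context switch, disarm and resync again. */
	running_timer_cancel(processor, RUNNING_TIMER_QUANTUM);
}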
2108 
2109 /*
2110  * Call the callback for any running timers that fired for this processor.
2111  * Returns true if any timers were past their deadline.
2112  */
2113 bool
2114 running_timers_expire(processor_t processor, uint64_t now)
2115 {
2116 	bool expired = false;
2117 
2118 	if (!processor->running_timers_active) {
2119 		return expired;
2120 	}
2121 
2122 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2123 		struct timer_call *call = &processor->running_timers[i];
2124 
2125 		uint64_t deadline = call->tc_pqlink.deadline;
2126 		if (deadline > now) {
2127 			continue;
2128 		}
2129 
2130 		expired = true;
2131 		timer_call_trace_expire_entry(call);
2132 		call->tc_func(call->tc_param0, call->tc_param1);
2133 		timer_call_trace_expire_return(call);
2134 	}
2135 
2136 	return expired;
2137 }
2138 
2139 void
2140 running_timer_clear(processor_t processor, enum running_timer timer)
2141 {
2142 	struct timer_call *call = &processor->running_timers[timer];
2143 	uint64_t deadline = call->tc_pqlink.deadline;
2144 	if (deadline == EndOfAllTime) {
2145 		return;
2146 	}
2147 
2148 	call->tc_pqlink.deadline = EndOfAllTime;
2149 #if TIMER_TRACE
2150 	call->tc_entry_time = 0;
2151 #endif /* TIMER_TRACE */
2152 	timer_call_trace_cancel(call);
2153 }
2154 
2155 void
2156 running_timer_cancel(processor_t processor, unsigned int timer)
2157 {
2158 	running_timer_clear(processor, timer);
2159 	running_timers_sync();
2160 }
2161 
2162 uint64_t
2163 running_timers_deadline(processor_t processor)
2164 {
2165 	if (!processor->running_timers_active) {
2166 		return EndOfAllTime;
2167 	}
2168 
2169 	uint64_t deadline = EndOfAllTime;
2170 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2171 		uint64_t candidate =
2172 		    processor->running_timers[i].tc_pqlink.deadline;
2173 		if (candidate != 0 && candidate < deadline) {
2174 			deadline = candidate;
2175 		}
2176 	}
2177 
2178 	return deadline;
2179 }
2180 
2181 void
2182 running_timers_activate(processor_t processor)
2183 {
2184 	processor->running_timers_active = true;
2185 	running_timers_sync();
2186 }
2187 
2188 void
2189 running_timers_deactivate(processor_t processor)
2190 {
2191 	assert(processor->running_timers_active == true);
2192 	processor->running_timers_active = false;
2193 	running_timers_sync();
2194 }
2195