xref: /f-stack/lib/ff_kern_timeout.c (revision 2bfe3f2e)
1 /*-
2  * Copyright (c) 1982, 1986, 1991, 1993
3  *  The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Copyright (c) 2010 Kip Macy. All rights reserved.
11  * Copyright (c) 2013 Patrick Kelsey. All rights reserved.
12  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
13  * All rights reserved.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *  From: @(#)kern_clock.c  8.5 (Berkeley) 1/21/94
40  *
41  * Derived in part from libplebnet's pn_kern_timeout.c and libuinet's uinet_timecounter.c.
42  *
43  */
44 
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include "opt_callout_profiling.h"
49 #include "opt_ddb.h"
50 #if defined(__arm__)
51 #include "opt_timer.h"
52 #endif
53 #include "opt_rss.h"
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/bus.h>
58 #include <sys/callout.h>
59 #include <sys/file.h>
60 #include <sys/interrupt.h>
61 #include <sys/kernel.h>
62 #include <sys/ktr.h>
63 #include <sys/lock.h>
64 #include <sys/malloc.h>
65 #include <sys/mutex.h>
66 #include <sys/proc.h>
67 #include <sys/sdt.h>
68 #include <sys/sleepqueue.h>
69 #include <sys/sysctl.h>
70 #include <sys/smp.h>
71 #include <sys/timetc.h>
72 
73 SDT_PROVIDER_DEFINE(callout_execute);
74 SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *");
75 SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *");
76 
77 #ifdef CALLOUT_PROFILING
78 static int avg_depth;
79 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
80     "Average number of items examined per softclock call. Units = 1/1000");
81 static int avg_gcalls;
82 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
83     "Average number of Giant callouts made per softclock call. Units = 1/1000");
84 static int avg_lockcalls;
85 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
86     "Average number of lock callouts made per softclock call. Units = 1/1000");
87 static int avg_mpcalls;
88 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
89     "Average number of MP callouts made per softclock call. Units = 1/1000");
90 #endif
91 
92 static int ncallout;
93 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0,
94     "Number of entries in callwheel and size of timeout() preallocation");
95 
96 #ifdef RSS
97 static int pin_default_swi = 1;
98 static int pin_pcpu_swi = 1;
99 #else
100 static int pin_default_swi = 0;
101 static int pin_pcpu_swi = 0;
102 #endif
103 
104 SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi,
105     0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
106 SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi,
107     0, "Pin the per-CPU swis (except PCPU 0, which is also default");
108 
109 #define sleepq_lock(w) do {} while(0)
110 #define sleepq_release(w) do {} while(0)
111 #define sleepq_add(a, b, c, d, e) do {} while(0)
112 #define sleepq_wait(w, p) do {} while(0)
113 
114 #define    CC_HASH_SHIFT    8
115 
116 /*
117  * TODO:
118  *    allocate more timeout table slots when table overflows.
119  */
120 u_int callwheelsize, callwheelmask;
121 
122 /*
123  * The callout cpu exec entities represent the information necessary to
124  * describe the state of the callouts currently running on the CPU and the
125  * state needed to migrate callouts to a new callout cpu. In particular,
126  * the first entry of the cc_exec_entity array holds information for a
127  * callout running in SWI thread context, while the second one holds
128  * information for a callout running directly from hardware interrupt
129  * context. The cached information is important for deferring migration
130  * when the migrating callout is already running.
131  */
132 struct cc_exec {
133     struct callout *cc_curr;
134     void (*cc_drain)(void *);
135     bool cc_cancel;
136     bool cc_waiting;
137 };
138 
139 /*
140  * There is one struct callout_cpu per cpu, holding all relevant
141  * state for the callout processing thread on the individual CPU.
142  */
143 struct callout_cpu {
144     struct mtx_padalign cc_lock;
145     struct cc_exec cc_exec_entity[2];
146     struct callout *cc_next;
147     struct callout *cc_callout;
148     struct callout_list *cc_callwheel;
149     struct callout_tailq cc_expireq;
150     struct callout_slist cc_callfree;
151     int cc_softticks;
152     void *cc_cookie;
153     u_int cc_bucket;
154     u_int cc_inited;
155     char cc_ktr_event_name[20];
156 };
157 
158 #define callout_migrating(c)    ((c)->c_iflags & CALLOUT_DFRMIGRATION)
159 
160 #define cc_exec_curr(cc, dir)        cc->cc_exec_entity[dir].cc_curr
161 #define cc_exec_drain(cc, dir)       cc->cc_exec_entity[dir].cc_drain
162 #define cc_exec_next(cc)             cc->cc_next
163 #define cc_exec_cancel(cc, dir)      cc->cc_exec_entity[dir].cc_cancel
164 #define cc_exec_waiting(cc, dir)     cc->cc_exec_entity[dir].cc_waiting
165 struct callout_cpu cc_cpu;
166 #define CC_CPU(cpu)    &cc_cpu
167 #define CC_SELF()      &cc_cpu
168 #define CC_LOCK(cc)           mtx_lock_spin(&(cc)->cc_lock)
169 #define CC_UNLOCK(cc)         mtx_unlock_spin(&(cc)->cc_lock)
170 #define CC_LOCK_ASSERT(cc)    mtx_assert(&(cc)->cc_lock, MA_OWNED)
171 
172 static int timeout_cpu;
173 
174 static void callout_cpu_init(struct callout_cpu *cc, int cpu);
175 static void softclock_call_cc(struct callout *c, struct callout_cpu *cc,
176 #ifdef CALLOUT_PROFILING
177     int *mpcalls, int *lockcalls, int *gcalls,
178 #endif
179     int direct);
180 
181 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
182 
183 /**
184  * Locked by cc_lock:
185  *   cc_curr         - If a callout is in progress, it is cc_curr.
186  *                     If cc_curr is non-NULL, threads waiting in
187  *                     callout_drain() will be woken up as soon as the
188  *                     relevant callout completes.
189  *   cc_cancel       - Changing to 1 with both callout_lock and cc_lock held
190  *                     guarantees that the current callout will not run.
191  *                     The softclock() function sets this to 0 before it
192  *                     drops callout_lock to acquire c_lock, and it calls
193  *                     the handler only if cc_cancel is still false after
194  *                     cc_lock is successfully acquired.
195  *   cc_waiting      - If a thread is waiting in callout_drain(), then
196  *                     cc_waiting is true.  Set only when
197  *                     cc_curr is non-NULL.
198  */
199 
200 /*
201  * Resets the execution entity tied to a specific callout cpu.
202  */
203 static void
204 cc_cce_cleanup(struct callout_cpu *cc, int direct)
205 {
206     cc_exec_curr(cc, direct) = NULL;
207     cc_exec_cancel(cc, direct) = false;
208     cc_exec_waiting(cc, direct) = false;
209 }
210 
211 /*
212  * Checks if migration is requested by a specific callout cpu.
213  */
214 static int
215 cc_cce_migrating(struct callout_cpu *cc, int direct)
216 {
217     return (0);
218 }
219 
220 /*
221  * Kernel low level callwheel initialization
222  * called on cpu0 during kernel startup.
223  */
224 static void
225 callout_callwheel_init(void *dummy)
226 {
227     struct callout_cpu *cc;
228 
229     /*
230      * Calculate the size of the callout wheel and the preallocated
231      * timeout() structures.
232      * XXX: Clip callout to result of previous function of maxusers
233      * maximum 384.  This is still huge, but acceptable.
234      */
235     memset(CC_CPU(0), 0, sizeof(cc_cpu));
236     ncallout = imin(16 + maxproc + maxfiles, 18508);
237     TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
238 
239     /*
240      * Calculate callout wheel size, should be next power of two higher
241      * than 'ncallout'.
242      */
243     callwheelsize = 1 << fls(ncallout);
244     callwheelmask = callwheelsize - 1;
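    /*
     * Worked example (illustrative numbers only): if ncallout is clipped
     * to 18508, fls(18508) == 15, so callwheelsize becomes 1 << 15 == 32768
     * buckets and callwheelmask == 0x7fff.  Any smaller ncallout likewise
     * yields the next power of two above it.
     */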
245 
246     /*
247      * Fetch whether we're pinning the swi's or not.
248      */
249     TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi);
250     TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi);
251 
252     /*
253      * Only cpu0 handles timeout(9) and receives a preallocation.
254      *
255      * XXX: Once all timeout(9) consumers are converted this can
256      * be removed.
257      */
258     timeout_cpu = PCPU_GET(cpuid);
259     cc = CC_CPU(timeout_cpu);
260     cc->cc_callout = malloc(ncallout * sizeof(struct callout),
261         M_CALLOUT, M_WAITOK);
262     callout_cpu_init(cc, timeout_cpu);
263 }
264 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
265 
266 /*
267  * Initialize the per-cpu callout structures.
268  */
269 static void
270 callout_cpu_init(struct callout_cpu *cc, int cpu)
271 {
272     struct callout *c;
273     int i;
274 
275     mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
276     SLIST_INIT(&cc->cc_callfree);
277     cc->cc_inited = 1;
278     cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
279         M_CALLOUT, M_WAITOK);
280     for (i = 0; i < callwheelsize; i++)
281         LIST_INIT(&cc->cc_callwheel[i]);
282     TAILQ_INIT(&cc->cc_expireq);
283     for (i = 0; i < 2; i++)
284         cc_cce_cleanup(cc, i);
285     snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
286         "callwheel cpu %d", cpu);
287     if (cc->cc_callout == NULL)    /* Only cpu0 handles timeout(9) */
288         return;
289     for (i = 0; i < ncallout; i++) {
290         c = &cc->cc_callout[i];
291         callout_init(c, 0);
292         c->c_iflags = CALLOUT_LOCAL_ALLOC;
293         SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
294     }
295 }
296 
297 static inline u_int
298 callout_get_bucket(int to_ticks)
299 {
300     return (to_ticks & callwheelmask);
301 }
302 
303 void
304 callout_tick(void)
305 {
306     struct callout_cpu *cc;
307     int need_softclock;
308     int bucket;
309 
310     /*
311      * Process callouts at a very low cpu priority, so we don't keep the
312      * relatively high clock interrupt priority any longer than necessary.
313      */
314     need_softclock = 0;
315     cc = CC_SELF();
316     mtx_lock(&cc->cc_lock);
317     for (; (cc->cc_softticks - ticks) < 0; cc->cc_softticks++) {
318         bucket = cc->cc_softticks & callwheelmask;
319         if (!LIST_EMPTY(&cc->cc_callwheel[bucket])) {
320             need_softclock = 1;
321             break;
322         }
323     }
324     mtx_unlock(&cc->cc_lock);
325     /*
326      * softclock() acquires cc_lock itself, so we don't want to call it
327      * with cc_lock still held.
328      */
329     if (need_softclock)
330         softclock(cc);
331 }
332 
333 static struct callout_cpu *
334 callout_lock(struct callout *c)
335 {
336     struct callout_cpu *cc;
337     int cpu;
338 
339     for (;;) {
340         cpu = c->c_cpu;
341         cc = CC_CPU(cpu);
342         CC_LOCK(cc);
343         if (cpu == c->c_cpu)
344             break;
345         CC_UNLOCK(cc);
346     }
347     return (cc);
348 }
349 
350 static void
351 callout_cc_add(struct callout *c, struct callout_cpu *cc,
352     int to_ticks, void (*func)(void *), void *arg, int cpu, int flags)
353 {
354     int bucket;
355 
356     CC_LOCK_ASSERT(cc);
357 
358     c->c_arg = arg;
359     c->c_iflags |= CALLOUT_PENDING;
360     c->c_iflags &= ~CALLOUT_PROCESSED;
361     c->c_flags |= CALLOUT_ACTIVE;
362     if (flags & C_DIRECT_EXEC)
363         c->c_iflags |= CALLOUT_DIRECT;
364     c->c_func = func;
365     c->c_time = ticks + to_ticks;
366     bucket = callout_get_bucket(c->c_time);
367     LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
368     if (cc->cc_bucket == bucket)
369         cc_exec_next(cc) = c;
370 }
371 
372 static void
373 callout_cc_del(struct callout *c, struct callout_cpu *cc)
374 {
375 
376     if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0)
377         return;
378     c->c_func = NULL;
379     SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
380 }
381 
382 static void
383 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
384 #ifdef CALLOUT_PROFILING
385     int *mpcalls, int *lockcalls, int *gcalls,
386 #endif
387     int direct)
388 {
389     struct rm_priotracker tracker;
390     void (*c_func)(void *);
391     void *c_arg;
392     struct lock_class *class;
393     struct lock_object *c_lock;
394     uintptr_t lock_status;
395     int c_iflags;
396 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
397     sbintime_t sbt1, sbt2;
398     struct timespec ts2;
399     static sbintime_t maxdt = 2 * SBT_1MS;    /* 2 msec */
400     static timeout_t *lastfunc;
401 #endif
402 
403     KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING,
404         ("softclock_call_cc: pend %p %x", c, c->c_iflags));
405     KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE,
406         ("softclock_call_cc: act %p %x", c, c->c_flags));
407     class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
408     lock_status = 0;
409     if (c->c_flags & CALLOUT_SHAREDLOCK) {
410         if (class == &lock_class_rm)
411             lock_status = (uintptr_t)&tracker;
412         else
413             lock_status = 1;
414     }
415     c_lock = c->c_lock;
416     c_func = c->c_func;
417     c_arg = c->c_arg;
418     c_iflags = c->c_iflags;
419     if (c->c_iflags & CALLOUT_LOCAL_ALLOC)
420         c->c_iflags = CALLOUT_LOCAL_ALLOC;
421     else
422         c->c_iflags &= ~CALLOUT_PENDING;
423 
424     cc_exec_curr(cc, direct) = c;
425     cc_exec_cancel(cc, direct) = false;
426     cc_exec_drain(cc, direct) = NULL;
427     CC_UNLOCK(cc);
428     if (c_lock != NULL) {
429         class->lc_lock(c_lock, lock_status);
430         /*
431          * The callout may have been cancelled
432          * while we switched locks.
433          */
434         if (cc_exec_cancel(cc, direct)) {
435             class->lc_unlock(c_lock);
436             goto skip;
437         }
438         /* The callout cannot be stopped now. */
439         cc_exec_cancel(cc, direct) = true;
440         if (c_lock == &Giant.lock_object) {
441 #ifdef CALLOUT_PROFILING
442             (*gcalls)++;
443 #endif
444             CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
445                 c, c_func, c_arg);
446         } else {
447 #ifdef CALLOUT_PROFILING
448             (*lockcalls)++;
449 #endif
450             CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
451                 c, c_func, c_arg);
452         }
453     } else {
454 #ifdef CALLOUT_PROFILING
455         (*mpcalls)++;
456 #endif
457         CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
458             c, c_func, c_arg);
459     }
460     KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
461         "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
462 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
463     sbt1 = sbinuptime();
464 #endif
465     THREAD_NO_SLEEPING();
466     SDT_PROBE1(callout_execute, , , callout__start, c);
467     c_func(c_arg);
468     SDT_PROBE1(callout_execute, , , callout__end, c);
469     THREAD_SLEEPING_OK();
470 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
471     sbt2 = sbinuptime();
472     sbt2 -= sbt1;
473     if (sbt2 > maxdt) {
474         if (lastfunc != c_func || sbt2 > maxdt * 2) {
475             ts2 = sbttots(sbt2);
476             printf(
477         "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
478                 c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
479         }
480         maxdt = sbt2;
481         lastfunc = c_func;
482     }
483 #endif
484     KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
485     CTR1(KTR_CALLOUT, "callout %p finished", c);
486     if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0)
487         class->lc_unlock(c_lock);
488 skip:
489     CC_LOCK(cc);
490     KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr"));
491     cc_exec_curr(cc, direct) = NULL;
492     if (cc_exec_drain(cc, direct)) {
493         void (*drain)(void *);
494 
495         drain = cc_exec_drain(cc, direct);
496         cc_exec_drain(cc, direct) = NULL;
497         CC_UNLOCK(cc);
498         drain(c_arg);
499         CC_LOCK(cc);
500     }
501     if (cc_exec_waiting(cc, direct)) {
502         /*
503          * There is someone waiting for the
504          * callout to complete.
505          * If the callout was scheduled for
506          * migration just cancel it.
507          */
508         if (cc_cce_migrating(cc, direct)) {
509             cc_cce_cleanup(cc, direct);
510 
511             /*
512              * It should be asserted here that the callout is not
513              * destroyed, but that is not easy.
514              */
515             c->c_iflags &= ~CALLOUT_DFRMIGRATION;
516         }
517         cc_exec_waiting(cc, direct) = false;
518         CC_UNLOCK(cc);
519         wakeup(&cc_exec_waiting(cc, direct));
520         CC_LOCK(cc);
521     } else if (cc_cce_migrating(cc, direct)) {
522         KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0,
523             ("Migrating legacy callout %p", c));
524         panic("migration should not happen");
525     }
526     /*
527      * If the current callout is locally allocated (from
528      * timeout(9)) then put it on the freelist.
529      *
530      * Note: we need to check the cached copy of c_iflags because
531      * if it was not local, then it's not safe to deref the
532      * callout pointer.
533      */
534     KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 ||
535         c->c_iflags == CALLOUT_LOCAL_ALLOC,
536         ("corrupted callout"));
537     if (c_iflags & CALLOUT_LOCAL_ALLOC)
538         callout_cc_del(c, cc);
539 }
540 
541 /*
542  * The callout mechanism is based on the work of Adam M. Costello and
543  * George Varghese, published in a technical report entitled "Redesigning
544  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
545  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
546  * used in this implementation was published by G. Varghese and T. Lauck in
547  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
548  * the Efficient Implementation of a Timer Facility" in the Proceedings of
549  * the 11th ACM Annual Symposium on Operating Systems Principles,
550  * Austin, Texas Nov 1987.
551  */
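
/*
 * Illustrative sketch of the wheel indexing used here (not part of the
 * original file): a callout armed at ticks == 1000 with to_ticks == 250
 * gets c_time == 1250 and is linked into bucket callout_get_bucket(1250),
 * i.e. 1250 & callwheelmask.  softclock() walks one bucket per software
 * tick; because several c_time values can hash to the same bucket when
 * to_ticks exceeds the wheel size, an entry fires only when its c_time
 * matches the tick currently being processed.
 */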
552 
553 /*
554  * Software (low priority) clock interrupt.
555  * Run periodic events from timeout queue.
556  */
557 void
558 softclock(void *arg)
559 {
560     struct callout *c;
561     struct callout_cpu *cc;
562     struct callout_list *sc;
563     int curticks;
564 #ifdef CALLOUT_PROFILING
565     int depth = 0, gcalls = 0, mpcalls = 0, lockcalls = 0;
566 #endif
567 
568     cc = (struct callout_cpu *)arg;
569     CC_LOCK(cc);
570 
571     while (cc->cc_softticks != ticks) {
572         /*
573          * cc_softticks may be modified by hard clock, so cache
574          * it while we work on a given bucket.
575          */
576         curticks = cc->cc_softticks;
577         cc->cc_softticks++;
578         sc = &cc->cc_callwheel[curticks & callwheelmask];
579         c = LIST_FIRST(sc);
580         while (c) {
581 #ifdef CALLOUT_PROFILING
582             depth++;
583 #endif
584             if (c->c_time != curticks) {
585                 c = LIST_NEXT(c, c_links.le);
586             } else {
587                 cc_exec_next(cc) =
588                     LIST_NEXT(c, c_links.le);
589                 cc->cc_bucket = callout_get_bucket(curticks);
590                 LIST_REMOVE(c, c_links.le);
591                 softclock_call_cc(c, cc,
592 #ifdef CALLOUT_PROFILING
593                     &mpcalls, &lockcalls, &gcalls,
594 #endif
595                     1);
596                 c = cc_exec_next(cc);
597                 cc_exec_next(cc) = NULL;
598             }
599         }
600     }
601 
602 #ifdef CALLOUT_PROFILING
603     avg_depth += (depth * 1000 - avg_depth) >> 8;
604     avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
605     avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
606     avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
607 #endif
608     CC_UNLOCK(cc);
609 }
610 
611 /*
612  * timeout --
613  *    Execute a function after a specified length of time.
614  *
615  * untimeout --
616  *    Cancel previous timeout function call.
617  *
618  * callout_handle_init --
619  *    Initialize a handle so that using it with untimeout is benign.
620  *
621  *    See AT&T BCI Driver Reference Manual for specification.  This
622  *    implementation differs from that one in that although an
623  *    identification value is returned from timeout, the original
624  *    arguments to timeout as well as the identifier are used to
625  *    identify entries for untimeout.
626  */
627 struct callout_handle
628 timeout(timeout_t *ftn, void *arg, int to_ticks)
629 {
630     struct callout_cpu *cc;
631     struct callout *new;
632     struct callout_handle handle;
633 
634     cc = CC_CPU(timeout_cpu);
635     CC_LOCK(cc);
636     /* Fill in the next free callout structure. */
637     new = SLIST_FIRST(&cc->cc_callfree);
638     if (new == NULL)
639         /* XXX Attempt to malloc first */
640         panic("timeout table full");
641     SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
642     callout_reset(new, to_ticks, ftn, arg);
643     handle.callout = new;
644     CC_UNLOCK(cc);
645 
646     return (handle);
647 }
648 
649 void
650 untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
651 {
652     struct callout_cpu *cc;
653 
654     /*
655      * Check for a handle that was initialized
656      * by callout_handle_init, but never used
657      * for a real timeout.
658      */
659     if (handle.callout == NULL)
660         return;
661 
662     cc = callout_lock(handle.callout);
663     if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
664         callout_stop(handle.callout);
665     CC_UNLOCK(cc);
666 }
667 
668 void
669 callout_handle_init(struct callout_handle *handle)
670 {
671     handle->callout = NULL;
672 }
673 
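/*
 * Usage sketch for the legacy timeout(9) interface above (illustrative
 * only; the FF_TIMEOUT_EXAMPLE guard and the example names are
 * hypothetical and not part of f-stack or FreeBSD).
 */
#ifdef FF_TIMEOUT_EXAMPLE
static struct callout_handle example_handle;

static void
example_expire(void *arg)
{
    /* Runs once, hz ticks (about one second) after being scheduled. */
    (void)arg;
}

static void
example_start(void)
{
    callout_handle_init(&example_handle);    /* benign before first use */
    example_handle = timeout(example_expire, NULL, hz);
}

static void
example_cancel(void)
{
    /* Both the function and argument must match the original timeout(). */
    untimeout(example_expire, NULL, example_handle);
}
#endif /* FF_TIMEOUT_EXAMPLE */
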
674 /*
675  * New interface; clients allocate their own callout structures.
676  *
677  * callout_reset() - establish or change a timeout
678  * callout_stop() - disestablish a timeout
679  * callout_init() - initialize a callout structure so that it can
680  *    safely be passed to callout_reset() and callout_stop()
681  *
682  * <sys/callout.h> defines three convenience macros:
683  *
684  * callout_active() - returns truth if callout has not been stopped,
685  *    drained, or deactivated since the last time the callout was
686  *    reset.
687  * callout_pending() - returns truth if callout is still waiting for timeout
688  * callout_deactivate() - marks the callout as having been serviced
689  */
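
/*
 * Usage sketch for the callout(9) interface described above (illustrative
 * only; the FF_CALLOUT_EXAMPLE guard and the example softc are
 * hypothetical).  The callout is tied to a mutex via callout_init_mtx()
 * so that a thread holding the same lock cannot race with the handler.
 */
#ifdef FF_CALLOUT_EXAMPLE
struct example_softc {
    struct mtx     ex_mtx;
    struct callout ex_callout;
    int            ex_counter;
};

static void
example_tick_handler(void *arg)
{
    struct example_softc *sc = arg;

    /*
     * Called with ex_mtx held because of callout_init_mtx().  A one-shot
     * handler would typically call callout_deactivate() here instead of
     * re-arming itself.
     */
    sc->ex_counter++;
    callout_reset(&sc->ex_callout, hz, example_tick_handler, sc);
}

static void
example_attach(struct example_softc *sc)
{
    mtx_init(&sc->ex_mtx, "example", NULL, MTX_DEF);
    callout_init_mtx(&sc->ex_callout, &sc->ex_mtx, 0);
    mtx_lock(&sc->ex_mtx);
    callout_reset(&sc->ex_callout, hz, example_tick_handler, sc);
    mtx_unlock(&sc->ex_mtx);
}

static void
example_detach(struct example_softc *sc)
{
    /* callout_drain() may sleep and must not be called with ex_mtx held. */
    callout_drain(&sc->ex_callout);
    mtx_destroy(&sc->ex_mtx);
}
#endif /* FF_CALLOUT_EXAMPLE */
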
690 int
691 callout_reset_tick_on(struct callout *c, int to_ticks,
692     void (*ftn)(void *), void *arg, int cpu, int flags)
693 {
694     struct callout_cpu *cc;
695     int cancelled, direct;
696     int ignore_cpu=0;
697 
698     cancelled = 0;
699     if (cpu == -1) {
700         ignore_cpu = 1;
701     } else if ((cpu >= MAXCPU) ||
702            ((CC_CPU(cpu))->cc_inited == 0)) {
703         /* Invalid CPU spec */
704         panic("Invalid CPU in callout %d", cpu);
705     }
706 
707     /*
708      * This flag used to be added by callout_cc_add, but the
709      * first time you call this we could end up with the
710      * wrong direct flag if we don't do it before we add.
711      */
712     if (flags & C_DIRECT_EXEC) {
713         direct = 1;
714     } else {
715         direct = 0;
716     }
717     KASSERT(!direct || c->c_lock == NULL,
718         ("%s: direct callout %p has lock", __func__, c));
719     cc = callout_lock(c);
720     /*
721      * Don't allow migration of pre-allocated callouts lest they
722      * become unbalanced, and handle the case where the user does
723      * not care which CPU is used.
724      */
725     if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) ||
726         ignore_cpu) {
727         cpu = c->c_cpu;
728     }
729 
730     if (cc_exec_curr(cc, direct) == c) {
731         /*
732          * We're being asked to reschedule a callout which is
733          * currently in progress.  If there is a lock then we
734          * can cancel the callout if it has not really started.
735          */
736         if (c->c_lock != NULL && !cc_exec_cancel(cc, direct))
737             cancelled = cc_exec_cancel(cc, direct) = true;
738         if (cc_exec_waiting(cc, direct)) {
739             /*
740              * Someone has called callout_drain to kill this
741              * callout.  Don't reschedule.
742              */
743             CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
744                 cancelled ? "cancelled" : "failed to cancel",
745                 c, c->c_func, c->c_arg);
746             CC_UNLOCK(cc);
747             return (cancelled);
748         }
749     }
750     if (c->c_iflags & CALLOUT_PENDING) {
751         if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
752             if (cc_exec_next(cc) == c)
753                 cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
754             LIST_REMOVE(c, c_links.le);
755         } else {
756             TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
757         }
758         cancelled = 1;
759         c->c_iflags &= ~ CALLOUT_PENDING;
760         c->c_flags &= ~ CALLOUT_ACTIVE;
761     }
762 
763     if (to_ticks <= 0)
764         to_ticks = 1;
765 
766     callout_cc_add(c, cc, to_ticks, ftn, arg, cpu, flags);
767     CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d",
768         cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks);
769     CC_UNLOCK(cc);
770 
771     return (cancelled);
772 }
773 
774 /*
775  * Common idioms that can be optimized in the future.
776  */
777 int
778 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
779 {
780     return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
781 }
782 
783 int
784 callout_schedule(struct callout *c, int to_ticks)
785 {
786     return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu);
787 }
788 
789 int
790 _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *))
791 {
792     struct callout_cpu *cc, *old_cc;
793     struct lock_class *class;
794     int direct, sq_locked, use_lock;
795     int cancelled, not_on_a_list;
796 
797     if ((flags & CS_DRAIN) != 0)
798         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock,
799             "calling %s", __func__);
800 
801     /*
802      * Some old subsystems don't hold Giant while running a callout_stop(),
803      * so just discard this check for the moment.
804      */
805     if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) {
806         if (c->c_lock == &Giant.lock_object)
807             use_lock = mtx_owned(&Giant);
808         else {
809             use_lock = 1;
810             class = LOCK_CLASS(c->c_lock);
811             class->lc_assert(c->c_lock, LA_XLOCKED);
812         }
813     } else
814         use_lock = 0;
815     if (c->c_iflags & CALLOUT_DIRECT) {
816         direct = 1;
817     } else {
818         direct = 0;
819     }
820     sq_locked = 0;
821     old_cc = NULL;
822 again:
823     cc = callout_lock(c);
824 
825     if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) ==
826         (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) &&
827         ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) {
828         /*
829          * Special case where this slipped in while we
830          * were migrating *as* the callout is about to
831          * execute. The caller probably holds the lock
832          * the callout wants.
833          *
834          * Get rid of the migration first. Then set
835          * the flag that tells this code *not* to
836          * try to remove it from any lists (it's not
837          * on one yet). When the callout wheel runs,
838          * it will ignore this callout.
839          */
840         c->c_iflags &= ~CALLOUT_PENDING;
841         c->c_flags &= ~CALLOUT_ACTIVE;
842         not_on_a_list = 1;
843     } else {
844         not_on_a_list = 0;
845     }
846 
847     /*
848      * If the callout was migrating while the callout cpu lock was
849      * dropped,  just drop the sleepqueue lock and check the states
850      * again.
851      */
852     if (sq_locked != 0 && cc != old_cc) {
853         panic("migration should not happen");
854     }
855 
856     /*
857      * If the callout is running, try to stop it or drain it.
858      */
859     if (cc_exec_curr(cc, direct) == c) {
860         /*
861          * Whether we succeed in stopping it or not, we must clear
862          * the active flag - this is what API users expect.
863          */
864         c->c_flags &= ~CALLOUT_ACTIVE;
865 
866         if ((flags & CS_DRAIN) != 0) {
867             /*
868              * The current callout is running (or just
869              * about to run) and blocking is allowed, so
870              * just wait for the current invocation to
871              * finish.
872              */
873             while (cc_exec_curr(cc, direct) == c) {
874                 /*
875                  * Use direct calls to sleepqueue interface
876                  * instead of cv/msleep in order to avoid
877                  * a LOR between cc_lock and sleepqueue
878                  * chain spinlocks.  This piece of code
879                  * emulates a msleep_spin() call actually.
880                  *
881                  * If we already have the sleepqueue chain
882                  * locked, then we can safely block.  If we
883                  * don't already have it locked, however,
884                  * we have to drop the cc_lock to lock
885                  * it.  This opens several races, so we
886                  * restart at the beginning once we have
887                  * both locks.  If nothing has changed, then
888                  * we will end up back here with sq_locked
889                  * set.
890                  */
891                 if (!sq_locked) {
892                     CC_UNLOCK(cc);
893                     sleepq_lock(
894                         &cc_exec_waiting(cc, direct));
895                     sq_locked = 1;
896                     old_cc = cc;
897                     goto again;
898                 }
899 
900                 /*
901                  * Migration could be cancelled here, but
902                  * as long as it is still not sure when it
903                  * will be packed up, just let softclock()
904                  * take care of it.
905                  */
906                 cc_exec_waiting(cc, direct) = true;
907                 DROP_GIANT();
908                 CC_UNLOCK(cc);
909                 sleepq_add(
910                     &cc_exec_waiting(cc, direct),
911                     &cc->cc_lock.lock_object, "codrain",
912                     SLEEPQ_SLEEP, 0);
913                 sleepq_wait(
914                     &cc_exec_waiting(cc, direct),
915                          0);
916                 sq_locked = 0;
917                 old_cc = NULL;
918 
919                 /* Reacquire locks previously released. */
920                 PICKUP_GIANT();
921                 CC_LOCK(cc);
922             }
923         } else if (use_lock &&
924                !cc_exec_cancel(cc, direct) && (drain == NULL)) {
925 
926             /*
927              * The current callout is waiting for its
928              * lock which we hold.  Cancel the callout
929              * and return.  After our caller drops the
930              * lock, the callout will be skipped in
931              * softclock(). This *only* works with a
932              * callout_stop() *not* callout_drain() or
933              * callout_async_drain().
934              */
935             cc_exec_cancel(cc, direct) = true;
936             CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
937                 c, c->c_func, c->c_arg);
938             KASSERT(!cc_cce_migrating(cc, direct),
939                 ("callout wrongly scheduled for migration"));
940             if (callout_migrating(c)) {
941                 c->c_iflags &= ~CALLOUT_DFRMIGRATION;
942             }
943             CC_UNLOCK(cc);
944             KASSERT(!sq_locked, ("sleepqueue chain locked"));
945             return (1);
946         } else if (callout_migrating(c)) {
947             /*
948              * The callout is currently being serviced
949              * and the "next" callout is scheduled at
950              * its completion with a migration. We remove
951              * the migration flag so it *won't* get rescheduled,
952          * but we can't stop the one that's running, so
953              * we return 0.
954              */
955             c->c_iflags &= ~CALLOUT_DFRMIGRATION;
956             CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
957                 c, c->c_func, c->c_arg);
958              if (drain) {
959                 cc_exec_drain(cc, direct) = drain;
960             }
961             CC_UNLOCK(cc);
962             return ((flags & CS_EXECUTING) != 0);
963         }
964         CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
965             c, c->c_func, c->c_arg);
966         if (drain) {
967             cc_exec_drain(cc, direct) = drain;
968         }
969         KASSERT(!sq_locked, ("sleepqueue chain still locked"));
970         cancelled = ((flags & CS_EXECUTING) != 0);
971     } else
972         cancelled = 1;
973 
974     if (sq_locked)
975         sleepq_release(&cc_exec_waiting(cc, direct));
976 
977     if ((c->c_iflags & CALLOUT_PENDING) == 0) {
978         CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
979             c, c->c_func, c->c_arg);
980         /*
981          * For a callout that is neither scheduled nor executing,
982          * return a negative value.
983          */
984         if (cc_exec_curr(cc, direct) != c)
985             cancelled = -1;
986         CC_UNLOCK(cc);
987         return (cancelled);
988     }
989 
990     c->c_iflags &= ~CALLOUT_PENDING;
991     c->c_flags &= ~CALLOUT_ACTIVE;
992 
993     CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
994         c, c->c_func, c->c_arg);
995     if (not_on_a_list == 0) {
996         if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
997             if (cc_exec_next(cc) == c)
998                 cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
999             LIST_REMOVE(c, c_links.le);
1000         } else {
1001             TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
1002         }
1003     }
1004     callout_cc_del(c, cc);
1005     CC_UNLOCK(cc);
1006     return (cancelled);
1007 }
1008 
1009 void
1010 callout_init(struct callout *c, int mpsafe)
1011 {
1012     bzero(c, sizeof *c);
1013     if (mpsafe) {
1014         c->c_lock = NULL;
1015         c->c_iflags = CALLOUT_RETURNUNLOCKED;
1016     } else {
1017         c->c_lock = &Giant.lock_object;
1018         c->c_iflags = 0;
1019     }
1020     c->c_cpu = timeout_cpu;
1021 }
1022 
1023 void
1024 _callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
1025 {
1026     bzero(c, sizeof *c);
1027     c->c_lock = lock;
1028     KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
1029         ("callout_init_lock: bad flags %d", flags));
1030     KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
1031         ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
1032     KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
1033         (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
1034         __func__));
1035     c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
1036     c->c_cpu = timeout_cpu;
1037 }
1038 
1039 #ifdef APM_FIXUP_CALLTODO
1040 /*
1041  * Adjust the kernel calltodo timeout list.  This routine is used after
1042  * an APM resume to recalculate the calltodo timer list values with the
1043  * number of hz's we have been sleeping.  The next hardclock() will detect
1044  * that there are fired timers and run softclock() to execute them.
1045  *
1046  * Please note, I have not done an exhaustive analysis of what code this
1047  * might break.  I am motivated to have my select()'s and alarm()'s that
1048  * have expired during suspend firing upon resume so that the applications
1049  * which set the timer can do the maintenance the timer was for as close
1050  * as possible to the originally intended time.  Testing this code for a
1051  * week showed that resuming from a suspend resulted in 22 to 25 timers
1052  * firing, which seemed independent of whether the suspend was 2 hours or
1053  * 2 days.  Your mileage may vary.   - Ken Key <[email protected]>
1054  */
1055 void
1056 adjust_timeout_calltodo(struct timeval *time_change)
1057 {
1058     register struct callout *p;
1059     unsigned long delta_ticks;
1060 
1061     /*
1062      * How many ticks were we asleep?
1063      * (stolen from tvtohz()).
1064      */
1065 
1066     /* Don't do anything */
1067     if (time_change->tv_sec < 0)
1068         return;
1069     else if (time_change->tv_sec <= LONG_MAX / 1000000)
1070         delta_ticks = howmany(time_change->tv_sec * 1000000 +
1071             time_change->tv_usec, tick) + 1;
1072     else if (time_change->tv_sec <= LONG_MAX / hz)
1073         delta_ticks = time_change->tv_sec * hz +
1074             howmany(time_change->tv_usec, tick) + 1;
1075     else
1076         delta_ticks = LONG_MAX;
1077 
1078     if (delta_ticks > INT_MAX)
1079         delta_ticks = INT_MAX;
1080 
1081     /*
1082      * Now rip through the timer calltodo list looking for timers
1083      * to expire.
1084      */
1085 
1086     /* don't collide with softclock() */
1087     CC_LOCK(cc);
1088     for (p = calltodo.c_next; p != NULL; p = p->c_next) {
1089         p->c_time -= delta_ticks;
1090 
1091         /* Break if the timer had more time on it than delta_ticks */
1092         if (p->c_time > 0)
1093             break;
1094 
1095         /* take back the ticks the timer didn't use (p->c_time <= 0) */
1096         delta_ticks = -p->c_time;
1097     }
1098     CC_UNLOCK(cc);
1099 
1100     return;
1101 }
1102 #endif /* APM_FIXUP_CALLTODO */
1103 
1104 static int
1105 flssbt(sbintime_t sbt)
1106 {
1107 
1108     sbt += (uint64_t)sbt >> 1;
1109     if (sizeof(long) >= sizeof(sbintime_t))
1110         return (flsl(sbt));
1111     if (sbt >= SBT_1S)
1112         return (flsl(((uint64_t)sbt) >> 32) + 32);
1113     return (flsl(sbt));
1114 }
1115 
1116 /*
1117  * Dump immediate statistic snapshot of the scheduled callouts.
1118  */
1119 static int
1120 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
1121 {
1122     struct callout *tmp;
1123     struct callout_cpu *cc;
1124     struct callout_list *sc;
1125     int st, maxt, tick, now;
1126     sbintime_t medt;
1127     int ct[64], ccpbk[32];
1128     int error, val, i, count, tcum, pcum, maxc, c, medc;
1129 
1130     val = 0;
1131     error = sysctl_handle_int(oidp, &val, 0, req);
1132     if (error != 0 || req->newptr == NULL)
1133         return (error);
1134     count = maxc = 0;
1135     st = maxt = 0;
1136     bzero(ccpbk, sizeof(ccpbk));
1137     bzero(ct, sizeof(ct));
1138     now = ticks;
1139 
1140     cc = CC_CPU(timeout_cpu);
1141     CC_LOCK(cc);
1142     for (i = 0; i < callwheelsize; i++) {
1143         sc = &cc->cc_callwheel[i];
1144         c = 0;
1145         LIST_FOREACH(tmp, sc, c_links.le) {
1146             c++;
1147             tick = tmp->c_time - now;
1148             if (tick < 0)
1149                 tick = 0;
1150             st += tick*(1000/hz);
1151             if (tick > maxt)
1152                 maxt = tick;
1153             ct[flssbt(tick)]++;
1154         }
1155         if (c > maxc)
1156             maxc = c;
1157         ccpbk[fls(c + c / 2)]++;
1158         count += c;
1159     }
1160     CC_UNLOCK(cc);
1161 
1162     for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
1163         tcum += ct[i];
1164     medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
1165     for (i = 0, c = 0; i < 32 && c < count / 2; i++)
1166         c += ccpbk[i];
1167     medc = (i >= 2) ? (1 << (i - 2)) : 0;
1168 
1169     printf("Scheduled callouts statistic snapshot:\n");
1170     printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
1171         count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
1172     printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
1173         medc,
1174         count / callwheelsize / mp_ncpus,
1175         (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
1176         maxc);
1177     printf("  Time: med %5jd.%06jds avg %6d.%06ds max %ds\n",
1178         medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
1179         st / count / 1000, (st / count) % 1000, maxt);
1180     printf("  Distribution:       \tbuckets\t   time\t   tcum\n");
1181     for (i = 0, tcum = pcum = 0; i < 64; i++) {
1182         if (ct[i] == 0)
1183             continue;
1184         sbintime_t t;
1185         t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
1186         tcum += ct[i];
1187         printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\n",
1188             t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
1189             i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum);
1190     }
1191     return (error);
1192 }
1193 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
1194     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1195     0, 0, sysctl_kern_callout_stat, "I",
1196     "Dump immediate statistic snapshot of the scheduled callouts");
1197 
1198 #ifdef FSTACK
1199 void ff_hardclock(void);
1200 
1201 void
1202 ff_hardclock(void)
1203 {
1204     atomic_add_int(&ticks, 1);
1205     callout_tick();
1206     tc_ticktock(1);
1207     cpu_tick_calibration();
1208 
1209 #ifdef DEVICE_POLLING
1210     hardclock_device_poll();    /* this is very short and quick */
1211 #endif /* DEVICE_POLLING */
1212 }
1213 
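/*
 * Sketch of a tick driver for ff_hardclock() (illustrative only; the
 * FF_TIMER_EXAMPLE guard, the static state and the calling convention are
 * hypothetical).  It assumes some polling loop invokes it much more often
 * than hz times per second and reuses the ff_get_tsc_ns()/ff_NSEC_PER_SEC
 * helpers already used by ff_tc_get_timecount() below.
 */
#ifdef FF_TIMER_EXAMPLE
static void
example_advance_clock(void)
{
    static uint64_t last_ns;
    uint64_t now_ns, step_ns;

    now_ns = ff_get_tsc_ns();
    step_ns = ff_NSEC_PER_SEC / hz;    /* nanoseconds per tick */
    if (last_ns == 0)
        last_ns = now_ns;
    while (now_ns - last_ns >= step_ns) {
        ff_hardclock();    /* advances ticks, runs callout_tick()/tc_ticktock() */
        last_ns += step_ns;
    }
}
#endif /* FF_TIMER_EXAMPLE */
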
1214 static unsigned int
1215 ff_tc_get_timecount(struct timecounter *tc)
1216 {
1217     uint64_t ns;
1218     ns = ff_get_tsc_ns();
1219     return ((ns * tc->tc_frequency) / ff_NSEC_PER_SEC);
1220 }
1221 
1222 static struct timecounter ff_timecounter = {
1223     ff_tc_get_timecount, 0, ~0u, 100, "ff_clock", 1
1224 };
1225 
1226 static void
1227 ff_tc_init(void)
1228 {
1229     ff_timecounter.tc_frequency = hz;
1230     tc_init(&ff_timecounter);
1231 }
1232 SYSINIT(ff_tc, SI_SUB_SMP, SI_ORDER_ANY, ff_tc_init, NULL);
1233 #endif
1234