xref: /f-stack/lib/ff_kern_timeout.c (revision 1f5a5310)
1 /*-
2  * Copyright (c) 1982, 1986, 1991, 1993
3  *  The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Copyright (c) 2010 Kip Macy. All rights reserved.
11  * Copyright (c) 2013 Patrick Kelsey. All rights reserved.
12  * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company.
13  * All rights reserved.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *  From: @(#)kern_clock.c  8.5 (Berkeley) 1/21/94
40  *
41  * Derived in part from libplebnet's pn_kern_timeout.c and libuinet's uinet_timecounter.c.
42  *
43  */
44 
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include "opt_callout_profiling.h"
49 #include "opt_ddb.h"
50 #if defined(__arm__)
51 #include "opt_timer.h"
52 #endif
53 #include "opt_rss.h"
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/bus.h>
58 #include <sys/callout.h>
59 #include <sys/file.h>
60 #include <sys/interrupt.h>
61 #include <sys/kernel.h>
62 #include <sys/ktr.h>
63 #include <sys/lock.h>
64 #include <sys/malloc.h>
65 #include <sys/mutex.h>
66 #include <sys/proc.h>
67 #include <sys/sdt.h>
68 #include <sys/sleepqueue.h>
69 #include <sys/sysctl.h>
70 #include <sys/smp.h>
71 #include <sys/timetc.h>
72 
73 SDT_PROVIDER_DEFINE(callout_execute);
74 SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *");
75 SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *");
76 
77 #ifdef CALLOUT_PROFILING
78 static int avg_depth;
79 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
80     "Average number of items examined per softclock call. Units = 1/1000");
81 static int avg_gcalls;
82 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
83     "Average number of Giant callouts made per softclock call. Units = 1/1000");
84 static int avg_lockcalls;
85 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
86     "Average number of lock callouts made per softclock call. Units = 1/1000");
87 static int avg_mpcalls;
88 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
89     "Average number of MP callouts made per softclock call. Units = 1/1000");
90 #endif
91 
92 static int ncallout;
93 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0,
94     "Number of entries in callwheel and size of timeout() preallocation");
95 
96 #ifdef RSS
97 static int pin_default_swi = 1;
98 static int pin_pcpu_swi = 1;
99 #else
100 static int pin_default_swi = 0;
101 static int pin_pcpu_swi = 0;
102 #endif
103 
104 SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi,
105     0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
106 SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi,
107     0, "Pin the per-CPU swis (except PCPU 0, which is also default)");
108 
109 #define sleepq_lock(w) do {} while(0)
110 #define sleepq_release(w) do {} while(0)
111 #define sleepq_add(a, b, c, d, e) do {} while(0)
112 #define sleepq_wait(w, p) do {} while(0)
113 
114 #define    CC_HASH_SHIFT    8
115 
116 /*
117  * TODO:
118  *    allocate more timeout table slots when table overflows.
119  */
120 u_int callwheelsize, callwheelmask;
121 
122 /*
123  * The callout cpu exec entities represent the information necessary for
124  * describing the state of callouts currently running on the CPU and the
125  * information needed for migrating callouts to a new callout cpu. In particular,
126  * the first entry of the cc_exec_entity array holds the state for a callout
127  * running in SWI thread context, while the second one holds the state for a
128  * callout running directly from hardware interrupt context.
129  * The cached information is very important for deferring migration when
130  * the migrating callout is already running.
131  */
132 struct cc_exec {
133     struct callout *cc_curr;
134     void (*cc_drain)(void *);
135     bool cc_cancel;
136     bool cc_waiting;
137 };
138 
139 /*
140  * There is one struct callout_cpu per cpu, holding all relevant
141  * state for the callout processing thread on the individual CPU.
142  */
143 struct callout_cpu {
144     struct mtx_padalign cc_lock;
145     struct cc_exec cc_exec_entity[2];
146     struct callout *cc_next;
147     struct callout *cc_callout;
148     struct callout_list *cc_callwheel;
149     struct callout_tailq cc_expireq;
150     struct callout_slist cc_callfree;
151     int cc_softticks;
152     void *cc_cookie;
153     u_int cc_bucket;
154     u_int cc_inited;
155     char cc_ktr_event_name[20];
156 };
157 
158 #define callout_migrating(c)    ((c)->c_iflags & CALLOUT_DFRMIGRATION)
159 
160 #define cc_exec_curr(cc, dir)        cc->cc_exec_entity[dir].cc_curr
161 #define cc_exec_drain(cc, dir)       cc->cc_exec_entity[dir].cc_drain
162 #define cc_exec_next(cc)             cc->cc_next
163 #define cc_exec_cancel(cc, dir)      cc->cc_exec_entity[dir].cc_cancel
164 #define cc_exec_waiting(cc, dir)     cc->cc_exec_entity[dir].cc_waiting
165 struct callout_cpu cc_cpu;
166 #define CC_CPU(cpu)    &cc_cpu
167 #define CC_SELF()      &cc_cpu
168 #define CC_LOCK(cc)           mtx_lock_spin(&(cc)->cc_lock)
169 #define CC_UNLOCK(cc)         mtx_unlock_spin(&(cc)->cc_lock)
170 #define CC_LOCK_ASSERT(cc)    mtx_assert(&(cc)->cc_lock, MA_OWNED)
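
/*
 * Note (added): unlike the SMP kernel, this userspace port keeps a single
 * global callout wheel.  CC_CPU() and CC_SELF() above both resolve to the
 * one cc_cpu instance, cc_cce_migrating() below always returns 0, and the
 * "cpu" arguments passed around in this file all end up selecting the same
 * wheel.
 */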
171 
172 static int timeout_cpu;
173 
174 static void callout_cpu_init(struct callout_cpu *cc, int cpu);
175 static void softclock_call_cc(struct callout *c, struct callout_cpu *cc,
176 #ifdef CALLOUT_PROFILING
177     int *mpcalls, int *lockcalls, int *gcalls,
178 #endif
179     int direct);
180 
181 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
182 
183 /**
184  * Locked by cc_lock:
185  *   cc_curr         - If a callout is in progress, it is cc_curr.
186  *                     If cc_curr is non-NULL, threads waiting in
187  *                     callout_drain() will be woken up as soon as the
188  *                     relevant callout completes.
189  *   cc_cancel       - Changing to 1 with both c_lock and cc_lock held
190  *                     guarantees that the current callout will not run.
191  *                     The softclock() function sets this to 0 before it
192  *                     drops cc_lock to acquire c_lock, and it calls
193  *                     the handler only if cc_cancel is still 0 after
194  *                     c_lock is successfully acquired.
195  *   cc_waiting      - If a thread is waiting in callout_drain(), then
196  *                     cc_waiting is nonzero.  Set only when
197  *                     cc_curr is non-NULL.
198  */
199 
200 /*
201  * Resets the execution entity tied to a specific callout cpu.
202  */
203 static void
204 cc_cce_cleanup(struct callout_cpu *cc, int direct)
205 {
206     cc_exec_curr(cc, direct) = NULL;
207     cc_exec_cancel(cc, direct) = false;
208     cc_exec_waiting(cc, direct) = false;
209 }
210 
211 /*
212  * Checks if migration is requested by a specific callout cpu.
213  */
214 static int
215 cc_cce_migrating(struct callout_cpu *cc, int direct)
216 {
217     return (0);
218 }
219 
220 /*
221  * Kernel low level callwheel initialization
222  * called on cpu0 during kernel startup.
223  */
224 static void
225 callout_callwheel_init(void *dummy)
226 {
227     struct callout_cpu *cc;
228 
229     /*
230      * Calculate the size of the callout wheel and the preallocated
231      * timeout() structures.
232      * XXX: Clip ncallout to the value the old maxusers-based formula
233      * yields for maxusers = 384 (18508).  This is still huge, but acceptable.
234      */
235     memset(CC_CPU(0), 0, sizeof(cc_cpu));
236     ncallout = imin(16 + maxproc + maxfiles, 18508);
237     TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
238 
239     /*
240      * Calculate the callout wheel size; it should be the next power of
241      * two higher than 'ncallout'.
242      */
243     callwheelsize = 1 << fls(ncallout);
244     callwheelmask = callwheelsize - 1;
245 
246     /*
247      * Fetch whether we're pinning the swi's or not.
248      * Fetch whether we're pinning the SWIs or not.
249     TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi);
250     TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi);
251 
252     /*
253      * Only cpu0 handles timeout(9) and receives a preallocation.
254      *
255      * XXX: Once all timeout(9) consumers are converted this can
256      * be removed.
257      */
258     timeout_cpu = PCPU_GET(cpuid);
259     cc = CC_CPU(timeout_cpu);
260     cc->cc_callout = malloc(ncallout * sizeof(struct callout),
261         M_CALLOUT, M_WAITOK);
262     callout_cpu_init(cc, timeout_cpu);
263 }
264 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
265 
266 /*
267  * Initialize the per-cpu callout structures.
268  */
269 static void
270 callout_cpu_init(struct callout_cpu *cc, int cpu)
271 {
272     struct callout *c;
273     int i;
274 
275     mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
276     SLIST_INIT(&cc->cc_callfree);
277     cc->cc_inited = 1;
278     cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
279         M_CALLOUT, M_WAITOK);
280     for (i = 0; i < callwheelsize; i++)
281         LIST_INIT(&cc->cc_callwheel[i]);
282     TAILQ_INIT(&cc->cc_expireq);
283     for (i = 0; i < 2; i++)
284         cc_cce_cleanup(cc, i);
285     snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
286         "callwheel cpu %d", cpu);
287     if (cc->cc_callout == NULL)    /* Only cpu0 handles timeout(9) */
288         return;
289     for (i = 0; i < ncallout; i++) {
290         c = &cc->cc_callout[i];
291         callout_init(c, 0);
292         c->c_iflags = CALLOUT_LOCAL_ALLOC;
293         SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
294     }
295 }
296 
297 static inline u_int
298 callout_get_bucket(int to_ticks)
299 {
300     return (to_ticks & callwheelmask);
301 }
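
/*
 * Worked example (added): callwheelsize is a power of two, so masking with
 * callwheelmask in callout_get_bucket() above is the same as taking the
 * expiry tick modulo the wheel size.  If ncallout ended up as 18508, then
 * fls(18508) = 15, callwheelsize = 1 << 15 = 32768 and callwheelmask =
 * 0x7fff, so a callout with c_time = 100000 hashes to bucket
 * 100000 & 0x7fff = 1696, i.e. 100000 mod 32768.
 */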
302 
303 void
304 callout_tick(void)
305 {
306     struct callout_cpu *cc;
307     int need_softclock;
308     int bucket;
309 
310     /*
311      * Process callouts at a very low cpu priority, so we don't keep the
312      * relatively high clock interrupt priority any longer than necessary.
313      */
314     need_softclock = 0;
315     cc = CC_SELF();
316     mtx_lock(&cc->cc_lock);
317     for (; (cc->cc_softticks - ticks) < 0; cc->cc_softticks++) {
318         bucket = cc->cc_softticks & callwheelmask;
319         if (!LIST_EMPTY(&cc->cc_callwheel[bucket])) {
320             need_softclock = 1;
321             break;
322         }
323     }
324     mtx_unlock(&cc->cc_lock);
325     /*
326      * Upstream this would swi_sched() the softclock SWI, which must not be
327      * called with cc_lock held; here we simply call softclock() directly.
328      */
329     if (need_softclock)
330         softclock(cc);
331 }
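
/*
 * Note (added): the "(cc->cc_softticks - ticks) < 0" test in callout_tick()
 * above is the usual wrap-safe way of asking "is cc_softticks behind ticks?".
 * With two's-complement wrap the signed difference stays meaningful across
 * the 32-bit rollover of the tick counter: e.g. ticks = INT_MIN (just
 * wrapped) and cc_softticks = INT_MAX gives a difference of -1, so the one
 * lagging bucket is still scanned.
 */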
332 
333 static struct callout_cpu *
334 callout_lock(struct callout *c)
335 {
336     struct callout_cpu *cc;
337     int cpu;
338 
339     for (;;) {
340         cpu = c->c_cpu;
341         cc = CC_CPU(cpu);
342         CC_LOCK(cc);
343         if (cpu == c->c_cpu)
344             break;
345         CC_UNLOCK(cc);
346     }
347     return (cc);
348 }
349 
350 static void
351 callout_cc_add(struct callout *c, struct callout_cpu *cc,
352     int to_ticks, void (*func)(void *), void *arg, int cpu, int flags)
353 {
354     int bucket;
355 
356     CC_LOCK_ASSERT(cc);
357 
358     c->c_arg = arg;
359     c->c_iflags |= CALLOUT_PENDING;
360     c->c_iflags &= ~CALLOUT_PROCESSED;
361     c->c_flags |= CALLOUT_ACTIVE;
362     if (flags & C_DIRECT_EXEC)
363         c->c_iflags |= CALLOUT_DIRECT;
364     c->c_func = func;
365     c->c_time = ticks + to_ticks;
366     bucket = callout_get_bucket(c->c_time);
367     LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
368     if (cc->cc_bucket == bucket)
369         cc_exec_next(cc) = c;
370 }
371 
372 static void
373 callout_cc_del(struct callout *c, struct callout_cpu *cc)
374 {
375 
376     if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0)
377         return;
378     c->c_func = NULL;
379     SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
380 }
381 
382 static void
383 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
384 #ifdef CALLOUT_PROFILING
385     int *mpcalls, int *lockcalls, int *gcalls,
386 #endif
387     int direct)
388 {
389     struct rm_priotracker tracker;
390     void (*c_func)(void *);
391     void *c_arg;
392     struct lock_class *class;
393     struct lock_object *c_lock;
394     uintptr_t lock_status;
395     int c_iflags;
396 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
397     sbintime_t sbt1, sbt2;
398     struct timespec ts2;
399     static sbintime_t maxdt = 2 * SBT_1MS;    /* 2 msec */
400     static timeout_t *lastfunc;
401 #endif
402 
403     KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING,
404         ("softclock_call_cc: pend %p %x", c, c->c_iflags));
405     KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE,
406         ("softclock_call_cc: act %p %x", c, c->c_flags));
407     class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
408     lock_status = 0;
409     if (c->c_flags & CALLOUT_SHAREDLOCK) {
410         if (class == &lock_class_rm)
411             lock_status = (uintptr_t)&tracker;
412         else
413             lock_status = 1;
414     }
415     c_lock = c->c_lock;
416     c_func = c->c_func;
417     c_arg = c->c_arg;
418     c_iflags = c->c_iflags;
419     if (c->c_iflags & CALLOUT_LOCAL_ALLOC)
420         c->c_iflags = CALLOUT_LOCAL_ALLOC;
421     else
422         c->c_iflags &= ~CALLOUT_PENDING;
423 
424     cc_exec_curr(cc, direct) = c;
425     cc_exec_cancel(cc, direct) = false;
426     cc_exec_drain(cc, direct) = NULL;
427     CC_UNLOCK(cc);
428     if (c_lock != NULL) {
429         class->lc_lock(c_lock, lock_status);
430         /*
431          * The callout may have been cancelled
432          * while we switched locks.
433          */
434         if (cc_exec_cancel(cc, direct)) {
435             class->lc_unlock(c_lock);
436             goto skip;
437         }
438         /* The callout cannot be stopped now. */
439         cc_exec_cancel(cc, direct) = true;
440         if (c_lock == &Giant.lock_object) {
441 #ifdef CALLOUT_PROFILING
442             (*gcalls)++;
443 #endif
444             CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
445                 c, c_func, c_arg);
446         } else {
447 #ifdef CALLOUT_PROFILING
448             (*lockcalls)++;
449 #endif
450             CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
451                 c, c_func, c_arg);
452         }
453     } else {
454 #ifdef CALLOUT_PROFILING
455         (*mpcalls)++;
456 #endif
457         CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
458             c, c_func, c_arg);
459     }
460     KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
461         "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
462 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
463     sbt1 = sbinuptime();
464 #endif
465     THREAD_NO_SLEEPING();
466     SDT_PROBE1(callout_execute, , , callout__start, c);
467     c_func(c_arg);
468     SDT_PROBE1(callout_execute, , , callout__end, c);
469     THREAD_SLEEPING_OK();
470 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
471     sbt2 = sbinuptime();
472     sbt2 -= sbt1;
473     if (sbt2 > maxdt) {
474         if (lastfunc != c_func || sbt2 > maxdt * 2) {
475             ts2 = sbttots(sbt2);
476             printf(
477         "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
478                 c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
479         }
480         maxdt = sbt2;
481         lastfunc = c_func;
482     }
483 #endif
484     KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
485     CTR1(KTR_CALLOUT, "callout %p finished", c);
486     if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0)
487         class->lc_unlock(c_lock);
488 skip:
489     CC_LOCK(cc);
490     KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr"));
491     cc_exec_curr(cc, direct) = NULL;
492     if (cc_exec_drain(cc, direct)) {
493         void (*drain)(void *);
494 
495         drain = cc_exec_drain(cc, direct);
496         cc_exec_drain(cc, direct) = NULL;
497         CC_UNLOCK(cc);
498         drain(c_arg);
499         CC_LOCK(cc);
500     }
501     if (cc_exec_waiting(cc, direct)) {
502         /*
503          * There is someone waiting for the
504          * callout to complete.
505          * If the callout was scheduled for
506          * migration just cancel it.
507          */
508         if (cc_cce_migrating(cc, direct)) {
509             cc_cce_cleanup(cc, direct);
510 
511             /*
512              * It should be asserted here that the callout is not
513              * destroyed, but that is not easy to do.
514              */
515             c->c_iflags &= ~CALLOUT_DFRMIGRATION;
516         }
517         cc_exec_waiting(cc, direct) = false;
518         CC_UNLOCK(cc);
519         wakeup(&cc_exec_waiting(cc, direct));
520         CC_LOCK(cc);
521     } else if (cc_cce_migrating(cc, direct)) {
522         KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0,
523             ("Migrating legacy callout %p", c));
524         panic("migration should not happen");
525     }
526     /*
527      * If the current callout is locally allocated (from
528      * timeout(9)) then put it on the freelist.
529      *
530      * Note: we need to check the cached copy of c_iflags because
531      * if it was not local, then it's not safe to deref the
532      * callout pointer.
533      */
534     KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 ||
535         c->c_iflags == CALLOUT_LOCAL_ALLOC,
536         ("corrupted callout"));
537     if (c_iflags & CALLOUT_LOCAL_ALLOC)
538         callout_cc_del(c, cc);
539 }
540 
541 /*
542  * The callout mechanism is based on the work of Adam M. Costello and
543  * George Varghese, published in a technical report entitled "Redesigning
544  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
545  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
546  * used in this implementation was published by G. Varghese and T. Lauck in
547  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
548  * the Efficient Implementation of a Timer Facility" in the Proceedings of
549  * the 11th ACM Annual Symposium on Operating Systems Principles,
550  * Austin, Texas Nov 1987.
551  */
552 
553 /*
554  * Software (low priority) clock interrupt.
555  * Run periodic events from timeout queue.
556  */
557 void
558 softclock(void *arg)
559 {
560     struct callout *c;
561     struct callout_cpu *cc;
562     struct callout_list *sc;
563     int curticks;
564 #ifdef CALLOUT_PROFILING
565     int depth = 0, gcalls = 0, mpcalls = 0, lockcalls = 0;
566 #endif
567 
568     cc = (struct callout_cpu *)arg;
569     CC_LOCK(cc);
570 
571     while (cc->cc_softticks != ticks) {
572         /*
573          * cc_softticks may be modified by hard clock, so cache
574          * it while we work on a given bucket.
575          */
576         curticks = cc->cc_softticks;
577         cc->cc_softticks++;
578         sc = &cc->cc_callwheel[curticks & callwheelmask];
579         c = LIST_FIRST(sc);
580         while (c) {
581 #ifdef CALLOUT_PROFILING
582             depth++;
583 #endif
584             if (c->c_time != curticks) {
585                 c = LIST_NEXT(c, c_links.le);
586             } else {
587                 cc_exec_next(cc) =
588                     LIST_NEXT(c, c_links.le);
589                 cc->cc_bucket = callout_get_bucket(curticks);
590                 LIST_REMOVE(c, c_links.le);
591                 softclock_call_cc(c, cc,
592 #ifdef CALLOUT_PROFILING
593                     &mpcalls, &lockcalls, &gcalls,
594 #endif
595                     1);
596                 c = cc_exec_next(cc);
597                 cc_exec_next(cc) = NULL;
598             }
599         }
600     }
601 
602 #ifdef CALLOUT_PROFILING
603     avg_depth += (depth * 1000 - avg_depth) >> 8;
604     avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
605     avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
606     avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
607 #endif
608     CC_UNLOCK(cc);
609 }
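
/*
 * Note (added): under CALLOUT_PROFILING the avg_* statistics updated at the
 * end of softclock() above are exponential moving averages kept in units of
 * 1/1000: "avg += (sample * 1000 - avg) >> 8" folds each new sample in with
 * weight 1/256, so e.g. a steady depth of 3 items per call converges to
 * avg_depth ~= 3000.
 */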
610 
611 #if 0
612 /*
613  * timeout --
614  *    Execute a function after a specified length of time.
615  *
616  * untimeout --
617  *    Cancel previous timeout function call.
618  *
619  * callout_handle_init --
620  *    Initialize a handle so that using it with untimeout is benign.
621  *
622  *    See AT&T BCI Driver Reference Manual for specification.  This
623  *    implementation differs from that one in that although an
624  *    identification value is returned from timeout, the original
625  *    arguments to timeout as well as the identifier are used to
626  *    identify entries for untimeout.
627  */
628 struct callout_handle
629 timeout(timeout_t *ftn, void *arg, int to_ticks)
630 {
631     struct callout_cpu *cc;
632     struct callout *new;
633     struct callout_handle handle;
634 
635     cc = CC_CPU(timeout_cpu);
636     CC_LOCK(cc);
637     /* Fill in the next free callout structure. */
638     new = SLIST_FIRST(&cc->cc_callfree);
639     if (new == NULL)
640         /* XXX Attempt to malloc first */
641         panic("timeout table full");
642     SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
643     callout_reset(new, to_ticks, ftn, arg);
644     handle.callout = new;
645     CC_UNLOCK(cc);
646 
647     return (handle);
648 }
649 
650 void
651 untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
652 {
653     struct callout_cpu *cc;
654 
655     /*
656      * Check for a handle that was initialized
657      * by callout_handle_init, but never used
658      * for a real timeout.
659      */
660     if (handle.callout == NULL)
661         return;
662 
663     cc = callout_lock(handle.callout);
664     if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
665         callout_stop(handle.callout);
666     CC_UNLOCK(cc);
667 }
668 
669 void
670 callout_handle_init(struct callout_handle *handle)
671 {
672     handle->callout = NULL;
673 }
674 #endif
675 
676 /*
677  * New interface; clients allocate their own callout structures.
678  *
679  * callout_reset() - establish or change a timeout
680  * callout_stop() - disestablish a timeout
681  * callout_init() - initialize a callout structure so that it can
682  *    safely be passed to callout_reset() and callout_stop()
683  *
684  * <sys/callout.h> defines three convenience macros:
685  *
686  * callout_active() - returns truth if callout has not been stopped,
687  *    drained, or deactivated since the last time the callout was
688  *    reset.
689  * callout_pending() - returns truth if callout is still waiting for timeout
690  * callout_deactivate() - marks the callout as having been serviced
691  */
692 int
693 callout_reset_tick_on(struct callout *c, int to_ticks,
694     void (*ftn)(void *), void *arg, int cpu, int flags)
695 {
696     struct callout_cpu *cc;
697     int cancelled, direct;
698     int ignore_cpu = 0;
699 
700     cancelled = 0;
701     if (cpu == -1) {
702         ignore_cpu = 1;
703     } else if ((cpu >= MAXCPU) ||
704            ((CC_CPU(cpu))->cc_inited == 0)) {
705         /* Invalid CPU spec */
706         panic("Invalid CPU in callout %d", cpu);
707     }
708 
709     /*
710      * This flag used to be added by callout_cc_add, but the first
711      * time this is called we could end up with the wrong value of
712      * 'direct' if it is not determined before the callout is added.
713      */
714     if (flags & C_DIRECT_EXEC) {
715         direct = 1;
716     } else {
717         direct = 0;
718     }
719     KASSERT(!direct || c->c_lock == NULL,
720         ("%s: direct callout %p has lock", __func__, c));
721     cc = callout_lock(c);
722     /*
723      * Don't allow migration of pre-allocated callouts, lest they
724      * become unbalanced; also handle the case where the user does
725      * not care which CPU is used.
726      */
727     if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) ||
728         ignore_cpu) {
729         cpu = c->c_cpu;
730     }
731 
732     if (cc_exec_curr(cc, direct) == c) {
733         /*
734          * We're being asked to reschedule a callout which is
735          * currently in progress.  If there is a lock then we
736          * can cancel the callout if it has not really started.
737          */
738         if (c->c_lock != NULL && !cc_exec_cancel(cc, direct))
739             cancelled = cc_exec_cancel(cc, direct) = true;
740         if (cc_exec_waiting(cc, direct)) {
741             /*
742              * Someone has called callout_drain to kill this
743              * callout.  Don't reschedule.
744              */
745             CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
746                 cancelled ? "cancelled" : "failed to cancel",
747                 c, c->c_func, c->c_arg);
748             CC_UNLOCK(cc);
749             return (cancelled);
750         }
751     }
752     if (c->c_iflags & CALLOUT_PENDING) {
753         if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
754             if (cc_exec_next(cc) == c)
755                 cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
756             LIST_REMOVE(c, c_links.le);
757         } else {
758             TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
759         }
760         cancelled = 1;
761         c->c_iflags &= ~CALLOUT_PENDING;
762         c->c_flags &= ~CALLOUT_ACTIVE;
763     }
764 
765     if (to_ticks <= 0)
766         to_ticks = 1;
767 
768     callout_cc_add(c, cc, to_ticks, ftn, arg, cpu, flags);
769     CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d",
770         cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks);
771     CC_UNLOCK(cc);
772 
773     return (cancelled);
774 }
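
/*
 * Illustrative usage sketch (added, not part of the original sources):
 * the typical lifecycle of a caller-allocated callout driven through the
 * standard <sys/callout.h> wrappers (callout_init_mtx(), callout_reset(),
 * callout_drain()), assuming those wrappers are available in this port as
 * they are in stock FreeBSD.  "example_softc", "example_timeout" and the
 * 5 * hz period are hypothetical names chosen for the sketch.
 */
#if 0
struct example_softc {
    struct mtx        sc_mtx;
    struct callout    sc_timer;
};

static void
example_timeout(void *arg)
{
    struct example_softc *sc = arg;

    /* sc_mtx is held here because the callout was tied to it below. */
    callout_reset(&sc->sc_timer, 5 * hz, example_timeout, sc);    /* re-arm */
}

static void
example_start(struct example_softc *sc)
{
    mtx_init(&sc->sc_mtx, "example", NULL, MTX_DEF);
    callout_init_mtx(&sc->sc_timer, &sc->sc_mtx, 0);
    mtx_lock(&sc->sc_mtx);
    callout_reset(&sc->sc_timer, 5 * hz, example_timeout, sc);
    mtx_unlock(&sc->sc_mtx);
}

static void
example_stop(struct example_softc *sc)
{
    /* callout_drain() also waits for a handler that is already running. */
    callout_drain(&sc->sc_timer);
    mtx_destroy(&sc->sc_mtx);
}
#endif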
775 
776 /*
777  * Common idioms that can be optimized in the future.
778  */
779 int
780 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
781 {
782     return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
783 }
784 
785 int
786 callout_schedule(struct callout *c, int to_ticks)
787 {
788     return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu);
789 }
790 
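/*
 * Added summary (not part of the original sources): _callout_stop_safe()
 * backs the callout_stop()/callout_drain()/callout_async_drain() wrappers
 * from <sys/callout.h>.  Broadly, it returns 1 when a pending callout (or
 * one about to run behind a lock we hold) was cancelled, 0 when the callout
 * is currently executing and could not be stopped (1 instead if CS_EXECUTING
 * was passed), and -1 when there was nothing to cancel.  With CS_DRAIN it
 * loops until the running handler has finished (the sleepqueue primitives
 * are stubbed out in this port); a non-NULL "drain" callback is invoked once
 * the handler completes.
 */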
791 int
792 _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *))
793 {
794     struct callout_cpu *cc, *old_cc;
795     struct lock_class *class;
796     int direct, sq_locked, use_lock;
797     int cancelled, not_on_a_list;
798 
799     if ((flags & CS_DRAIN) != 0)
800         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock,
801             "calling %s", __func__);
802 
803     /*
804      * Some old subsystems don't hold Giant while running a callout_stop(),
805      * so just discard this check for the moment.
806      */
807     if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) {
808         if (c->c_lock == &Giant.lock_object)
809             use_lock = mtx_owned(&Giant);
810         else {
811             use_lock = 1;
812             class = LOCK_CLASS(c->c_lock);
813             class->lc_assert(c->c_lock, LA_XLOCKED);
814         }
815     } else
816         use_lock = 0;
817     if (c->c_iflags & CALLOUT_DIRECT) {
818         direct = 1;
819     } else {
820         direct = 0;
821     }
822     sq_locked = 0;
823     old_cc = NULL;
824 again:
825     cc = callout_lock(c);
826 
827     if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) ==
828         (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) &&
829         ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) {
830         /*
831          * Special case where this slipped in while we
832          * were migrating *as* the callout is about to
833          * execute. The caller probably holds the lock
834          * the callout wants.
835          *
836          * Get rid of the migration first. Then set
837          * the flag that tells this code *not* to
838          * try to remove it from any lists (it's not
839          * on one yet). When the callout wheel runs,
840          * it will ignore this callout.
841          */
842         c->c_iflags &= ~CALLOUT_PENDING;
843         c->c_flags &= ~CALLOUT_ACTIVE;
844         not_on_a_list = 1;
845     } else {
846         not_on_a_list = 0;
847     }
848 
849     /*
850      * If the callout was migrating while the callout cpu lock was
851      * dropped, just drop the sleepqueue lock and check the states
852      * again.
853      */
854     if (sq_locked != 0 && cc != old_cc) {
855         panic("migration should not happen");
856     }
857 
858     /*
859      * If the callout is running, try to stop it or drain it.
860      */
861     if (cc_exec_curr(cc, direct) == c) {
862         /*
863          * Whether we succeed in stopping it or not, we must clear
864          * the active flag - this is what API users expect.
865          */
866         c->c_flags &= ~CALLOUT_ACTIVE;
867 
868         if ((flags & CS_DRAIN) != 0) {
869             /*
870              * The current callout is running (or just
871              * about to run) and blocking is allowed, so
872              * just wait for the current invocation to
873              * finish.
874              */
875             while (cc_exec_curr(cc, direct) == c) {
876                 /*
877                  * Use direct calls to sleepqueue interface
878                  * instead of cv/msleep in order to avoid
879                  * a LOR between cc_lock and sleepqueue
880                  * chain spinlocks.  This piece of code
881                  * emulates a msleep_spin() call actually.
882                  *
883                  * If we already have the sleepqueue chain
884                  * locked, then we can safely block.  If we
885                  * don't already have it locked, however,
886                  * we have to drop the cc_lock to lock
887                  * it.  This opens several races, so we
888                  * restart at the beginning once we have
889                  * both locks.  If nothing has changed, then
890                  * we will end up back here with sq_locked
891                  * set.
892                  */
893                 if (!sq_locked) {
894                     CC_UNLOCK(cc);
895                     sleepq_lock(
896                         &cc_exec_waiting(cc, direct));
897                     sq_locked = 1;
898                     old_cc = cc;
899                     goto again;
900                 }
901 
902                 /*
903                  * Migration could be cancelled here, but
904                  * since it is not yet certain when the callout
905                  * will be repacked, just let softclock()
906                  * take care of it.
907                  */
908                 cc_exec_waiting(cc, direct) = true;
909                 DROP_GIANT();
910                 CC_UNLOCK(cc);
911                 sleepq_add(
912                     &cc_exec_waiting(cc, direct),
913                     &cc->cc_lock.lock_object, "codrain",
914                     SLEEPQ_SLEEP, 0);
915                 sleepq_wait(
916                     &cc_exec_waiting(cc, direct),
917                          0);
918                 sq_locked = 0;
919                 old_cc = NULL;
920 
921                 /* Reacquire locks previously released. */
922                 PICKUP_GIANT();
923                 CC_LOCK(cc);
924             }
925         } else if (use_lock &&
926                !cc_exec_cancel(cc, direct) && (drain == NULL)) {
927 
928             /*
929              * The current callout is waiting for its
930              * lock which we hold.  Cancel the callout
931              * and return.  After our caller drops the
932              * lock, the callout will be skipped in
933              * softclock(). This *only* works with a
934              * callout_stop() *not* callout_drain() or
935              * callout_async_drain().
936              */
937             cc_exec_cancel(cc, direct) = true;
938             CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
939                 c, c->c_func, c->c_arg);
940             KASSERT(!cc_cce_migrating(cc, direct),
941                 ("callout wrongly scheduled for migration"));
942             if (callout_migrating(c)) {
943                 c->c_iflags &= ~CALLOUT_DFRMIGRATION;
944             }
945             CC_UNLOCK(cc);
946             KASSERT(!sq_locked, ("sleepqueue chain locked"));
947             return (1);
948         } else if (callout_migrating(c)) {
949             /*
950              * The callout is currently being serviced
951              * and the "next" callout is scheduled at
952              * its completion with a migration. We remove
953              * the migration flag so it *won't* get rescheduled,
954              * but we can't stop the one that's running, so
955              * we return 0 unless CS_EXECUTING was requested.
956              */
957             c->c_iflags &= ~CALLOUT_DFRMIGRATION;
958             CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
959                 c, c->c_func, c->c_arg);
960              if (drain) {
961                 cc_exec_drain(cc, direct) = drain;
962             }
963             CC_UNLOCK(cc);
964             return ((flags & CS_EXECUTING) != 0);
965         }
966         CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
967             c, c->c_func, c->c_arg);
968         if (drain) {
969             cc_exec_drain(cc, direct) = drain;
970         }
971         KASSERT(!sq_locked, ("sleepqueue chain still locked"));
972         cancelled = ((flags & CS_EXECUTING) != 0);
973     } else
974         cancelled = 1;
975 
976     if (sq_locked)
977         sleepq_release(&cc_exec_waiting(cc, direct));
978 
979     if ((c->c_iflags & CALLOUT_PENDING) == 0) {
980         CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
981             c, c->c_func, c->c_arg);
982         /*
983          * For a callout that is neither scheduled nor executing,
984          * return a negative value.
985          */
986         if (cc_exec_curr(cc, direct) != c)
987             cancelled = -1;
988         CC_UNLOCK(cc);
989         return (cancelled);
990     }
991 
992     c->c_iflags &= ~CALLOUT_PENDING;
993     c->c_flags &= ~CALLOUT_ACTIVE;
994 
995     CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
996         c, c->c_func, c->c_arg);
997     if (not_on_a_list == 0) {
998         if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
999             if (cc_exec_next(cc) == c)
1000                 cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
1001             LIST_REMOVE(c, c_links.le);
1002         } else {
1003             TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
1004         }
1005     }
1006     callout_cc_del(c, cc);
1007     CC_UNLOCK(cc);
1008     return (cancelled);
1009 }
1010 
1011 void
1012 callout_init(struct callout *c, int mpsafe)
1013 {
1014     bzero(c, sizeof *c);
1015     if (mpsafe) {
1016         c->c_lock = NULL;
1017         c->c_iflags = CALLOUT_RETURNUNLOCKED;
1018     } else {
1019         c->c_lock = &Giant.lock_object;
1020         c->c_iflags = 0;
1021     }
1022     c->c_cpu = timeout_cpu;
1023 }
1024 
1025 void
1026 _callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
1027 {
1028     bzero(c, sizeof *c);
1029     c->c_lock = lock;
1030     KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
1031         ("callout_init_lock: bad flags %d", flags));
1032     KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
1033         ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
1034     KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
1035         (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
1036         __func__));
1037     c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
1038     c->c_cpu = timeout_cpu;
1039 }
1040 
1041 #ifdef APM_FIXUP_CALLTODO
1042 /*
1043  * Adjust the kernel calltodo timeout list.  This routine is used after
1044  * an APM resume to recalculate the calltodo timer list values with the
1045  * number of hz's we have been sleeping.  The next hardclock() will detect
1046  * that there are fired timers and run softclock() to execute them.
1047  *
1048  * Please note, I have not done an exhaustive analysis of what code this
1049  * might break.  I am motivated to have my select()'s and alarm()'s that
1050  * have expired during suspend firing upon resume so that the applications
1051  * which set the timer can do the maintenance the timer was for as close
1052  * as possible to the originally intended time.  Testing this code for a
1053  * week showed that resuming from a suspend resulted in 22 to 25 timers
1054  * firing, which seemed independent of whether the suspend was 2 hours or
1055  * 2 days.  Your mileage may vary.   - Ken Key <[email protected]>
1056  */
1057 void
1058 adjust_timeout_calltodo(struct timeval *time_change)
1059 {
1060     register struct callout *p;
1061     unsigned long delta_ticks;
1062 
1063     /*
1064      * How many ticks were we asleep?
1065      * (stolen from tvtohz()).
1066      */
1067 
1068     /* Don't do anything */
1069     if (time_change->tv_sec < 0)
1070         return;
1071     else if (time_change->tv_sec <= LONG_MAX / 1000000)
1072         delta_ticks = howmany(time_change->tv_sec * 1000000 +
1073             time_change->tv_usec, tick) + 1;
1074     else if (time_change->tv_sec <= LONG_MAX / hz)
1075         delta_ticks = time_change->tv_sec * hz +
1076             howmany(time_change->tv_usec, tick) + 1;
1077     else
1078         delta_ticks = LONG_MAX;
1079 
1080     if (delta_ticks > INT_MAX)
1081         delta_ticks = INT_MAX;
1082 
1083     /*
1084      * Now rip through the timer calltodo list looking for timers
1085      * to expire.
1086      */
1087 
1088     /* don't collide with softclock() */
1089     CC_LOCK(cc);
1090     for (p = calltodo.c_next; p != NULL; p = p->c_next) {
1091         p->c_time -= delta_ticks;
1092 
1093         /* Break if the timer had more time on it than delta_ticks */
1094         if (p->c_time > 0)
1095             break;
1096 
1097         /* take back the ticks the timer didn't use (p->c_time <= 0) */
1098         delta_ticks = -p->c_time;
1099     }
1100     CC_UNLOCK(cc);
1101 
1102     return;
1103 }
1104 #endif /* APM_FIXUP_CALLTODO */
1105 
1106 static int
1107 flssbt(sbintime_t sbt)
1108 {
1109 
1110     sbt += (uint64_t)sbt >> 1;
1111     if (sizeof(long) >= sizeof(sbintime_t))
1112         return (flsl(sbt));
1113     if (sbt >= SBT_1S)
1114         return (flsl(((uint64_t)sbt) >> 32) + 32);
1115     return (flsl(sbt));
1116 }
1117 
1118 /*
1119  * Dump immediate statistic snapshot of the scheduled callouts.
1120  */
1121 static int
1122 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
1123 {
1124     struct callout *tmp;
1125     struct callout_cpu *cc;
1126     struct callout_list *sc;
1127     int st, maxt, tick, now;
1128     sbintime_t medt;
1129     int ct[64], ccpbk[32];
1130     int error, val, i, count, tcum, pcum, maxc, c, medc;
1131 
1132     val = 0;
1133     error = sysctl_handle_int(oidp, &val, 0, req);
1134     if (error != 0 || req->newptr == NULL)
1135         return (error);
1136     count = maxc = 0;
1137     st = maxt = 0;
1138     bzero(ccpbk, sizeof(ccpbk));
1139     bzero(ct, sizeof(ct));
1140     now = ticks;
1141 
1142     cc = CC_CPU(timeout_cpu);
1143     CC_LOCK(cc);
1144     for (i = 0; i < callwheelsize; i++) {
1145         sc = &cc->cc_callwheel[i];
1146         c = 0;
1147         LIST_FOREACH(tmp, sc, c_links.le) {
1148             c++;
1149             tick = tmp->c_time - now;
1150             if (tick < 0)
1151                 tick = 0;
1152             st += tick*(1000/hz);
1153             if (tick > maxt)
1154                 maxt = tick;
1155             ct[flssbt(tick)]++;
1156         }
1157         if (c > maxc)
1158             maxc = c;
1159         ccpbk[fls(c + c / 2)]++;
1160         count += c;
1161     }
1162     CC_UNLOCK(cc);
1163 
1164     for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
1165         tcum += ct[i];
1166     medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
1167     for (i = 0, c = 0; i < 32 && c < count / 2; i++)
1168         c += ccpbk[i];
1169     medc = (i >= 2) ? (1 << (i - 2)) : 0;
1170 
1171     printf("Scheduled callouts statistic snapshot:\n");
1172     printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
1173         count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
1174     printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
1175         medc,
1176         count / callwheelsize / mp_ncpus,
1177         (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
1178         maxc);
1179     printf("  Time: med %5jd.%06jds avg %6d.%06ds max %ds\n",
1180         medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
1181         st / count / 1000, (st / count) % 1000, maxt);
1182     printf("  Distribution:       \tbuckets\t   time\t   tcum\n");
1183     for (i = 0, tcum = pcum = 0; i < 64; i++) {
1184         if (ct[i] == 0)
1185             continue;
1186         sbintime_t t;
1187         t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
1188         tcum += ct[i];
1189         printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\n",
1190             t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
1191             i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum);
1192     }
1193     return (error);
1194 }
1195 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
1196     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1197     0, 0, sysctl_kern_callout_stat, "I",
1198     "Dump immediate statistic snapshot of the scheduled callouts");
1199 
1200 #ifdef FSTACK
1201 void ff_hardclock(void);
1202 
1203 void
1204 ff_hardclock(void)
1205 {
1206     atomic_add_int(&ticks, 1);
1207     callout_tick();
1208     tc_ticktock(1);
1209     cpu_tick_calibration();
1210 
1211 #ifdef DEVICE_POLLING
1212     hardclock_device_poll();    /* this is very short and quick */
1213 #endif /* DEVICE_POLLING */
1214 }
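
/*
 * Illustrative sketch (added, hypothetical): one way an application event
 * loop could drive the tick machinery, calling ff_hardclock() roughly once
 * per 1/hz seconds of elapsed time.  ff_get_tsc_ns() is the nanosecond clock
 * already used by ff_tc_get_timecount() below; "example_advance_ticks" and
 * "last_tick_ns" are made-up names for this sketch only.
 */
#if 0
static uint64_t last_tick_ns;

static void
example_advance_ticks(void)
{
    uint64_t now = ff_get_tsc_ns();
    uint64_t tick_ns = 1000000000ULL / hz;

    if (last_tick_ns == 0)
        last_tick_ns = now;           /* first call: just latch the clock */
    while (now - last_tick_ns >= tick_ns) {
        ff_hardclock();               /* advances ticks, runs callout_tick() */
        last_tick_ns += tick_ns;
    }
}
#endif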
1215 
1216 static unsigned int
1217 ff_tc_get_timecount(struct timecounter *tc)
1218 {
1219     uint64_t ns;
1220     ns = ff_get_tsc_ns();
1221     return ((ns * tc->tc_frequency) / ff_NSEC_PER_SEC);
1222 }
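
/*
 * Note (added): because ff_tc_init() below sets tc_frequency to hz, the
 * count returned here is simply the number of elapsed ticks derived from
 * the TSC nanosecond clock.  For example, with hz = 100 and ns =
 * 2500000000 (2.5 s), (2500000000 * 100) / 1000000000 = 250 counts, i.e.
 * the counter advances once per 10 ms tick.
 */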
1223 
1224 static struct timecounter ff_timecounter = {
1225     ff_tc_get_timecount, 0, ~0u, 100, "ff_clock", 1
1226 };
1227 
1228 static void
1229 ff_tc_init(void)
1230 {
1231     ff_timecounter.tc_frequency = hz;
1232     tc_init(&ff_timecounter);
1233 }
1234 SYSINIT(ff_tc, SI_SUB_SMP, SI_ORDER_ANY, ff_tc_init, NULL);
1235 #endif
1236