xref: /f-stack/lib/ff_kern_timeout.c (revision a9643ea8)
1 /*-
2  * Copyright (c) 1982, 1986, 1991, 1993
3  *  The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Copyright (c) 2010 Kip Macy. All rights reserved.
11  * Copyright (c) 2013 Patrick Kelsey. All rights reserved.
12  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
13  * All rights reserved.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *  From: @(#)kern_clock.c  8.5 (Berkeley) 1/21/94
40  *
41  * Derived in part from libplebnet's pn_kern_timeout.c and libuinet's uinet_timecounter.c.
42  *
43  */
44 
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include "opt_callout_profiling.h"
49 #include "opt_ddb.h"
50 #if defined(__arm__)
51 #include "opt_timer.h"
52 #endif
53 #include "opt_rss.h"
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/bus.h>
58 #include <sys/callout.h>
59 #include <sys/file.h>
60 #include <sys/interrupt.h>
61 #include <sys/kernel.h>
62 #include <sys/ktr.h>
63 #include <sys/lock.h>
64 #include <sys/malloc.h>
65 #include <sys/mutex.h>
66 #include <sys/proc.h>
67 #include <sys/sdt.h>
68 #include <sys/sleepqueue.h>
69 #include <sys/sysctl.h>
70 #include <sys/smp.h>
71 #include <sys/timetc.h>
72 
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #include <machine/_inttypes.h>
76 #endif
77 
78 #ifdef SMP
79 #include <machine/cpu.h>
80 #endif
81 
82 #ifndef NO_EVENTTIMERS
83 DPCPU_DECLARE(sbintime_t, hardclocktime);
84 #endif
85 
86 SDT_PROVIDER_DEFINE(callout_execute);
87 SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *");
88 SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *");
89 
90 #ifdef CALLOUT_PROFILING
91 static int avg_depth;
92 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
93     "Average number of items examined per softclock call. Units = 1/1000");
94 static int avg_gcalls;
95 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
96     "Average number of Giant callouts made per softclock call. Units = 1/1000");
97 static int avg_lockcalls;
98 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
99     "Average number of lock callouts made per softclock call. Units = 1/1000");
100 static int avg_mpcalls;
101 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
102     "Average number of MP callouts made per softclock call. Units = 1/1000");
103 static int avg_depth_dir;
104 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
105     "Average number of direct callouts examined per callout_process call. "
106     "Units = 1/1000");
107 static int avg_lockcalls_dir;
108 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
109     &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
110     "callout_process call. Units = 1/1000");
111 static int avg_mpcalls_dir;
112 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
113     0, "Average number of MP direct callouts made per callout_process call. "
114     "Units = 1/1000");
115 #endif
116 
117 static int ncallout;
118 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0,
119     "Number of entries in callwheel and size of timeout() preallocation");
120 
121 #ifdef RSS
122 static int pin_default_swi = 1;
123 static int pin_pcpu_swi = 1;
124 #else
125 static int pin_default_swi = 0;
126 static int pin_pcpu_swi = 0;
127 #endif
128 
129 SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi,
130     0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)");
131 SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi,
132     0, "Pin the per-CPU swis (except PCPU 0, which is also default)");
133 
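/*
 * F-Stack: the sleepqueue primitives below are reduced to no-ops.  Under
 * FSTACK the softclock work is invoked synchronously from
 * callout_process() (see the FSTACK branch there), so the sleepqueue
 * based wait in _callout_stop_safe() never actually sleeps.
 */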
134 #define sleepq_lock(w) do {} while(0)
135 #define sleepq_release(w) do {} while(0)
136 #define sleepq_add(a, b, c, d, e) do {} while(0)
137 #define sleepq_wait(w, p) do {} while(0)
138 
139 /*
140  * TODO:
141  *    allocate more timeout table slots when table overflows.
142  */
143 u_int callwheelsize, callwheelmask;
144 
145 /*
146  * The callout cpu exec entities hold the information needed to describe
147  * the state of the callouts currently running on a CPU, as well as the
148  * information needed to migrate callouts to a new callout cpu.  In
149  * particular, the first entry of the cc_exec_entity array describes a
150  * callout running in SWI thread context, while the second one describes
151  * a callout running directly from hardware interrupt context.
152  * The cached information is essential for deferring migration when the
153  * migrating callout is already running.
154  */
155 struct cc_exec {
156     struct callout *cc_curr;
157     void (*cc_drain)(void *);
158 #ifdef SMP
159     void (*ce_migration_func)(void *);
160     void *ce_migration_arg;
161     int ce_migration_cpu;
162     sbintime_t ce_migration_time;
163     sbintime_t ce_migration_prec;
164 #endif
165     bool cc_cancel;
166     bool cc_waiting;
167 };
168 
169 /*
170  * There is one struct callout_cpu per cpu, holding all relevant
171  * state for the callout processing thread on the individual CPU.
172  */
173 struct callout_cpu {
174     struct mtx_padalign cc_lock;
175     struct cc_exec cc_exec_entity[2];
176     struct callout *cc_next;
177     struct callout *cc_callout;
178     struct callout_list *cc_callwheel;
179     struct callout_tailq cc_expireq;
180     struct callout_slist cc_callfree;
181     sbintime_t cc_firstevent;
182     sbintime_t cc_lastscan;
183     void *cc_cookie;
184     u_int cc_bucket;
185     u_int cc_inited;
186     char cc_ktr_event_name[20];
187 };
188 
189 #define callout_migrating(c)    ((c)->c_iflags & CALLOUT_DFRMIGRATION)
190 
191 #define cc_exec_curr(cc, dir)        cc->cc_exec_entity[dir].cc_curr
192 #define cc_exec_drain(cc, dir)       cc->cc_exec_entity[dir].cc_drain
193 #define cc_exec_next(cc)             cc->cc_next
194 #define cc_exec_cancel(cc, dir)      cc->cc_exec_entity[dir].cc_cancel
195 #define cc_exec_waiting(cc, dir)     cc->cc_exec_entity[dir].cc_waiting
196 #ifdef SMP
197 #define cc_migration_func(cc, dir)   cc->cc_exec_entity[dir].ce_migration_func
198 #define cc_migration_arg(cc, dir)    cc->cc_exec_entity[dir].ce_migration_arg
199 #define cc_migration_cpu(cc, dir)    cc->cc_exec_entity[dir].ce_migration_cpu
200 #define cc_migration_time(cc, dir)   cc->cc_exec_entity[dir].ce_migration_time
201 #define cc_migration_prec(cc, dir)   cc->cc_exec_entity[dir].ce_migration_prec
202 
203 struct callout_cpu cc_cpu[MAXCPU];
204 #define CPUBLOCK       MAXCPU
205 #define CC_CPU(cpu)    (&cc_cpu[(cpu)])
206 #define CC_SELF()      CC_CPU(PCPU_GET(cpuid))
207 #else
208 struct callout_cpu cc_cpu;
209 #define CC_CPU(cpu)    &cc_cpu
210 #define CC_SELF()      &cc_cpu
211 #endif
212 #define CC_LOCK(cc)           mtx_lock_spin(&(cc)->cc_lock)
213 #define CC_UNLOCK(cc)         mtx_unlock_spin(&(cc)->cc_lock)
214 #define CC_LOCK_ASSERT(cc)    mtx_assert(&(cc)->cc_lock, MA_OWNED)
215 
216 static int timeout_cpu;
217 
218 static void callout_cpu_init(struct callout_cpu *cc, int cpu);
219 static void softclock_call_cc(struct callout *c, struct callout_cpu *cc,
220 #ifdef CALLOUT_PROFILING
221  	int *mpcalls, int *lockcalls, int *gcalls,
222 #endif
223 	int direct);
224 
225 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
226 
227 /**
228  * Locked by cc_lock:
229  *   cc_curr         - If a callout is in progress, it is cc_curr.
230  *                     If cc_curr is non-NULL, threads waiting in
231  *                     callout_drain() will be woken up as soon as the
232  *                     relevant callout completes.
233  *   cc_cancel       - Changing to 1 with both callout_lock and cc_lock held
234  *                     guarantees that the current callout will not run.
235  *                     The softclock() function sets this to 0 before it
236  *                     drops callout_lock to acquire c_lock, and it calls
237  *                     the handler only if cc_cancel is still 0 after
238  *                     cc_lock is successfully acquired.
239  *   cc_waiting      - If a thread is waiting in callout_drain(), then
240  *                     cc_waiting is true.  Set only when
241  *                     cc_curr is non-NULL.
242  */
243 
244 /*
245  * Resets the execution entity tied to a specific callout cpu.
246  */
247 static void
248 cc_cce_cleanup(struct callout_cpu *cc, int direct)
249 {
250     cc_exec_curr(cc, direct) = NULL;
251     cc_exec_cancel(cc, direct) = false;
252     cc_exec_waiting(cc, direct) = false;
253 #ifdef SMP
254     cc_migration_cpu(cc, direct) = CPUBLOCK;
255     cc_migration_time(cc, direct) = 0;
256     cc_migration_prec(cc, direct) = 0;
257     cc_migration_func(cc, direct) = NULL;
258     cc_migration_arg(cc, direct) = NULL;
259 #endif
260 }
261 
262 /*
263  * Checks if migration is requested by a specific callout cpu.
264  */
265 static int
266 cc_cce_migrating(struct callout_cpu *cc, int direct)
267 {
268 #ifdef SMP
269     return (cc_migration_cpu(cc, direct) != CPUBLOCK);
270 #else
271     return (0);
272 #endif
273 }
274 
275 /*
276  * Kernel low level callwheel initialization
277  * called on cpu0 during kernel startup.
278  */
279 static void
280 callout_callwheel_init(void *dummy)
281 {
282     struct callout_cpu *cc;
283 
284     /*
285      * Calculate the size of the callout wheel and the preallocated
286      * timeout() structures.
287      * XXX: Clip callout to result of previous function of maxusers
288      * maximum 384.  This is still huge, but acceptable.
289      */
290     memset(CC_CPU(0), 0, sizeof(cc_cpu));
291     ncallout = imin(16 + maxproc + maxfiles, 18508);
292     TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
293 
294     /*
295      * Calculate the callout wheel size; it should be the next power of
296      * two higher than 'ncallout'.
297      */
298     callwheelsize = 1 << fls(ncallout);
299     callwheelmask = callwheelsize - 1;
300 
301     /*
302      * Fetch whether we're pinning the swi's or not.
303      */
304     TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi);
305     TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi);
306 
307     /*
308      * Only cpu0 handles timeout(9) and receives a preallocation.
309      *
310      * XXX: Once all timeout(9) consumers are converted this can
311      * be removed.
312      */
313     timeout_cpu = PCPU_GET(cpuid);
314     cc = CC_CPU(timeout_cpu);
315     cc->cc_callout = malloc(ncallout * sizeof(struct callout),
316         M_CALLOUT, M_WAITOK);
317     callout_cpu_init(cc, timeout_cpu);
318 }
319 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
320 
321 /*
322  * Initialize the per-cpu callout structures.
323  */
324 static void
325 callout_cpu_init(struct callout_cpu *cc, int cpu)
326 {
327     struct callout *c;
328     int i;
329 
330     mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
331     SLIST_INIT(&cc->cc_callfree);
332     cc->cc_inited = 1;
333     cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
334         M_CALLOUT, M_WAITOK);
335     for (i = 0; i < callwheelsize; i++)
336         LIST_INIT(&cc->cc_callwheel[i]);
337     TAILQ_INIT(&cc->cc_expireq);
338     cc->cc_firstevent = SBT_MAX;
339     for (i = 0; i < 2; i++)
340         cc_cce_cleanup(cc, i);
341     snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
342         "callwheel cpu %d", cpu);
343     if (cc->cc_callout == NULL)    /* Only cpu0 handles timeout(9) */
344         return;
345     for (i = 0; i < ncallout; i++) {
346         c = &cc->cc_callout[i];
347         callout_init(c, 0);
348         c->c_iflags = CALLOUT_LOCAL_ALLOC;
349         SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
350     }
351 }
352 
353 #ifdef SMP
354 /*
355  * Switches the cpu tied to a specific callout.
356  * The function expects the incoming callout cpu to be locked and returns
357  * with the outgoing callout cpu locked.
358  */
359 static struct callout_cpu *
360 callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
361 {
362     struct callout_cpu *new_cc;
363 
364     MPASS(c != NULL && cc != NULL);
365     CC_LOCK_ASSERT(cc);
366 
367     /*
368      * Keep interrupts and preemption from firing while the callout cpu
369      * is blocked, in order to avoid deadlocks, since the new thread
370      * may try to acquire the callout cpu lock.
371      */
372     c->c_cpu = CPUBLOCK;
373     spinlock_enter();
374     CC_UNLOCK(cc);
375     new_cc = CC_CPU(new_cpu);
376     CC_LOCK(new_cc);
377     spinlock_exit();
378     c->c_cpu = new_cpu;
379     return (new_cc);
380 }
381 #endif
382 
383 #ifndef FSTACK
384 /*
385  * Start standard softclock thread.
386  */
387 static void
388 start_softclock(void *dummy)
389 {
390     struct callout_cpu *cc;
391     char name[MAXCOMLEN];
392 #ifdef SMP
393     int cpu;
394     struct intr_event *ie;
395 #endif
396 
397     cc = CC_CPU(timeout_cpu);
398     snprintf(name, sizeof(name), "clock (%d)", timeout_cpu);
399     if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK,
400         INTR_MPSAFE, &cc->cc_cookie))
401         panic("died while creating standard software ithreads");
402     if (pin_default_swi &&
403         (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) {
404         printf("%s: timeout clock couldn't be pinned to cpu %d\n",
405             __func__,
406             timeout_cpu);
407     }
408 
409 #ifdef SMP
410     CPU_FOREACH(cpu) {
411         if (cpu == timeout_cpu)
412             continue;
413         cc = CC_CPU(cpu);
414         cc->cc_callout = NULL;    /* Only cpu0 handles timeout(9). */
415         callout_cpu_init(cc, cpu);
416         snprintf(name, sizeof(name), "clock (%d)", cpu);
417         ie = NULL;
418         if (swi_add(&ie, name, softclock, cc, SWI_CLOCK,
419             INTR_MPSAFE, &cc->cc_cookie))
420             panic("died while creating standard software ithreads");
421         if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) {
422             printf("%s: per-cpu clock couldn't be pinned to "
423                 "cpu %d\n",
424                 __func__,
425                 cpu);
426         }
427     }
428 #endif
429 }
430 SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
431 #endif
432 
433 #define    CC_HASH_SHIFT    8
434 
435 static inline u_int
436 callout_hash(sbintime_t sbt)
437 {
438     return (sbt >> (32 - CC_HASH_SHIFT));
439 }
440 
441 static inline u_int
442 callout_get_bucket(sbintime_t sbt)
443 {
444     return (callout_hash(sbt) & callwheelmask);
445 }
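/*
 * Worked example of the bucket mapping above (follows directly from
 * CC_HASH_SHIFT == 8 and the 32.32 fixed-point sbintime_t format):
 *
 *     callout_hash(sbt) = sbt >> (32 - 8) = sbt >> 24
 *
 * One bucket therefore spans 2^24 sbintime units, i.e. 2^24 / 2^32 s =
 * 1/256 s (~3.9 ms), and the hash advances by 256 buckets per second.
 * An expiry 0.5 s in the future, for instance, hashes 128 buckets ahead
 * of the current one (then wrapped with callwheelmask).
 */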
446 
447 void
448 callout_process(sbintime_t now)
449 {
450     struct callout *tmp, *tmpn;
451     struct callout_cpu *cc;
452     struct callout_list *sc;
453     sbintime_t first, last, max, tmp_max;
454     uint32_t lookahead;
455     u_int firstb, lastb, nowb;
456 #ifdef CALLOUT_PROFILING
457     int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
458 #endif
459 
460     cc = CC_SELF();
461     mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
462 
463     /* Compute the buckets of the last scan and present times. */
464     firstb = callout_hash(cc->cc_lastscan);
465     cc->cc_lastscan = now;
466     nowb = callout_hash(now);
467 
468     /* Compute the last bucket and minimum time of the bucket after it. */
469     if (nowb == firstb)
470         lookahead = (SBT_1S / 16);
471     else if (nowb - firstb == 1)
472         lookahead = (SBT_1S / 8);
473     else
474         lookahead = (SBT_1S / 2);
475     first = last = now;
476     first += (lookahead / 2);
477     last += lookahead;
478     last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
479     lastb = callout_hash(last) - 1;
480     max = last;
481 
482     /*
483      * Check if we wrapped around the entire wheel from the last scan.
484      * In that case, we need to scan the entire wheel for pending callouts.
485      */
486     if (lastb - firstb >= callwheelsize) {
487         lastb = firstb + callwheelsize - 1;
488         if (nowb - firstb >= callwheelsize)
489             nowb = lastb;
490     }
491 
492     /* Iterate callwheel from firstb to nowb and then up to lastb. */
493     do {
494         sc = &cc->cc_callwheel[firstb & callwheelmask];
495         tmp = LIST_FIRST(sc);
496         while (tmp != NULL) {
497             /* Run the callout if the present time is within its allowed window. */
498             if (tmp->c_time <= now) {
499                 /*
500                  * Consumer told us the callout may be run
501                  * directly from hardware interrupt context.
502                  */
503                 if (tmp->c_iflags & CALLOUT_DIRECT) {
504 #ifdef CALLOUT_PROFILING
505                     ++depth_dir;
506 #endif
507                     cc_exec_next(cc) =
508                         LIST_NEXT(tmp, c_links.le);
509                     cc->cc_bucket = firstb & callwheelmask;
510                     LIST_REMOVE(tmp, c_links.le);
511                     softclock_call_cc(tmp, cc,
512 #ifdef CALLOUT_PROFILING
513                         &mpcalls_dir, &lockcalls_dir, NULL,
514 #endif
515                         1);
516                     tmp = cc_exec_next(cc);
517                     cc_exec_next(cc) = NULL;
518                 } else {
519                     tmpn = LIST_NEXT(tmp, c_links.le);
520                     LIST_REMOVE(tmp, c_links.le);
521                     TAILQ_INSERT_TAIL(&cc->cc_expireq,
522                         tmp, c_links.tqe);
523                     tmp->c_iflags |= CALLOUT_PROCESSED;
524                     tmp = tmpn;
525                 }
526                 continue;
527             }
528             /* Skip events from distant future. */
529             if (tmp->c_time >= max)
530                 goto next;
531             /*
532              * The event's minimal time is greater than the present
533              * maximal time, so it cannot be aggregated.
534              */
535             if (tmp->c_time > last) {
536                 lastb = nowb;
537                 goto next;
538             }
539             /* Update first and last time, respecting this event. */
540             if (tmp->c_time < first)
541                 first = tmp->c_time;
542             tmp_max = tmp->c_time + tmp->c_precision;
543             if (tmp_max < last)
544                 last = tmp_max;
545 next:
546             tmp = LIST_NEXT(tmp, c_links.le);
547         }
548         /* Proceed with the next bucket. */
549         firstb++;
550         /*
551          * Stop if we looked past the present time and found
552          * some event that we cannot execute now.
553          * Stop if we looked far enough into the future.
554          */
555     } while (((int)(firstb - lastb)) <= 0);
556     cc->cc_firstevent = last;
557 #ifndef NO_EVENTTIMERS
558     cpu_new_callout(curcpu, last, first);
559 #endif
560 #ifdef CALLOUT_PROFILING
561     avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
562     avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
563     avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
564 #endif
565     mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
566     /*
567      * swi_sched acquires the thread lock, so we don't want to call it
568      * with cc_lock held; incorrect locking order.
569      */
570     if (!TAILQ_EMPTY(&cc->cc_expireq))
571 #ifndef FSTACK
572         swi_sched(cc->cc_cookie, 0);
573 #else
574         softclock(cc);
575 #endif
576 }
577 
578 static struct callout_cpu *
579 callout_lock(struct callout *c)
580 {
581     struct callout_cpu *cc;
582     int cpu;
583 
584     for (;;) {
585         cpu = c->c_cpu;
586 #ifdef SMP
587         if (cpu == CPUBLOCK) {
588             while (c->c_cpu == CPUBLOCK)
589                 cpu_spinwait();
590             continue;
591         }
592 #endif
593         cc = CC_CPU(cpu);
594         CC_LOCK(cc);
595         if (cpu == c->c_cpu)
596             break;
597         CC_UNLOCK(cc);
598     }
599     return (cc);
600 }
601 
602 static void
603 callout_cc_add(struct callout *c, struct callout_cpu *cc,
604     sbintime_t sbt, sbintime_t precision, void (*func)(void *),
605     void *arg, int cpu, int flags)
606 {
607     int bucket;
608 
609     CC_LOCK_ASSERT(cc);
610     if (sbt < cc->cc_lastscan)
611         sbt = cc->cc_lastscan;
612     c->c_arg = arg;
613     c->c_iflags |= CALLOUT_PENDING;
614     c->c_iflags &= ~CALLOUT_PROCESSED;
615     c->c_flags |= CALLOUT_ACTIVE;
616     if (flags & C_DIRECT_EXEC)
617         c->c_iflags |= CALLOUT_DIRECT;
618     c->c_func = func;
619     c->c_time = sbt;
620     c->c_precision = precision;
621     bucket = callout_get_bucket(c->c_time);
622     CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
623         c, (int)(c->c_precision >> 32),
624         (u_int)(c->c_precision & 0xffffffff));
625     LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
626     if (cc->cc_bucket == bucket)
627         cc_exec_next(cc) = c;
628 #ifndef NO_EVENTTIMERS
629     /*
630      * Inform the eventtimers(4) subsystem there's a new callout
631      * that has been inserted, but only if really required.
632      */
633     if (SBT_MAX - c->c_time < c->c_precision)
634         c->c_precision = SBT_MAX - c->c_time;
635     sbt = c->c_time + c->c_precision;
636     if (sbt < cc->cc_firstevent) {
637         cc->cc_firstevent = sbt;
638         cpu_new_callout(cpu, sbt, c->c_time);
639     }
640 #endif
641 }
642 
643 static void
644 callout_cc_del(struct callout *c, struct callout_cpu *cc)
645 {
646 
647     if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0)
648         return;
649     c->c_func = NULL;
650     SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
651 }
652 
653 static void
654 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
655 #ifdef CALLOUT_PROFILING
656     int *mpcalls, int *lockcalls, int *gcalls,
657 #endif
658     int direct)
659 {
660     struct rm_priotracker tracker;
661     void (*c_func)(void *);
662     void *c_arg;
663     struct lock_class *class;
664     struct lock_object *c_lock;
665     uintptr_t lock_status;
666     int c_iflags;
667 #ifdef SMP
668     struct callout_cpu *new_cc;
669     void (*new_func)(void *);
670     void *new_arg;
671     int flags, new_cpu;
672     sbintime_t new_prec, new_time;
673 #endif
674 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
675     sbintime_t sbt1, sbt2;
676     struct timespec ts2;
677     static sbintime_t maxdt = 2 * SBT_1MS;    /* 2 msec */
678     static timeout_t *lastfunc;
679 #endif
680 
681     KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING,
682         ("softclock_call_cc: pend %p %x", c, c->c_iflags));
683     KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE,
684         ("softclock_call_cc: act %p %x", c, c->c_flags));
685     class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
686     lock_status = 0;
687     if (c->c_flags & CALLOUT_SHAREDLOCK) {
688         if (class == &lock_class_rm)
689             lock_status = (uintptr_t)&tracker;
690         else
691             lock_status = 1;
692     }
693     c_lock = c->c_lock;
694     c_func = c->c_func;
695     c_arg = c->c_arg;
696     c_iflags = c->c_iflags;
697     if (c->c_iflags & CALLOUT_LOCAL_ALLOC)
698         c->c_iflags = CALLOUT_LOCAL_ALLOC;
699     else
700         c->c_iflags &= ~CALLOUT_PENDING;
701 
702     cc_exec_curr(cc, direct) = c;
703     cc_exec_cancel(cc, direct) = false;
704     cc_exec_drain(cc, direct) = NULL;
705     CC_UNLOCK(cc);
706     if (c_lock != NULL) {
707         class->lc_lock(c_lock, lock_status);
708         /*
709          * The callout may have been cancelled
710          * while we switched locks.
711          */
712         if (cc_exec_cancel(cc, direct)) {
713             class->lc_unlock(c_lock);
714             goto skip;
715         }
716         /* The callout cannot be stopped now. */
717         cc_exec_cancel(cc, direct) = true;
718         if (c_lock == &Giant.lock_object) {
719 #ifdef CALLOUT_PROFILING
720             (*gcalls)++;
721 #endif
722             CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
723                 c, c_func, c_arg);
724         } else {
725 #ifdef CALLOUT_PROFILING
726             (*lockcalls)++;
727 #endif
728             CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
729                 c, c_func, c_arg);
730         }
731     } else {
732 #ifdef CALLOUT_PROFILING
733         (*mpcalls)++;
734 #endif
735         CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
736             c, c_func, c_arg);
737     }
738     KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
739         "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
740 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
741     sbt1 = sbinuptime();
742 #endif
743     THREAD_NO_SLEEPING();
744     SDT_PROBE1(callout_execute, , , callout__start, c);
745     c_func(c_arg);
746     SDT_PROBE1(callout_execute, , , callout__end, c);
747     THREAD_SLEEPING_OK();
748 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
749     sbt2 = sbinuptime();
750     sbt2 -= sbt1;
751     if (sbt2 > maxdt) {
752         if (lastfunc != c_func || sbt2 > maxdt * 2) {
753             ts2 = sbttots(sbt2);
754             printf(
755         "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
756                 c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
757         }
758         maxdt = sbt2;
759         lastfunc = c_func;
760     }
761 #endif
762     KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
763     CTR1(KTR_CALLOUT, "callout %p finished", c);
764     if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0)
765         class->lc_unlock(c_lock);
766 skip:
767     CC_LOCK(cc);
768     KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr"));
769     cc_exec_curr(cc, direct) = NULL;
770     if (cc_exec_drain(cc, direct)) {
771         void (*drain)(void *);
772 
773         drain = cc_exec_drain(cc, direct);
774         cc_exec_drain(cc, direct) = NULL;
775         CC_UNLOCK(cc);
776         drain(c_arg);
777         CC_LOCK(cc);
778     }
779     if (cc_exec_waiting(cc, direct)) {
780         /*
781          * There is someone waiting for the
782          * callout to complete.
783          * If the callout was scheduled for
784          * migration just cancel it.
785          */
786         if (cc_cce_migrating(cc, direct)) {
787             cc_cce_cleanup(cc, direct);
788 
789             /*
790              * We should assert here that the callout is not
791              * destroyed, but that is not easy.
792              */
793             c->c_iflags &= ~CALLOUT_DFRMIGRATION;
794         }
795         cc_exec_waiting(cc, direct) = false;
796         CC_UNLOCK(cc);
797         wakeup(&cc_exec_waiting(cc, direct));
798         CC_LOCK(cc);
799     } else if (cc_cce_migrating(cc, direct)) {
800         KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0,
801             ("Migrating legacy callout %p", c));
802 #ifdef SMP
803         /*
804          * If the callout was scheduled for
805          * migration just perform it now.
806          */
807         new_cpu = cc_migration_cpu(cc, direct);
808         new_time = cc_migration_time(cc, direct);
809         new_prec = cc_migration_prec(cc, direct);
810         new_func = cc_migration_func(cc, direct);
811         new_arg = cc_migration_arg(cc, direct);
812         cc_cce_cleanup(cc, direct);
813 
814         /*
815          * We should assert here that the callout is not destroyed,
816          * but that is not easy.
817          *
818          * As a first step, handle deferred callout stops.
819          */
820         if (!callout_migrating(c)) {
821             CTR3(KTR_CALLOUT,
822                  "deferred cancelled %p func %p arg %p",
823                  c, new_func, new_arg);
824             callout_cc_del(c, cc);
825             return;
826         }
827         c->c_iflags &= ~CALLOUT_DFRMIGRATION;
828 
829         new_cc = callout_cpu_switch(c, cc, new_cpu);
830         flags = (direct) ? C_DIRECT_EXEC : 0;
831         callout_cc_add(c, new_cc, new_time, new_prec, new_func,
832             new_arg, new_cpu, flags);
833         CC_UNLOCK(new_cc);
834         CC_LOCK(cc);
835 #else
836         panic("migration should not happen");
837 #endif
838     }
839     /*
840      * If the current callout is locally allocated (from
841      * timeout(9)) then put it on the freelist.
842      *
843      * Note: we need to check the cached copy of c_iflags because
844      * if it was not local, then it's not safe to deref the
845      * callout pointer.
846      */
847     KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 ||
848         c->c_iflags == CALLOUT_LOCAL_ALLOC,
849         ("corrupted callout"));
850     if (c_iflags & CALLOUT_LOCAL_ALLOC)
851         callout_cc_del(c, cc);
852 }
853 
854 /*
855  * The callout mechanism is based on the work of Adam M. Costello and
856  * George Varghese, published in a technical report entitled "Redesigning
857  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
858  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
859  * used in this implementation was published by G. Varghese and T. Lauck in
860  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
861  * the Efficient Implementation of a Timer Facility" in the Proceedings of
862  * the 11th ACM Annual Symposium on Operating Systems Principles,
863  * Austin, Texas Nov 1987.
864  */
865 
866 /*
867  * Software (low priority) clock interrupt.
868  * Run periodic events from timeout queue.
869  */
870 void
871 softclock(void *arg)
872 {
873     struct callout_cpu *cc;
874     struct callout *c;
875 #ifdef CALLOUT_PROFILING
876     int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
877 #endif
878 
879     cc = (struct callout_cpu *)arg;
880     CC_LOCK(cc);
881     while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
882         TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
883         softclock_call_cc(c, cc,
884 #ifdef CALLOUT_PROFILING
885             &mpcalls, &lockcalls, &gcalls,
886 #endif
887             0);
888 #ifdef CALLOUT_PROFILING
889         ++depth;
890 #endif
891     }
892 #ifdef CALLOUT_PROFILING
893     avg_depth += (depth * 1000 - avg_depth) >> 8;
894     avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
895     avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
896     avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
897 #endif
898     CC_UNLOCK(cc);
899 }
900 
901 /*
902  * timeout --
903  *    Execute a function after a specified length of time.
904  *
905  * untimeout --
906  *    Cancel previous timeout function call.
907  *
908  * callout_handle_init --
909  *    Initialize a handle so that using it with untimeout is benign.
910  *
911  *    See AT&T BCI Driver Reference Manual for specification.  This
912  *    implementation differs from that one in that although an
913  *    identification value is returned from timeout, the original
914  *    arguments to timeout as well as the identifier are used to
915  *    identify entries for untimeout.
916  */
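/*
 * A hedged usage sketch of the timeout(9) interface documented above,
 * written as consumer-side code; foo_softc, foo_tick and the field
 * foo_timeout_h are illustrative names, not part of this file:
 *
 *    struct foo_softc {
 *        struct callout_handle foo_timeout_h;
 *    };
 *
 *    static void
 *    foo_tick(void *arg)
 *    {
 *        struct foo_softc *sc = arg;
 *
 *        // ... periodic work ...
 *        sc->foo_timeout_h = timeout(foo_tick, sc, hz);  // re-arm in 1 s
 *    }
 *
 *    // Setup: make the handle safe to pass to untimeout() even if the
 *    // timeout was never started, then start it.
 *    callout_handle_init(&sc->foo_timeout_h);
 *    sc->foo_timeout_h = timeout(foo_tick, sc, hz);
 *
 *    // Teardown: untimeout() matches on (function, argument, handle).
 *    untimeout(foo_tick, sc, sc->foo_timeout_h);
 */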
917 struct callout_handle
918 timeout(timeout_t *ftn, void *arg, int to_ticks)
919 {
920     struct callout_cpu *cc;
921     struct callout *new;
922     struct callout_handle handle;
923 
924     cc = CC_CPU(timeout_cpu);
925     CC_LOCK(cc);
926     /* Fill in the next free callout structure. */
927     new = SLIST_FIRST(&cc->cc_callfree);
928     if (new == NULL)
929         /* XXX Attempt to malloc first */
930         panic("timeout table full");
931     SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
932     callout_reset(new, to_ticks, ftn, arg);
933     handle.callout = new;
934     CC_UNLOCK(cc);
935 
936     return (handle);
937 }
938 
939 void
940 untimeout(timeout_t *ftn, void *arg, struct callout_handle handle)
941 {
942     struct callout_cpu *cc;
943 
944     /*
945      * Check for a handle that was initialized
946      * by callout_handle_init, but never used
947      * for a real timeout.
948      */
949     if (handle.callout == NULL)
950         return;
951 
952     cc = callout_lock(handle.callout);
953     if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
954         callout_stop(handle.callout);
955     CC_UNLOCK(cc);
956 }
957 
958 void
959 callout_handle_init(struct callout_handle *handle)
960 {
961     handle->callout = NULL;
962 }
963 
964 /*
965  * New interface; clients allocate their own callout structures.
966  *
967  * callout_reset() - establish or change a timeout
968  * callout_stop() - disestablish a timeout
969  * callout_init() - initialize a callout structure so that it can
970  *    safely be passed to callout_reset() and callout_stop()
971  *
972  * <sys/callout.h> defines three convenience macros:
973  *
974  * callout_active() - returns truth if callout has not been stopped,
975  *    drained, or deactivated since the last time the callout was
976  *    reset.
977  * callout_pending() - returns truth if callout is still waiting for timeout
978  * callout_deactivate() - marks the callout as having been serviced
979  */
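/*
 * A hedged usage sketch of the callout(9) interface documented above,
 * using the callout_init_mtx()/callout_reset_sbt() wrappers from
 * <sys/callout.h>; bar_softc, bar_expire and bar_mtx are illustrative
 * names, not part of this file:
 *
 *    struct bar_softc {
 *        struct mtx     bar_mtx;
 *        struct callout bar_callout;
 *    };
 *
 *    static void
 *    bar_expire(void *arg)
 *    {
 *        struct bar_softc *sc = arg;
 *
 *        // Runs with bar_mtx held because of callout_init_mtx() below.
 *        callout_deactivate(&sc->bar_callout);
 *        // ... handle the expiry ...
 *    }
 *
 *    mtx_init(&sc->bar_mtx, "bar", NULL, MTX_DEF);
 *    callout_init_mtx(&sc->bar_callout, &sc->bar_mtx, 0);
 *
 *    mtx_lock(&sc->bar_mtx);
 *    // Fire in ~2 s; C_PREL(5) allows a precision of interval/32 so the
 *    // wheel can aggregate this event with neighbouring ones.
 *    callout_reset_sbt(&sc->bar_callout, 2 * SBT_1S, 0, bar_expire, sc,
 *        C_PREL(5));
 *    mtx_unlock(&sc->bar_mtx);
 *
 *    // Synchronous cancellation; bar_mtx must not be held here because
 *    // callout_drain() may sleep waiting for a running handler.
 *    callout_drain(&sc->bar_callout);
 */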
980 int
981 callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
982     void (*ftn)(void *), void *arg, int cpu, int flags)
983 {
984     sbintime_t to_sbt, pr;
985     struct callout_cpu *cc;
986     int cancelled, direct;
987     int ignore_cpu=0;
988 
989     cancelled = 0;
990     if (cpu == -1) {
991         ignore_cpu = 1;
992     } else if ((cpu >= MAXCPU) ||
993            ((CC_CPU(cpu))->cc_inited == 0)) {
994         /* Invalid CPU spec */
995         panic("Invalid CPU in callout %d", cpu);
996     }
997     if (flags & C_ABSOLUTE) {
998         to_sbt = sbt;
999     } else {
1000         if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
1001             sbt = tick_sbt;
1002         if ((flags & C_HARDCLOCK) ||
1003 #ifdef NO_EVENTTIMERS
1004             sbt >= sbt_timethreshold) {
1005             to_sbt = getsbinuptime();
1006 
1007             /* Add safety belt for the case of hz > 1000. */
1008             to_sbt += tc_tick_sbt - tick_sbt;
1009 #else
1010             sbt >= sbt_tickthreshold) {
1011             /*
1012              * Obtain the time of the last hardclock() call on
1013              * this CPU directly from the kern_clocksource.c.
1014              * This value is per-CPU, but it is equal for all
1015              * active ones.
1016              */
1017 #ifdef __LP64__
1018             to_sbt = DPCPU_GET(hardclocktime);
1019 #else
1020             spinlock_enter();
1021             to_sbt = DPCPU_GET(hardclocktime);
1022             spinlock_exit();
1023 #endif
1024 #endif
1025             if ((flags & C_HARDCLOCK) == 0)
1026                 to_sbt += tick_sbt;
1027         } else
1028             to_sbt = sbinuptime();
1029         if (SBT_MAX - to_sbt < sbt)
1030             to_sbt = SBT_MAX;
1031         else
1032             to_sbt += sbt;
1033         pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
1034             sbt >> C_PRELGET(flags));
1035         if (pr > precision)
1036             precision = pr;
1037     }
1038     /*
1039      * This flag used to be added by callout_cc_add, but the
1040      * first time you call this we could end up with the
1041      * wrong direct flag if we don't do it before we add.
1042      */
1043     if (flags & C_DIRECT_EXEC) {
1044         direct = 1;
1045     } else {
1046         direct = 0;
1047     }
1048     KASSERT(!direct || c->c_lock == NULL,
1049         ("%s: direct callout %p has lock", __func__, c));
1050     cc = callout_lock(c);
1051     /*
1052      * Don't allow migration of pre-allocated callouts lest they
1053      * become unbalanced; also handle the case where the user does
1054      * not care which CPU is used.
1055      */
1056     if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) ||
1057         ignore_cpu) {
1058         cpu = c->c_cpu;
1059     }
1060 
1061     if (cc_exec_curr(cc, direct) == c) {
1062         /*
1063          * We're being asked to reschedule a callout which is
1064          * currently in progress.  If there is a lock then we
1065          * can cancel the callout if it has not really started.
1066          */
1067         if (c->c_lock != NULL && !cc_exec_cancel(cc, direct))
1068             cancelled = cc_exec_cancel(cc, direct) = true;
1069         if (cc_exec_waiting(cc, direct)) {
1070             /*
1071              * Someone has called callout_drain to kill this
1072              * callout.  Don't reschedule.
1073              */
1074             CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
1075                 cancelled ? "cancelled" : "failed to cancel",
1076                 c, c->c_func, c->c_arg);
1077             CC_UNLOCK(cc);
1078             return (cancelled);
1079         }
1080 #ifdef SMP
1081         if (callout_migrating(c)) {
1082             /*
1083              * This only occurs when a second callout_reset_sbt_on
1084              * is made after a previous one moved it into
1085              * deferred migration (below). Note we do *not* change
1086              * the prev_cpu even though the previous target may
1087              * be different.
1088              */
1089             cc_migration_cpu(cc, direct) = cpu;
1090             cc_migration_time(cc, direct) = to_sbt;
1091             cc_migration_prec(cc, direct) = precision;
1092             cc_migration_func(cc, direct) = ftn;
1093             cc_migration_arg(cc, direct) = arg;
1094             cancelled = 1;
1095             CC_UNLOCK(cc);
1096             return (cancelled);
1097         }
1098 #endif
1099     }
1100     if (c->c_iflags & CALLOUT_PENDING) {
1101         if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
1102             if (cc_exec_next(cc) == c)
1103                 cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
1104             LIST_REMOVE(c, c_links.le);
1105         } else {
1106             TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
1107         }
1108         cancelled = 1;
1109         c->c_iflags &= ~ CALLOUT_PENDING;
1110         c->c_flags &= ~ CALLOUT_ACTIVE;
1111     }
1112 
1113 #ifdef SMP
1114     /*
1115      * If the callout must migrate try to perform it immediately.
1116      * If the callout is currently running, just defer the migration
1117      * to a more appropriate moment.
1118      */
1119     if (c->c_cpu != cpu) {
1120         if (cc_exec_curr(cc, direct) == c) {
1121             /*
1122              * Pending will have been removed since we are
1123              * actually executing the callout on another
1124              * CPU. That callout should be waiting on the
1125              * lock the caller holds. If we set both
1126              * active/and/pending after we return and the
1127              * lock on the executing callout proceeds, it
1128              * will then see pending is true and return.
1129              * At the return from the actual callout execution
1130              * the migration will occur in softclock_call_cc
1131              * and this new callout will be placed on the
1132              * new CPU via a call to callout_cpu_switch() which
1133              * will get the lock on the right CPU followed
1134              * by a call to callout_cc_add() which will add it there.
1135              * (see above in softclock_call_cc()).
1136              */
1137             cc_migration_cpu(cc, direct) = cpu;
1138             cc_migration_time(cc, direct) = to_sbt;
1139             cc_migration_prec(cc, direct) = precision;
1140             cc_migration_func(cc, direct) = ftn;
1141             cc_migration_arg(cc, direct) = arg;
1142             c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING);
1143             c->c_flags |= CALLOUT_ACTIVE;
1144             CTR6(KTR_CALLOUT,
1145             "migration of %p func %p arg %p in %d.%08x to %u deferred",
1146                 c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
1147                 (u_int)(to_sbt & 0xffffffff), cpu);
1148             CC_UNLOCK(cc);
1149             return (cancelled);
1150         }
1151         cc = callout_cpu_switch(c, cc, cpu);
1152     }
1153 #endif
1154 
1155     callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
1156     CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
1157         cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
1158         (u_int)(to_sbt & 0xffffffff));
1159     CC_UNLOCK(cc);
1160 
1161     return (cancelled);
1162 }
1163 
1164 /*
1165  * Common idioms that can be optimized in the future.
1166  */
1167 int
1168 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
1169 {
1170     return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
1171 }
1172 
1173 int
1174 callout_schedule(struct callout *c, int to_ticks)
1175 {
1176     return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu);
1177 }
1178 
1179 int
1180 _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *))
1181 {
1182     struct callout_cpu *cc, *old_cc;
1183     struct lock_class *class;
1184     int direct, sq_locked, use_lock;
1185     int cancelled, not_on_a_list;
1186 
1187     if ((flags & CS_DRAIN) != 0)
1188         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock,
1189             "calling %s", __func__);
1190 
1191     /*
1192      * Some old subsystems don't hold Giant while running a callout_stop(),
1193      * so just discard this check for the moment.
1194      */
1195     if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) {
1196         if (c->c_lock == &Giant.lock_object)
1197             use_lock = mtx_owned(&Giant);
1198         else {
1199             use_lock = 1;
1200             class = LOCK_CLASS(c->c_lock);
1201             class->lc_assert(c->c_lock, LA_XLOCKED);
1202         }
1203     } else
1204         use_lock = 0;
1205     if (c->c_iflags & CALLOUT_DIRECT) {
1206         direct = 1;
1207     } else {
1208         direct = 0;
1209     }
1210     sq_locked = 0;
1211     old_cc = NULL;
1212 again:
1213     cc = callout_lock(c);
1214 
1215     if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) ==
1216         (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) &&
1217         ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) {
1218         /*
1219          * Special case where this slipped in while we
1220          * were migrating *as* the callout is about to
1221          * execute. The caller probably holds the lock
1222          * the callout wants.
1223          *
1224          * Get rid of the migration first. Then set
1225          * the flag that tells this code *not* to
1226              * try to remove it from any lists (it's not
1227          * on one yet). When the callout wheel runs,
1228          * it will ignore this callout.
1229          */
1230         c->c_iflags &= ~CALLOUT_PENDING;
1231         c->c_flags &= ~CALLOUT_ACTIVE;
1232         not_on_a_list = 1;
1233     } else {
1234         not_on_a_list = 0;
1235     }
1236 
1237     /*
1238      * If the callout was migrating while the callout cpu lock was
1239      * dropped,  just drop the sleepqueue lock and check the states
1240      * again.
1241      */
1242     if (sq_locked != 0 && cc != old_cc) {
1243 #ifdef SMP
1244         CC_UNLOCK(cc);
1245         sleepq_release(&cc_exec_waiting(old_cc, direct));
1246         sq_locked = 0;
1247         old_cc = NULL;
1248         goto again;
1249 #else
1250         panic("migration should not happen");
1251 #endif
1252     }
1253 
1254     /*
1255      * If the callout is running, try to stop it or drain it.
1256      */
1257     if (cc_exec_curr(cc, direct) == c) {
1258         /*
1259          * Whether or not we succeed in stopping it, we must clear
1260          * the active flag - this is what API users expect.
1261          */
1262         c->c_flags &= ~CALLOUT_ACTIVE;
1263 
1264         if ((flags & CS_DRAIN) != 0) {
1265             /*
1266              * The current callout is running (or just
1267              * about to run) and blocking is allowed, so
1268              * just wait for the current invocation to
1269              * finish.
1270              */
1271             while (cc_exec_curr(cc, direct) == c) {
1272                 /*
1273                  * Use direct calls to sleepqueue interface
1274                  * instead of cv/msleep in order to avoid
1275                  * a LOR between cc_lock and sleepqueue
1276                  * chain spinlocks.  This piece of code
1277                  * emulates a msleep_spin() call actually.
1278                  *
1279                  * If we already have the sleepqueue chain
1280                  * locked, then we can safely block.  If we
1281                  * don't already have it locked, however,
1282                  * we have to drop the cc_lock to lock
1283                  * it.  This opens several races, so we
1284                  * restart at the beginning once we have
1285                  * both locks.  If nothing has changed, then
1286                  * we will end up back here with sq_locked
1287                  * set.
1288                  */
1289                 if (!sq_locked) {
1290                     CC_UNLOCK(cc);
1291                     sleepq_lock(
1292                         &cc_exec_waiting(cc, direct));
1293                     sq_locked = 1;
1294                     old_cc = cc;
1295                     goto again;
1296                 }
1297 
1298                 /*
1299                  * Migration could be cancelled here, but
1300                  * as long as it is still not certain when it
1301                  * will be wrapped up, just let softclock()
1302                  * take care of it.
1303                  */
1304                 cc_exec_waiting(cc, direct) = true;
1305                 DROP_GIANT();
1306                 CC_UNLOCK(cc);
1307                 sleepq_add(
1308                     &cc_exec_waiting(cc, direct),
1309                     &cc->cc_lock.lock_object, "codrain",
1310                     SLEEPQ_SLEEP, 0);
1311                 sleepq_wait(
1312                     &cc_exec_waiting(cc, direct),
1313                          0);
1314                 sq_locked = 0;
1315                 old_cc = NULL;
1316 
1317                 /* Reacquire locks previously released. */
1318                 PICKUP_GIANT();
1319                 CC_LOCK(cc);
1320             }
1321         } else if (use_lock &&
1322                !cc_exec_cancel(cc, direct) && (drain == NULL)) {
1323 
1324             /*
1325              * The current callout is waiting for its
1326              * lock which we hold.  Cancel the callout
1327              * and return.  After our caller drops the
1328              * lock, the callout will be skipped in
1329              * softclock(). This *only* works with a
1330              * callout_stop() *not* callout_drain() or
1331              * callout_async_drain().
1332              */
1333             cc_exec_cancel(cc, direct) = true;
1334             CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
1335                 c, c->c_func, c->c_arg);
1336             KASSERT(!cc_cce_migrating(cc, direct),
1337                 ("callout wrongly scheduled for migration"));
1338             if (callout_migrating(c)) {
1339                 c->c_iflags &= ~CALLOUT_DFRMIGRATION;
1340 #ifdef SMP
1341                 cc_migration_cpu(cc, direct) = CPUBLOCK;
1342                 cc_migration_time(cc, direct) = 0;
1343                 cc_migration_prec(cc, direct) = 0;
1344                 cc_migration_func(cc, direct) = NULL;
1345                 cc_migration_arg(cc, direct) = NULL;
1346 #endif
1347             }
1348             CC_UNLOCK(cc);
1349             KASSERT(!sq_locked, ("sleepqueue chain locked"));
1350             return (1);
1351         } else if (callout_migrating(c)) {
1352             /*
1353              * The callout is currently being serviced
1354              * and the "next" callout is scheduled at
1355              * its completion with a migration. We remove
1356              * the migration flag so it *won't* get rescheduled,
1357              * but we can't stop the one that's running, so
1358              * we return 0.
1359              */
1360             c->c_iflags &= ~CALLOUT_DFRMIGRATION;
1361 #ifdef SMP
1362             /*
1363              * We can't call cc_cce_cleanup here since
1364              * if we do it will clear cc_curr while
1365              * it's still running. This will prevent a
1366              * reschedule of the callout when the
1367              * execution completes.
1368              */
1369             cc_migration_cpu(cc, direct) = CPUBLOCK;
1370             cc_migration_time(cc, direct) = 0;
1371             cc_migration_prec(cc, direct) = 0;
1372             cc_migration_func(cc, direct) = NULL;
1373             cc_migration_arg(cc, direct) = NULL;
1374 #endif
1375             CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
1376                 c, c->c_func, c->c_arg);
1377              if (drain) {
1378                 cc_exec_drain(cc, direct) = drain;
1379             }
1380             CC_UNLOCK(cc);
1381             return ((flags & CS_EXECUTING) != 0);
1382         }
1383         CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
1384             c, c->c_func, c->c_arg);
1385         if (drain) {
1386             cc_exec_drain(cc, direct) = drain;
1387         }
1388         KASSERT(!sq_locked, ("sleepqueue chain still locked"));
1389         cancelled = ((flags & CS_EXECUTING) != 0);
1390     } else
1391         cancelled = 1;
1392 
1393     if (sq_locked)
1394         sleepq_release(&cc_exec_waiting(cc, direct));
1395 
1396     if ((c->c_iflags & CALLOUT_PENDING) == 0) {
1397         CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
1398             c, c->c_func, c->c_arg);
1399         /*
1400          * For a callout that is neither scheduled nor executing,
1401          * return a negative value.
1402          */
1403         if (cc_exec_curr(cc, direct) != c)
1404             cancelled = -1;
1405         CC_UNLOCK(cc);
1406         return (cancelled);
1407     }
1408 
1409     c->c_iflags &= ~CALLOUT_PENDING;
1410     c->c_flags &= ~CALLOUT_ACTIVE;
1411 
1412     CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
1413         c, c->c_func, c->c_arg);
1414     if (not_on_a_list == 0) {
1415         if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
1416             if (cc_exec_next(cc) == c)
1417                 cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
1418             LIST_REMOVE(c, c_links.le);
1419         } else {
1420             TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
1421         }
1422     }
1423     callout_cc_del(c, cc);
1424     CC_UNLOCK(cc);
1425     return (cancelled);
1426 }
1427 
1428 void
1429 callout_init(struct callout *c, int mpsafe)
1430 {
1431     bzero(c, sizeof *c);
1432     if (mpsafe) {
1433         c->c_lock = NULL;
1434         c->c_iflags = CALLOUT_RETURNUNLOCKED;
1435     } else {
1436         c->c_lock = &Giant.lock_object;
1437         c->c_iflags = 0;
1438     }
1439     c->c_cpu = timeout_cpu;
1440 }
1441 
1442 void
1443 _callout_init_lock(struct callout *c, struct lock_object *lock, int flags)
1444 {
1445     bzero(c, sizeof *c);
1446     c->c_lock = lock;
1447     KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
1448         ("callout_init_lock: bad flags %d", flags));
1449     KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
1450         ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
1451     KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
1452         (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
1453         __func__));
1454     c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
1455     c->c_cpu = timeout_cpu;
1456 }
1457 
1458 #ifdef APM_FIXUP_CALLTODO
1459 /*
1460  * Adjust the kernel calltodo timeout list.  This routine is used after
1461  * an APM resume to recalculate the calltodo timer list values with the
1462  * number of hz's we have been sleeping.  The next hardclock() will detect
1463  * that there are fired timers and run softclock() to execute them.
1464  *
1465  * Please note, I have not done an exhaustive analysis of what code this
1466  * might break.  I am motivated to have my select()'s and alarm()'s that
1467  * have expired during suspend firing upon resume so that the applications
1468  * which set the timer can do the maintenance the timer was for as close
1469  * as possible to the originally intended time.  Testing this code for a
1470  * week showed that resuming from a suspend resulted in 22 to 25 timers
1471  * firing, which seemed independent of whether the suspend was 2 hours or
1472  * 2 days.  Your mileage may vary.   - Ken Key <[email protected]>
1473  */
1474 void
1475 adjust_timeout_calltodo(struct timeval *time_change)
1476 {
1477     register struct callout *p;
1478     unsigned long delta_ticks;
1479 
1480     /*
1481      * How many ticks were we asleep?
1482      * (stolen from tvtohz()).
1483      */
1484 
1485     /* Don't do anything */
1486     if (time_change->tv_sec < 0)
1487         return;
1488     else if (time_change->tv_sec <= LONG_MAX / 1000000)
1489         delta_ticks = howmany(time_change->tv_sec * 1000000 +
1490             time_change->tv_usec, tick) + 1;
1491     else if (time_change->tv_sec <= LONG_MAX / hz)
1492         delta_ticks = time_change->tv_sec * hz +
1493             howmany(time_change->tv_usec, tick) + 1;
1494     else
1495         delta_ticks = LONG_MAX;
1496 
1497     if (delta_ticks > INT_MAX)
1498         delta_ticks = INT_MAX;
1499 
1500     /*
1501      * Now rip through the timer calltodo list looking for timers
1502      * to expire.
1503      */
1504 
1505     /* don't collide with softclock() */
1506     CC_LOCK(cc);
1507     for (p = calltodo.c_next; p != NULL; p = p->c_next) {
1508         p->c_time -= delta_ticks;
1509 
1510         /* Break if the timer had more time on it than delta_ticks */
1511         if (p->c_time > 0)
1512             break;
1513 
1514         /* take back the ticks the timer didn't use (p->c_time <= 0) */
1515         delta_ticks = -p->c_time;
1516     }
1517     CC_UNLOCK(cc);
1518 
1519     return;
1520 }
1521 #endif /* APM_FIXUP_CALLTODO */
1522 
1523 static int
1524 flssbt(sbintime_t sbt)
1525 {
1526 
1527     sbt += (uint64_t)sbt >> 1;
1528     if (sizeof(long) >= sizeof(sbintime_t))
1529         return (flsl(sbt));
1530     if (sbt >= SBT_1S)
1531         return (flsl(((uint64_t)sbt) >> 32) + 32);
1532     return (flsl(sbt));
1533 }
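/*
 * Note on flssbt(): adding half the value before taking fls() rounds to
 * (approximately) the nearest power of two instead of rounding down, so
 * the histograms below file each value under its closest 2^(i-1).
 * Worked example: flssbt(SBT_1S) computes fls(2^32 + 2^31) = 33, and the
 * report loop prints bucket 33 as 2^32 sbintime units, i.e. exactly 1 s.
 */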
1534 
1535 /*
1536  * Dump immediate statistic snapshot of the scheduled callouts.
1537  */
1538 static int
1539 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
1540 {
1541     struct callout *tmp;
1542     struct callout_cpu *cc;
1543     struct callout_list *sc;
1544     sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t;
1545     int ct[64], cpr[64], ccpbk[32];
1546     int error, val, i, count, tcum, pcum, maxc, c, medc;
1547 #ifdef SMP
1548     int cpu;
1549 #endif
1550 
1551     val = 0;
1552     error = sysctl_handle_int(oidp, &val, 0, req);
1553     if (error != 0 || req->newptr == NULL)
1554         return (error);
1555     count = maxc = 0;
1556     st = spr = maxt = maxpr = 0;
1557     bzero(ccpbk, sizeof(ccpbk));
1558     bzero(ct, sizeof(ct));
1559     bzero(cpr, sizeof(cpr));
1560     now = sbinuptime();
1561 #ifdef SMP
1562     CPU_FOREACH(cpu) {
1563         cc = CC_CPU(cpu);
1564 #else
1565         cc = CC_CPU(timeout_cpu);
1566 #endif
1567         CC_LOCK(cc);
1568         for (i = 0; i < callwheelsize; i++) {
1569             sc = &cc->cc_callwheel[i];
1570             c = 0;
1571             LIST_FOREACH(tmp, sc, c_links.le) {
1572                 c++;
1573                 t = tmp->c_time - now;
1574                 if (t < 0)
1575                     t = 0;
1576                 st += t / SBT_1US;
1577                 spr += tmp->c_precision / SBT_1US;
1578                 if (t > maxt)
1579                     maxt = t;
1580                 if (tmp->c_precision > maxpr)
1581                     maxpr = tmp->c_precision;
1582                 ct[flssbt(t)]++;
1583                 cpr[flssbt(tmp->c_precision)]++;
1584             }
1585             if (c > maxc)
1586                 maxc = c;
1587             ccpbk[fls(c + c / 2)]++;
1588             count += c;
1589         }
1590         CC_UNLOCK(cc);
1591 #ifdef SMP
1592     }
1593 #endif
1594 
1595     for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
1596         tcum += ct[i];
1597     medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
1598     for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
1599         pcum += cpr[i];
1600     medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
1601     for (i = 0, c = 0; i < 32 && c < count / 2; i++)
1602         c += ccpbk[i];
1603     medc = (i >= 2) ? (1 << (i - 2)) : 0;
1604 
1605     printf("Scheduled callouts statistic snapshot:\n");
1606     printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
1607         count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
1608     printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
1609         medc,
1610         count / callwheelsize / mp_ncpus,
1611         (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
1612         maxc);
1613     printf("  Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
1614         medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
1615         (st / count) / 1000000, (st / count) % 1000000,
1616         maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
1617     printf("  Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
1618         medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
1619         (spr / count) / 1000000, (spr / count) % 1000000,
1620         maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
1621     printf("  Distribution:       \tbuckets\t   time\t   tcum\t"
1622         "   prec\t   pcum\n");
1623     for (i = 0, tcum = pcum = 0; i < 64; i++) {
1624         if (ct[i] == 0 && cpr[i] == 0)
1625             continue;
1626         t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
1627         tcum += ct[i];
1628         pcum += cpr[i];
1629         printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
1630             t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
1631             i - 1 - (32 - CC_HASH_SHIFT),
1632             ct[i], tcum, cpr[i], pcum);
1633     }
1634     return (error);
1635 }
1636 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
1637     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1638     0, 0, sysctl_kern_callout_stat, "I",
1639     "Dump immediate statistic snapshot of the scheduled callouts");
1640 
1641 #ifdef FSTACK
1642 void ff_hardclock(void);
1643 
1644 void
1645 ff_hardclock(void)
1646 {
1647     atomic_add_int(&ticks, 1);
1648     callout_process(getsbinuptime());
1649     tc_ticktock(1);
1650     cpu_tick_calibration();
1651 
1652 #ifdef DEVICE_POLLING
1653     hardclock_device_poll();    /* this is very short and quick */
1654 #endif /* DEVICE_POLLING */
1655 }
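/*
 * Hedged sketch (an assumption about intended use, not code taken from
 * F-Stack itself): ff_hardclock() is meant to be driven roughly hz times
 * per second by the surrounding event loop, e.g.:
 *
 *    // inside the application's polling loop
 *    uint64_t now_ms = monotonic_ms();          // hypothetical helper
 *    if (now_ms - last_tick_ms >= 1000 / hz) {
 *        last_tick_ms = now_ms;
 *        ff_hardclock();   // advance ticks, callouts and the timecounter
 *    }
 *
 * Each call bumps the global tick counter, runs expired callouts via
 * callout_process(getsbinuptime()) and winds the timecounter machinery
 * (backed by the "fst clock" timecounter below) through tc_ticktock().
 */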
1656 
1657 static unsigned int
1658 ff_tc_get_timecount(struct timecounter *tc)
1659 {
1660     static u_int now;
1661     return (++now);
1662 }
1663 
1664 static struct timecounter ff_timecounter = {
1665     ff_tc_get_timecount, 0, ~0u, 100, "fst clock", 1
1666 };
1667 
1668 static void
1669 ff_tc_init(void)
1670 {
1671     ff_timecounter.tc_frequency = hz;
1672     tc_init(&ff_timecounter);
1673 }
1674 SYSINIT(ff_tc, SI_SUB_SMP, SI_ORDER_ANY, ff_tc_init, NULL);
1675 #endif
1676 
1677 #ifdef DDB
1678 static void
1679 _show_callout(struct callout *c)
1680 {
1681 
1682     db_printf("callout %p\n", c);
1683 #define    C_DB_PRINTF(f, e)    db_printf("   %s = " f "\n", #e, c->e);
1684     db_printf("   &c_links = %p\n", &(c->c_links));
1685     C_DB_PRINTF("%" PRId64,    c_time);
1686     C_DB_PRINTF("%" PRId64,    c_precision);
1687     C_DB_PRINTF("%p",    c_arg);
1688     C_DB_PRINTF("%p",    c_func);
1689     C_DB_PRINTF("%p",    c_lock);
1690     C_DB_PRINTF("%#x",    c_flags);
1691     C_DB_PRINTF("%#x",    c_iflags);
1692     C_DB_PRINTF("%d",    c_cpu);
1693 #undef    C_DB_PRINTF
1694 }
1695 
1696 DB_SHOW_COMMAND(callout, db_show_callout)
1697 {
1698 
1699     if (!have_addr) {
1700         db_printf("usage: show callout <struct callout *>\n");
1701         return;
1702     }
1703 
1704     _show_callout((struct callout *)addr);
1705 }
1706 #endif /* DDB */
1707