// SPDX-License-Identifier: GPL-2.0-only
/*
 * Context tracking: Probe on high-level context boundaries such as kernel
 * and userspace. This includes syscall and exception entry/exit.
 *
 * This is used by RCU to remove its dependency on the timer tick while a CPU
 * runs in userspace.
 *
 *  Started by Frederic Weisbecker:
 *
 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <[email protected]>
 *
 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
 *
 */

#include <linux/context_tracking.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <trace/events/rcu.h>


DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
	.dynticks_nesting = 1,
	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
#endif
	.state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
};
EXPORT_SYMBOL_GPL(context_tracking);

#ifdef CONFIG_CONTEXT_TRACKING_IDLE
#define TPS(x)  tracepoint_string(x)

/* Record the current task on dyntick-idle entry. */
static __always_inline void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}

/* Record no current task on dyntick-idle exit. */
static __always_inline void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}

/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
static __always_inline void rcu_dynticks_task_trace_enter(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
		current->trc_reader_special.b.need_mb = true;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
static __always_inline void rcu_dynticks_task_trace_exit(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
		current->trc_reader_special.b.need_mb = false;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/*
 * Record entry into an extended quiescent state.  This is only to be
 * called when not already in an extended quiescent state, that is,
 * RCU is watching prior to the call to this function and is no longer
 * watching upon return.
 */
static noinstr void ct_kernel_exit_state(int offset)
{
	int seq;

	/*
	 * CPUs seeing atomic_add_return() must see prior RCU read-side
	 * critical sections, and we also must force ordering with the
	 * next idle sojourn.
	 */
	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
	seq = ct_state_inc(offset);
	// RCU is no longer watching.  Better be in extended quiescent state!
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
}

/*
 * Record exit from an extended quiescent state.  This is only to be
 * called from an extended quiescent state, that is, RCU is not watching
 * prior to the call to this function and is watching upon return.
 */
static noinstr void ct_kernel_enter_state(int offset)
{
	int seq;

	/*
	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
	 * and we also must force ordering with the next RCU read-side
	 * critical section.
	 */
	seq = ct_state_inc(offset);
	// RCU is now watching.  Better not be in an extended quiescent state!
	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
}
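
/*
 * Editor's note (illustrative sketch, not from the original source): assuming
 * the layout in context_tracking_state.h, where the CONTEXT_* value occupies
 * the low bits of ct->state and RCU_DYNTICKS_IDX is the next higher bit, a
 * single ct_state_inc() both flips the "RCU is watching" bit and switches the
 * recorded context. For a user-mode round trip:
 *
 *	// kernel, RCU watching:  RCU_DYNTICKS_IDX set, low bits == CONTEXT_KERNEL
 *	ct_state_inc(RCU_DYNTICKS_IDX + CONTEXT_USER);
 *	// user, in EQS:          RCU_DYNTICKS_IDX clear, low bits == CONTEXT_USER
 *	ct_state_inc(RCU_DYNTICKS_IDX - CONTEXT_USER);
 *	// kernel, RCU watching:  RCU_DYNTICKS_IDX set, low bits == CONTEXT_KERNEL
 *
 * Each transition therefore needs exactly one fully ordered atomic operation,
 * and the bits above RCU_DYNTICKS_IDX keep advancing as a counter, which is
 * what lets remote CPUs notice that an EQS has been passed through.
 */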

/*
 * Enter an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
 * the possibility of usermode upcalls having messed up our count
 * of interrupt nesting level during the prior busy period.
 */
static void noinstr ct_kernel_exit(bool user, int offset)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
	WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     ct_dynticks_nesting() == 0);
	if (ct_dynticks_nesting() != 1) {
		// RCU will still be watching, so just do accounting and leave.
		ct->dynticks_nesting--;
		return;
	}

	instrumentation_begin();
	lockdep_assert_irqs_disabled();
	trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	rcu_preempt_deferred_qs(current);

	// instrumentation for the noinstr ct_kernel_exit_state()
	instrument_atomic_write(&ct->state, sizeof(ct->state));

	instrumentation_end();
	WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
	// RCU is watching here ...
	ct_kernel_exit_state(offset);
	// ... but is no longer watching here.
	rcu_dynticks_task_enter();
}

/*
 * Exit an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
 * allow for the possibility of usermode upcalls messing up our count of
 * interrupt nesting level during the busy period that is just now starting.
 */
static void noinstr ct_kernel_enter(bool user, int offset)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
	long oldval;

	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
	oldval = ct_dynticks_nesting();
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
	if (oldval) {
		// RCU was already watching, so just do accounting and leave.
		ct->dynticks_nesting++;
		return;
	}
	rcu_dynticks_task_exit();
	// RCU is not watching here ...
	ct_kernel_enter_state(offset);
	// ... but is watching here.
	instrumentation_begin();

	// instrumentation for the noinstr ct_kernel_enter_state()
	instrument_atomic_write(&ct->state, sizeof(ct->state));

	trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	WRITE_ONCE(ct->dynticks_nesting, 1);
	WARN_ON_ONCE(ct_dynticks_nmi_nesting());
	WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
	instrumentation_end();
}

/**
 * ct_nmi_exit - inform RCU of exit from NMI context
 *
 * If we are returning from the outermost NMI handler that interrupted an
 * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
 * to let the RCU grace-period handling know that the CPU is back to
 * being RCU-idle.
 *
 * If you add or remove a call to ct_nmi_exit(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_nmi_exit(void)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	instrumentation_begin();
	/*
	 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
	 * (We are exiting an NMI handler, so RCU better be paying attention
	 * to us!)
	 */
	WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());

	/*
	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
	 * leave it in non-RCU-idle state.
	 */
	if (ct_dynticks_nmi_nesting() != 1) {
		trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
				  ct_dynticks());
		WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
			   ct_dynticks_nmi_nesting() - 2);
		instrumentation_end();
		return;
	}

	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
	trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
	WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */

	// instrumentation for the noinstr ct_kernel_exit_state()
	instrument_atomic_write(&ct->state, sizeof(ct->state));
	instrumentation_end();

	// RCU is watching here ...
	ct_kernel_exit_state(RCU_DYNTICKS_IDX);
	// ... but is no longer watching here.

	if (!in_nmi())
		rcu_dynticks_task_enter();
}

/**
 * ct_nmi_enter - inform RCU of entry to NMI context
 *
 * If the CPU was idle from RCU's viewpoint, update ct->state and
 * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
 * that the CPU is active.  This implementation permits nested NMIs, as
 * long as the nesting level does not overflow an int.  (You will probably
 * run out of stack space first.)
 *
 * If you add or remove a call to ct_nmi_enter(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_nmi_enter(void)
{
	long incby = 2;
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	/* Complain about underflow. */
	WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);

	/*
	 * If idle from RCU viewpoint, atomically increment ->dynticks
	 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
	 * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
	 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
	 * to be in the outermost NMI handler that interrupted an RCU-idle
	 * period (observation due to Andy Lutomirski).
	 */
	if (rcu_dynticks_curr_cpu_in_eqs()) {

		if (!in_nmi())
			rcu_dynticks_task_exit();

		// RCU is not watching here ...
		ct_kernel_enter_state(RCU_DYNTICKS_IDX);
		// ... but is watching here.

		instrumentation_begin();
		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
		instrument_atomic_read(&ct->state, sizeof(ct->state));
		// instrumentation for the noinstr ct_kernel_enter_state()
		instrument_atomic_write(&ct->state, sizeof(ct->state));

		incby = 1;
	} else if (!in_nmi()) {
		instrumentation_begin();
		rcu_irq_enter_check_tick();
	} else {
		instrumentation_begin();
	}

	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
			  ct_dynticks_nmi_nesting(),
			  ct_dynticks_nmi_nesting() + incby, ct_dynticks());
	instrumentation_end();
	WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
		   ct_dynticks_nmi_nesting() + incby);
	barrier();
}
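
/*
 * Editor's note (illustrative sketch, not from the original source): arch NMI
 * entry code is expected to pair the two calls above around the handler body.
 * The function names below are hypothetical; on architectures using the
 * generic entry code this pairing is provided by irqentry_nmi_enter() and
 * irqentry_nmi_exit().
 *
 *	void example_arch_nmi_handler(struct pt_regs *regs)
 *	{
 *		ct_nmi_enter();			// RCU watches even if NMI hit idle/user
 *		example_handle_nmi(regs);	// RCU read-side sections now legal
 *		ct_nmi_exit();			// restore the interrupted RCU state
 *	}
 */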

/**
 * ct_idle_enter - inform RCU that current CPU is entering idle
 *
 * Enter idle mode, in other words, -leave- the mode in which RCU
 * read-side critical sections can occur.  (Though RCU read-side
 * critical sections can occur in irq handlers in idle, a possibility
 * handled by irq_enter() and irq_exit().)
 *
 * If you add or remove a call to ct_idle_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_idle_enter(void)
{
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
	ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
}
EXPORT_SYMBOL_GPL(ct_idle_enter);

/**
 * ct_idle_exit - inform RCU that current CPU is leaving idle
 *
 * Exit idle mode, in other words, -enter- the mode in which RCU
 * read-side critical sections can occur.
 *
 * If you add or remove a call to ct_idle_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_idle_exit(void)
{
	unsigned long flags;

	raw_local_irq_save(flags);
	ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
	raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ct_idle_exit);
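/*
 * Editor's note (illustrative sketch, not from the original source): the kind
 * of bracketing the idle/cpuidle core is expected to provide around the
 * low-level idle routine, with IRQs disabled as the WARN_ON_ONCE() in
 * ct_idle_enter() demands. The names example_idle_loop() and
 * example_arch_cpu_idle() are hypothetical.
 *
 *	static void example_idle_loop(void)
 *	{
 *		raw_local_irq_disable();
 *		ct_idle_enter();		// RCU stops watching this CPU
 *		example_arch_cpu_idle();	// halt; irqs go through ct_irq_enter()
 *		ct_idle_exit();			// RCU watches again
 *		raw_local_irq_enable();
 *	}
 */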

/**
 * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
 *
 * Enter an interrupt handler, which might possibly result in exiting
 * idle mode, in other words, entering the mode in which read-side critical
 * sections can occur.  The caller must have disabled interrupts.
 *
 * Note that the Linux kernel is fully capable of entering an interrupt
 * handler that it never exits, for example when doing upcalls to user mode!
 * This code assumes that the idle loop never does upcalls to user mode.
 * If your architecture's idle loop does do upcalls to user mode (or does
 * anything else that results in unbalanced calls to the irq_enter() and
 * irq_exit() functions), RCU will give you what you deserve, good and hard.
 * But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to ct_irq_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
noinstr void ct_irq_enter(void)
{
	lockdep_assert_irqs_disabled();
	ct_nmi_enter();
}

/**
 * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
 *
 * Exit from an interrupt handler, which might possibly result in entering
 * idle mode, in other words, leaving the mode in which read-side critical
 * sections can occur.  The caller must have disabled interrupts.
 *
 * This code assumes that the idle loop never does anything that might
 * result in unbalanced calls to irq_enter() and irq_exit().  If your
 * architecture's idle loop violates this assumption, RCU will give you what
 * you deserve, good and hard.  But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to ct_irq_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
noinstr void ct_irq_exit(void)
{
	lockdep_assert_irqs_disabled();
	ct_nmi_exit();
}
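
/*
 * Editor's note (illustrative sketch, not from the original source): how irq
 * entry code pairs the two calls above around interrupt handling, with IRQs
 * still disabled. In practice this pairing is usually reached through
 * irq_enter()/irq_exit(), which wrap these calls; the function names
 * example_arch_do_IRQ() and example_dispatch_irq() are hypothetical.
 *
 *	void example_arch_do_IRQ(struct pt_regs *regs)
 *	{
 *		ct_irq_enter();			// ends a possible RCU-idle period
 *		example_dispatch_irq(regs);	// RCU read-side sections now legal
 *		ct_irq_exit();			// may resume the RCU-idle period
 *	}
 */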

/*
 * Wrapper for ct_irq_enter() where interrupts are enabled.
 *
 * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void ct_irq_enter_irqson(void)
{
	unsigned long flags;

	local_irq_save(flags);
	ct_irq_enter();
	local_irq_restore(flags);
}

/*
 * Wrapper for ct_irq_exit() where interrupts are enabled.
 *
 * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void ct_irq_exit_irqson(void)
{
	unsigned long flags;

	local_irq_save(flags);
	ct_irq_exit();
	local_irq_restore(flags);
}
#else
static __always_inline void ct_kernel_exit(bool user, int offset) { }
static __always_inline void ct_kernel_enter(bool user, int offset) { }
#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */

#ifdef CONFIG_CONTEXT_TRACKING_USER

#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>

DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);

static noinstr bool context_tracking_recursion_enter(void)
{
	int recursion;

	recursion = __this_cpu_inc_return(context_tracking.recursion);
	if (recursion == 1)
		return true;

	WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
	__this_cpu_dec(context_tracking.recursion);

	return false;
}

static __always_inline void context_tracking_recursion_exit(void)
{
	__this_cpu_dec(context_tracking.recursion);
}

/**
 * __ct_user_enter - Inform the context tracking that the CPU is going
 *		     to enter user or guest space mode.
 *
 * This function must be called right before we switch from the kernel
 * to user or guest space, when it's guaranteed that the remaining kernel
 * instructions to execute won't run any RCU read-side critical section,
 * because this function puts RCU into an extended quiescent state.
 */
void noinstr __ct_user_enter(enum ctx_state state)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
	lockdep_assert_irqs_disabled();

	/* Kernel threads aren't supposed to go to userspace */
	WARN_ON_ONCE(!current->mm);

	if (!context_tracking_recursion_enter())
		return;

	if (__ct_state() != state) {
		if (ct->active) {
			/*
			 * At this stage, only low level arch entry code remains and
			 * then we'll run in userspace. We can assume there won't be
			 * any RCU read-side critical section until the next call to
			 * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
			 * on the tick.
			 */
			if (state == CONTEXT_USER) {
				instrumentation_begin();
				trace_user_enter(0);
				vtime_user_enter(current);
				instrumentation_end();
			}
			/*
			 * Unlike the generic entry implementation, we may be past the
			 * last rescheduling opportunity in the entry code. Trigger a
			 * self IPI that will fire and reschedule once we resume in
			 * user/guest mode.
			 */
			rcu_irq_work_resched();

			/*
			 * Enter RCU idle mode right before resuming userspace.  No use of RCU
			 * is permitted between this call and rcu_eqs_exit(). This way the
			 * CPU doesn't need to maintain the tick for RCU maintenance purposes
			 * when the CPU runs in userspace.
			 */
			ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);

			/*
			 * Special case if we only track user <-> kernel transitions for tickless
			 * cputime accounting but we don't support RCU extended quiescent state.
			 * In this case we don't care about any concurrency/ordering.
			 */
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
				atomic_set(&ct->state, state);
		} else {
			/*
			 * Even if context tracking is disabled on this CPU, because it's outside
			 * the full dynticks mask for example, we still have to keep track of the
			 * context transitions and states to prevent inconsistency on those of
			 * other CPUs.
			 * If a task triggers an exception in userspace, sleeps in the exception
			 * handler and then migrates to another CPU, that new CPU must know where
			 * the exception returns by the time we call exception_exit().
			 * This information can only be provided by the previous CPU when it called
			 * exception_enter().
			 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
			 * is false because we know that CPU is not tickless.
			 */
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
				/* Tracking for vtime only, no concurrent RCU EQS accounting */
				atomic_set(&ct->state, state);
			} else {
				/*
				 * Tracking for vtime and RCU EQS. Make sure we don't race
				 * with NMIs. OTOH we don't care about ordering here since
				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
				 * ordered.
				 */
				atomic_add(state, &ct->state);
			}
		}
	}
	context_tracking_recursion_exit();
}
EXPORT_SYMBOL_GPL(__ct_user_enter);
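
/*
 * Editor's note (illustrative sketch, not from the original source): arch
 * code normally reaches __ct_user_enter() through the user_enter_irqoff()
 * wrapper from <linux/context_tracking.h>, as the very last C step before
 * returning to userspace with IRQs disabled. The function name
 * example_arch_exit_to_user_mode() is hypothetical.
 *
 *	static void example_arch_exit_to_user_mode(void)
 *	{
 *		lockdep_assert_irqs_disabled();
 *		user_enter_irqoff();	// __ct_user_enter(CONTEXT_USER) if enabled
 *		// ...only noinstr/ASM code from here until the return...
 *	}
 */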

/*
 * OBSOLETE:
 * This function should be noinstr but the below local_irq_restore() is
 * unsafe because it involves illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call __ct_user_enter() through user_enter_irqoff()
 * or context_tracking_guest_enter(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void ct_user_enter(enum ctx_state state)
{
	unsigned long flags;

	/*
	 * Some contexts may involve an exception occurring in an irq,
	 * leading to that nesting:
	 * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
	 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
	 * helpers are enough to protect RCU uses inside the exception. So
	 * just return immediately if we detect we are in an IRQ.
	 */
	if (in_interrupt())
		return;

	local_irq_save(flags);
	__ct_user_enter(state);
	local_irq_restore(flags);
}
NOKPROBE_SYMBOL(ct_user_enter);
EXPORT_SYMBOL_GPL(ct_user_enter);

/**
 * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
 *			   archs that didn't manage to check the context tracking
 *			   static key from low level code.
 *
 * This OBSOLETE function should be noinstr but it unsafely calls
 * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call user_enter_irqoff(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void user_enter_callable(void)
{
	user_enter();
}
NOKPROBE_SYMBOL(user_enter_callable);

/**
 * __ct_user_exit - Inform the context tracking that the CPU is
 *		    exiting user or guest mode and entering the kernel.
 *
 * This function must be called after we entered the kernel from user or
 * guest space, before any use of RCU read-side critical sections. This
 * potentially includes any high-level kernel code such as syscalls,
 * exceptions, signal handling, etc...
 *
 * This call supports re-entrancy. This way it can be called from any exception
 * handler without needing to know if we came from userspace or not.
 */
void noinstr __ct_user_exit(enum ctx_state state)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	if (!context_tracking_recursion_enter())
		return;

	if (__ct_state() == state) {
		if (ct->active) {
			/*
			 * Exit RCU idle mode while entering the kernel because it can
			 * run an RCU read-side critical section at any time.
			 */
			ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
			if (state == CONTEXT_USER) {
				instrumentation_begin();
				vtime_user_exit(current);
				trace_user_exit(0);
				instrumentation_end();
			}

			/*
			 * Special case if we only track user <-> kernel transitions for tickless
			 * cputime accounting but we don't support RCU extended quiescent state.
			 * In this case we don't care about any concurrency/ordering.
			 */
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
				atomic_set(&ct->state, CONTEXT_KERNEL);

		} else {
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
				/* Tracking for vtime only, no concurrent RCU EQS accounting */
				atomic_set(&ct->state, CONTEXT_KERNEL);
			} else {
				/*
				 * Tracking for vtime and RCU EQS. Make sure we don't race
				 * with NMIs. OTOH we don't care about ordering here since
				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
				 * ordered.
				 */
				atomic_sub(state, &ct->state);
			}
		}
	}
	context_tracking_recursion_exit();
}
EXPORT_SYMBOL_GPL(__ct_user_exit);
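
/*
 * Editor's note (illustrative sketch, not from the original source): arch
 * code typically reaches __ct_user_exit() through the user_exit_irqoff()
 * wrapper from <linux/context_tracking.h>, right after entering the kernel
 * from userspace and before any RCU usage. The function name
 * example_arch_enter_from_user_mode() is hypothetical.
 *
 *	static void example_arch_enter_from_user_mode(void)
 *	{
 *		lockdep_assert_irqs_disabled();
 *		user_exit_irqoff();	// __ct_user_exit(CONTEXT_USER) if enabled
 *		// ...RCU read-side critical sections are legal again...
 *	}
 */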

/*
 * OBSOLETE:
 * This function should be noinstr but the below local_irq_save() is
 * unsafe because it involves illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call __ct_user_exit() through user_exit_irqoff()
 * or context_tracking_guest_exit(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void ct_user_exit(enum ctx_state state)
{
	unsigned long flags;

	if (in_interrupt())
		return;

	local_irq_save(flags);
	__ct_user_exit(state);
	local_irq_restore(flags);
}
NOKPROBE_SYMBOL(ct_user_exit);
EXPORT_SYMBOL_GPL(ct_user_exit);

/**
 * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
 *			  archs that didn't manage to check the context tracking
 *			  static key from low level code.
 *
 * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
 * involving illegal RCU uses through tracing and lockdep. This is unlikely
 * to be fixed as this function is obsolete. The preferred way is to call
 * user_exit_irqoff(). It should be the arch entry code's responsibility to
 * call into context tracking with IRQs disabled.
 */
void user_exit_callable(void)
{
	user_exit();
}
NOKPROBE_SYMBOL(user_exit_callable);

void __init ct_cpu_track_user(int cpu)
{
	static __initdata bool initialized = false;

	if (!per_cpu(context_tracking.active, cpu)) {
		per_cpu(context_tracking.active, cpu) = true;
		static_branch_inc(&context_tracking_key);
	}

	if (initialized)
		return;

#ifdef CONFIG_HAVE_TIF_NOHZ
	/*
	 * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork.
	 * This assumes that init is the only task at this early boot stage.
	 */
	set_tsk_thread_flag(&init_task, TIF_NOHZ);
#endif
	WARN_ON_ONCE(!tasklist_empty());

	initialized = true;
}
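
/*
 * Editor's note (not from the original source): ct_cpu_track_user() is
 * normally invoked either for every possible CPU when
 * CONFIG_CONTEXT_TRACKING_USER_FORCE=y (see context_tracking_init() below)
 * or, on NO_HZ_FULL kernels, for the CPUs selected at boot, e.g.:
 *
 *	nohz_full=1-7		# track user/kernel transitions on CPUs 1-7
 *
 * The exact wiring of the nohz_full= path lives in the tick/nohz code rather
 * than in this file.
 */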

#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		ct_cpu_track_user(cpu);
}
#endif

#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */