xref: /linux-6.15/kernel/entry/common.c (revision caf4062e)
// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

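/*
 * Record the syscall and its first four arguments with the audit
 * subsystem when the current task has an active audit context.
 */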
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

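/*
 * Perform all syscall entry work that was flagged in syscall_work:
 * syscall user dispatch, ptrace reporting, seccomp, the sys_enter
 * tracepoint and audit. Returns the (possibly rewritten) syscall number
 * to dispatch, or -1 if the syscall has to be skipped.
 */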
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}
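
/*
 * The prepare/work split above exists for architectures that have to do
 * extra work with interrupts enabled between establishing kernel state
 * and running the syscall entry work (on x86, for instance, the 32-bit
 * fast syscall path fetches the sixth argument from the user stack,
 * which may fault). A rough sketch of such a caller; fetch_user_arg()
 * is purely illustrative, not a real kernel function:
 *
 *	syscall_enter_from_user_mode_prepare(regs);
 *	// IRQs are on now, user memory may be touched
 *	fetch_user_arg(regs);
 *	nr = syscall_enter_from_user_mode_work(regs, nr);
 */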

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

/**
 * exit_to_user_mode_loop - do any pending work before leaving to user space
 * @regs:	Pointer to pt_regs on entry stack
 * @ti_work:	TIF work flags as read by the caller
 */
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
						     unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}
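
/*
 * For orientation: the caller of this loop, exit_to_user_mode_prepare()
 * in the generic entry code, roughly has the following shape (a
 * simplified sketch, not a verbatim copy of that code):
 *
 *	ti_work = read_thread_flags();
 *	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
 *		ti_work = exit_to_user_mode_loop(regs, ti_work);
 *	arch_exit_to_user_mode_prepare(regs, ti_work);
 *
 * i.e. the loop only runs when work is pending and the final work state
 * is handed to the architecture hook.
 */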

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

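/*
 * Run the syscall exit work: audit, the sys_exit tracepoint and the
 * ptrace exit report. Skipped entirely when the syscall was blocked by
 * syscall user dispatch, since no entry work was done for it either.
 */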
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked, for the same reason the
	 * entry side was not invoked in syscall_trace_enter(): the ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	exit_to_user_mode();
}
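
/*
 * How the pieces above are typically used by an architecture's syscall
 * entry point, loosely modeled on the x86-64 flow; arch_dispatch_syscall()
 * is illustrative only, not a real kernel function:
 *
 *	noinstr void arch_syscall_entry(struct pt_regs *regs, int nr)
 *	{
 *		nr = syscall_enter_from_user_mode(regs, nr);
 *
 *		instrumentation_begin();
 *		arch_dispatch_syscall(regs, nr);	// look up and call sys_xyz()
 *		instrumentation_end();
 *
 *		syscall_exit_to_user_mode(regs);
 *	}
 */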

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

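/*
 * Establish the proper kernel context state (lockdep, RCU/context
 * tracking, tracing) for an interrupt or exception taken from either
 * user or kernel mode. The returned state must be handed back to
 * irqentry_exit() on the way out.
 */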
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task, invoke ct_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return, which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nested
	 * interrupt from invoking ct_irq_enter(). If that nested interrupt is
	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke ct_irq_enter() so the RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when it is enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		ct_irq_enter();
		instrumentation_begin();
		kmsan_unpoison_entry_regs(regs);
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

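/*
 * Preempt on return from interrupt to kernel mode when possible. With
 * CONFIG_PREEMPT_DYNAMIC the irqentry_exit_cond_resched() call made from
 * irqentry_exit() is routed through a static call or static key (see
 * below), so the preemption model can be selected at boot time.
 */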
void raw_irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare();
			instrumentation_end();
			ct_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			ct_irq_exit();
	}
}
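
/*
 * Typical use of the irqentry_enter()/irqentry_exit() pair from an
 * architecture's interrupt or exception handler wrapper (a minimal
 * sketch; handle_my_irq() is illustrative only):
 *
 *	irqentry_state_t state = irqentry_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_my_irq(regs);
 *	instrumentation_end();
 *
 *	irqentry_exit(regs, state);
 */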

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	ct_nmi_enter();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
	}
	instrumentation_end();

	ct_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}
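
/*
 * The NMI variants are used the same way as the regular irqentry pair,
 * from an architecture's NMI handler (a minimal sketch; handle_my_nmi()
 * is illustrative only):
 *
 *	irqentry_state_t irq_state = irqentry_nmi_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_my_nmi(regs);
 *	instrumentation_end();
 *
 *	irqentry_nmi_exit(regs, irq_state);
 */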