/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/static_call_types.h>
#include <linux/ptrace.h>
#include <linux/syscalls.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
#include <linux/context_tracking.h>
#include <linux/livepatch.h>
#include <linux/resume_user_mode.h>
#include <linux/tick.h>

#include <asm/entry-common.h>

/*
 * Define dummy _TIF work flags if not defined by the architecture or for
 * disabled functionality.
 */
#ifndef _TIF_PATCH_PENDING
# define _TIF_PATCH_PENDING		(0)
#endif

#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER	(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT		(0)
#endif

#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 ARCH_SYSCALL_WORK_ENTER)
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP |	\
				 ARCH_SYSCALL_WORK_EXIT)

/*
 * TIF flags handled in exit_to_user_mode_loop()
 */
#ifndef ARCH_EXIT_TO_USER_MODE_WORK
# define ARCH_EXIT_TO_USER_MODE_WORK	(0)
#endif

#define EXIT_TO_USER_MODE_WORK						\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |	\
	 ARCH_EXIT_TO_USER_MODE_WORK)

/**
 * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
 * @regs:	Pointer to current's pt_regs
 *
 * Defaults to an empty implementation. Can be replaced by architecture
 * specific code.
 *
 * Invoked from syscall_enter_from_user_mode() in the non-instrumentable
 * section. Use __always_inline so the compiler cannot push it out of line
 * and make it instrumentable.
 */
static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs);

#ifndef arch_enter_from_user_mode
static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
#endif

/**
 * enter_from_user_mode - Establish state when coming from user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct and interrupts are still
 * disabled. The subsequent functions can be instrumented.
 *
 * This is invoked when there is architecture specific functionality to be
 * done between establishing state and enabling interrupts. The caller must
 * enable interrupts before invoking syscall_enter_from_user_mode_work().
 */
void enter_from_user_mode(struct pt_regs *regs);

/**
 * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This handles lockdep, RCU (context tracking) and tracing state, i.e.
 * the functionality provided by enter_from_user_mode().
 *
 * This is invoked when there is extra architecture specific functionality
 * to be done between establishing state and handling user mode entry work.
 */
void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);

/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *				       a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
 * architecture specific work.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first. If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * It handles the following work items:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 */
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);

/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is combination of syscall_enter_from_user_mode_prepare() and
 * syscall_enter_from_user_mode_work().
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);

/**
 * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Defaults to local_irq_enable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_enable_exit_to_user(unsigned long ti_work);

#ifndef local_irq_enable_exit_to_user
static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
{
	local_irq_enable();
}
#endif

/**
 * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
 *
 * Defaults to local_irq_disable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_disable_exit_to_user(void);

#ifndef local_irq_disable_exit_to_user
static inline void local_irq_disable_exit_to_user(void)
{
	local_irq_disable();
}
#endif

/**
 * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
 *				 to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_loop() with interrupts enabled
 *
 * Defaults to NOOP. Can be supplied by architecture specific code.
 */
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work);

#ifndef arch_exit_to_user_mode_work
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work)
{
}
#endif

/**
 * arch_exit_to_user_mode_prepare - Architecture specific preparation for
 *				    exit to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_prepare() with interrupts disabled as the
 * last function before return. Defaults to NOOP.
 */
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work);

#ifndef arch_exit_to_user_mode_prepare
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work)
{
}
#endif

/**
 * arch_exit_to_user_mode - Architecture specific final work before
 *			    exit to user mode.
 *
 * Invoked from exit_to_user_mode() with interrupts disabled as the last
 * function before return.  Defaults to NOOP.
 *
 * This needs to be __always_inline because it is non-instrumentable code
 * invoked after context tracking switched to user mode.
 *
 * An architecture implementation must not do anything complex, no locking
 * etc. The main purpose is for speculation mitigations.
 */
static __always_inline void arch_exit_to_user_mode(void);

#ifndef arch_exit_to_user_mode
static __always_inline void arch_exit_to_user_mode(void) { }
#endif

/**
 * arch_do_signal_or_restart - Architecture specific signal delivery function
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from exit_to_user_mode_loop().
 */
void arch_do_signal_or_restart(struct pt_regs *regs);

/**
 * exit_to_user_mode_loop - do any pending work before leaving to user space
 * @regs:	Pointer to pt_regs on entry stack
 * @ti_work:	TIF work flags as read by the caller with interrupts disabled
 */
unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
				     unsigned long ti_work);

/**
 * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
 * @regs:	Pointer to pt_regs on entry stack
 *
 * 1) check that interrupts are disabled
 * 2) call tick_nohz_user_enter_prepare()
 * 3) call exit_to_user_mode_loop() if any flags from
 *    EXIT_TO_USER_MODE_WORK are set
 * 4) check that interrupts are still disabled
 */
static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work;

	lockdep_assert_irqs_disabled();

	/* Flush pending rcuog wakeup before the last need_resched() check */
	tick_nohz_user_enter_prepare();

	ti_work = read_thread_flags();
	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that kernel state is sane for a return to userspace */
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but the kernel state is
 * interrupts disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.: arch_exit_to_user_mode()
 * 4) Tell lockdep that interrupts are enabled
 *
 * Invoked from architecture specific code when syscall_exit_to_user_mode()
 * is not suitable as the last step before returning to userspace. Must be
 * invoked with interrupts disabled and the caller must be
 * non-instrumentable.
 * The caller has to invoke syscall_exit_to_user_mode_work() before this.
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare();
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

/**
 * syscall_exit_to_user_mode_work - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling
 * exit_to_user_mode() to perform the final transition to user mode.
 *
 * Calling convention is the same as for syscall_exit_to_user_mode() and it
 * returns with all work handled and interrupts disabled. The caller must
 * invoke exit_to_user_mode() before actually switching to user mode to
 * make the final state transitions. Interrupts must stay disabled between
 * return from this function and the invocation of exit_to_user_mode().
 */
void syscall_exit_to_user_mode_work(struct pt_regs *regs);

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1,2) and
 * exit_to_user_mode(). This function is preferred unless there is a
 * compelling architectural reason to use the separate functions.
 */
void syscall_exit_to_user_mode(struct pt_regs *regs);

/**
 * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from architecture specific entry code with interrupts disabled.
 * Can only be called when the interrupt entry came from user mode. The
 * calling code must be non-instrumentable.  When the function returns all
 * state is correct and the subsequent functions can be instrumented.
 *
 * The function establishes state (lockdep, RCU (context tracking), tracing)
 */
void irqentry_enter_from_user_mode(struct pt_regs *regs);

/**
 * irqentry_exit_to_user_mode - Interrupt exit work
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts disabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific interrupt
 * handling code.
 *
 * The call order is #2 and #3 as described in syscall_exit_to_user_mode().
 * Interrupt exit is not invoking #1 which is the syscall specific one time
 * work.
 */
void irqentry_exit_to_user_mode(struct pt_regs *regs);

#ifndef irqentry_state
/**
 * struct irqentry_state - Opaque object for exception state storage
 * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the
 *            exit path has to invoke ct_irq_exit().
 * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that
 *           lockdep state is restored correctly on exit from nmi.
 *
 * This opaque object is filled in by the irqentry_*_enter() functions and
 * must be passed back into the corresponding irqentry_*_exit() functions
 * when the exception is complete.
 *
 * Callers of irqentry_*_[enter|exit]() must consider this structure opaque
 * and all members private.  Descriptions of the members are provided to aid in
 * the maintenance of the irqentry_*() functions.
 */
typedef struct irqentry_state {
	union {
		bool	exit_rcu;
		bool	lockdep;
	};
} irqentry_state_t;
#endif

/**
 * irqentry_enter - Handle state tracking on ordinary interrupt entries
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes:
 *  - lockdep irqflag state tracking as low level ASM entry disabled
 *    interrupts.
 *
 *  - Context tracking if the exception hit user mode.
 *
 *  - The hardirq tracer to keep the state consistent as low level ASM
 *    entry disabled interrupts.
 *
 * As a precondition, this requires that the entry came from user mode,
 * idle, or a kernel context in which RCU is watching.
 *
 * For kernel mode entries RCU handling is done conditionally. If RCU is
 * watching then the only RCU requirement is to check whether the tick has
 * to be restarted. If RCU is not watching then ct_irq_enter() has to be
 * invoked on entry and ct_irq_exit() on exit.
 *
 * Avoiding the ct_irq_enter/exit() calls is an optimization but also
 * solves the problem of kernel mode pagefaults which can schedule, which
 * is not possible after invoking ct_irq_enter() without undoing it.
 *
 * For user mode entries irqentry_enter_from_user_mode() is invoked to
 * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
 * would not be possible.
 *
 * Returns: An opaque object that must be passed to irqentry_exit()
 */
irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);

/**
 * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
 *
 * Conditional reschedule with additional sanity checks.
 */
void raw_irqentry_exit_cond_resched(void);
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define irqentry_exit_cond_resched_dynamic_enabled	raw_irqentry_exit_cond_resched
#define irqentry_exit_cond_resched_dynamic_disabled	NULL
DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#define irqentry_exit_cond_resched()	static_call(irqentry_exit_cond_resched)()
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void);
#define irqentry_exit_cond_resched()	dynamic_irqentry_exit_cond_resched()
#endif
#else /* CONFIG_PREEMPT_DYNAMIC */
#define irqentry_exit_cond_resched()	raw_irqentry_exit_cond_resched()
#endif /* CONFIG_PREEMPT_DYNAMIC */

/**
 * irqentry_exit - Handle return from exception that used irqentry_enter()
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @state:	Return value from matching call to irqentry_enter()
 *
 * Depending on the return target (kernel/user) this runs the necessary
 * preemption and work checks if possible and required and returns to
 * the caller with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to irqentry_enter().
 */
void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);

/**
 * irqentry_nmi_enter - Handle NMI entry
 * @regs:	Pointer to current's pt_regs
 *
 * Similar to irqentry_enter() but taking care of the NMI constraints.
 */
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);

/**
 * irqentry_nmi_exit - Handle return from NMI handling
 * @regs:	Pointer to pt_regs (NMI entry regs)
 * @irq_state:	Return value from matching call to irqentry_nmi_enter()
 *
 * Last action before returning to the low level assembly code.
 *
 * Counterpart to irqentry_nmi_enter().
 */
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);

#endif