// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static __always_inline void enter_from_user_mode(struct pt_regs *regs)
{
	arch_check_user_regs(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_syscall_enter_tracehook(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}
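/*
 * Illustrative sketch (not part of this file's API) of how an
 * architecture's C-level syscall entry glue is expected to use the
 * helpers above. arch_syscall_nr() and arch_invoke_syscall() are
 * hypothetical placeholders for the arch specific syscall number fetch
 * and syscall table dispatch:
 *
 *	__visible void arch_do_syscall(struct pt_regs *regs)
 *	{
 *		long nr = syscall_enter_from_user_mode(regs, arch_syscall_nr(regs));
 *
 *		instrumentation_begin();
 *		if (nr >= 0)
 *			arch_invoke_syscall(regs, nr);
 *		instrumentation_end();
 *
 *		syscall_exit_to_user_mode(regs);
 *	}
 *
 * A negative return value from syscall_enter_from_user_mode() means the
 * entry work (syscall user dispatch, ptrace, seccomp) aborted or emulated
 * the syscall, so the dispatch is skipped and only the exit path runs.
 */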
/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but the kernel state is
 * interrupts disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }

static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
	if (ti_work & _TIF_NOTIFY_SIGNAL)
		tracehook_notify_signal();

	arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			handle_signal_work(regs, ti_work);

		if (ti_work & _TIF_NOTIFY_RESUME) {
			tracehook_notify_resume(regs);
			rseq_handle_notify_resume(NULL, regs);
		}

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();
		ti_work = READ_ONCE(current_thread_info()->flags);
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

	lockdep_assert_irqs_disabled();

	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}
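/*
 * For reference, a sketch of the work mask consumed by
 * exit_to_user_mode_loop() above. It is assembled in
 * <linux/entry-common.h> roughly as follows (the exact set of bits
 * depends on the kernel version and configuration), with architectures
 * contributing extra bits via ARCH_EXIT_TO_USER_MODE_WORK:
 *
 *	#define EXIT_TO_USER_MODE_WORK
 *		(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |
 *		 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |
 *		 ARCH_EXIT_TO_USER_MODE_WORK)
 *
 * Each bit corresponds to one branch of the loop; the loop only exits
 * once a re-read of the flags with interrupts disabled shows none of
 * these bits set.
 */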
#ifndef _TIF_SINGLESTEP
static inline bool report_single_step(unsigned long work)
{
	return false;
}
#else
/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (!(work & SYSCALL_WORK_SYSCALL_EMU))
		return false;

	return !!(current_thread_info()->flags & _TIF_SINGLESTEP);
}
#endif

static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}
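/*
 * Illustrative sketch of the intended pairing of irqentry_enter() and
 * irqentry_exit() below in an architecture's C-level interrupt handler.
 * arch_handle_irq() is a hypothetical placeholder for the actual device
 * interrupt dispatch:
 *
 *	__visible void arch_do_irq(struct pt_regs *regs)
 *	{
 *		irqentry_state_t state = irqentry_enter(regs);
 *
 *		instrumentation_begin();
 *		arch_handle_irq(regs);
 *		instrumentation_end();
 *
 *		irqentry_exit(regs, state);
 *	}
 *
 * The state returned by irqentry_enter() must be passed back unmodified
 * so that the RCU and lockdep decisions taken on entry are undone
 * symmetrically on exit.
 */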
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

void irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();
		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			rcu_irq_exit();
	}
}

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	rcu_nmi_enter();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	}
	instrumentation_end();

	rcu_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}
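/*
 * Illustrative sketch of the matching usage for NMI-like exceptions,
 * with arch_handle_nmi() as a hypothetical placeholder for the actual
 * handler:
 *
 *	__visible void arch_do_nmi(struct pt_regs *regs)
 *	{
 *		irqentry_state_t state = irqentry_nmi_enter(regs);
 *
 *		instrumentation_begin();
 *		arch_handle_nmi(regs);
 *		instrumentation_end();
 *
 *		irqentry_nmi_exit(regs, state);
 *	}
 *
 * As with irqentry_enter()/irqentry_exit(), the returned state records
 * the lockdep IRQ state observed on entry and must be handed back
 * untouched so irqentry_nmi_exit() can restore it.
 */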