xref: /xnu-11215/bsd/dev/dtrace/systrace.c (revision 1031c584)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <ptrauth.h>
27 
28 #include <kern/thread.h>
29 #include <mach/thread_status.h>
30 
31 /* XXX All of these should really be derived from syscall_sw.h */
32 #if defined (__x86_64__)
33 #define SYSCALL_CLASS_SHIFT 24
34 #define SYSCALL_CLASS_MASK  (0xFF << SYSCALL_CLASS_SHIFT)
35 #define SYSCALL_NUMBER_MASK (~SYSCALL_CLASS_MASK)
36 #define I386_SYSCALL_NUMBER_MASK (0xFFFF)
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/errno.h>
43 #include <sys/ioctl.h>
44 #include <sys/conf.h>
45 #include <sys/fcntl.h>
46 #include <sys/syscall.h>
47 #include <miscfs/devfs/devfs.h>
48 
49 #include <sys/dtrace.h>
50 #include <sys/dtrace_impl.h>
51 #include <sys/systrace_args.h>
52 #include "systrace.h"
53 #include <sys/stat.h>
54 #include <sys/systm.h>
55 #include <sys/conf.h>
56 #include <sys/user.h>
57 
58 #include <machine/pal_routines.h>
59 
60 #if defined (__x86_64__)
61 #define SYSTRACE_ARTIFICIAL_FRAMES      2
62 #define MACHTRACE_ARTIFICIAL_FRAMES 3
63 #elif defined(__arm64__)
64 #define SYSTRACE_ARTIFICIAL_FRAMES  2
65 #define MACHTRACE_ARTIFICIAL_FRAMES 3
66 #else
67 #error Unknown Architecture
68 #endif
69 
70 #define SYSTRACE_NARGS (int)(sizeof(((uthread_t)NULL)->uu_arg) / sizeof(((uthread_t)NULL)->uu_arg[0]))
71 #define MACHTRACE_NARGS (int)(sizeof(struct mach_call_args) / sizeof(syscall_arg_t))
72 
73 #include <sys/sysent.h>
74 #define sy_callc sy_call /* Map Solaris slot name to Darwin's */
75 #define NSYSCALL nsysent /* and is less than 500 or so */
76 
77 extern const char *syscallnames[];
78 
79 #include <sys/dtrace_glue.h>
80 #define casptr dtrace_casptr
81 #define membar_enter dtrace_membar_producer
82 
83 #define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
84 #define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
85 
/*
 * Mutex taken by the enable/disable routines of both providers in this file
 * while they overwrite a handler slot in sysent[] or mach_trap_table[].
 */
static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock,
    &dtrace_lck_grp, &dtrace_lck_attr);           /* probe state lock */

/* Shadow of sysent[]; allocated on first use by systrace_provide(). */
systrace_sysent_t *systrace_sysent = NULL;
/* Probe entry point used by the syscall provider; assigned in systrace_attach(). */
void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);

/* Forward declarations for the handlers referenced by systrace_pops. */
static uint64_t systrace_getargval(void *, dtrace_id_t, void *, int, int);
static void systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
94 
95 void
systrace_stub(dtrace_id_t id,uint64_t arg0,uint64_t arg1,uint64_t arg2,uint64_t arg3,uint64_t arg4)96 systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
97     uint64_t arg2, uint64_t arg3, uint64_t arg4)
98 {
99 #pragma unused(id,arg0,arg1,arg2,arg3,arg4)
100 }
101 
102 int32_t
dtrace_systrace_syscall(struct proc * pp,void * uap,int * rv)103 dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
104 {
105 	unsigned short      code;       /* The system call number */
106 
107 	systrace_sysent_t *sy;
108 	dtrace_id_t id;
109 	int32_t rval;
110 	syscall_arg_t *ip = (syscall_arg_t *)uap;
111 	uint64_t uargs[SYSTRACE_NARGS] = {0};
112 
113 #if defined (__x86_64__)
114 	{
115 		pal_register_cache_state(current_thread(), VALID);
116 		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
117 
118 		if (is_saved_state64(tagged_regs)) {
119 			x86_saved_state64_t *regs = saved_state64(tagged_regs);
120 			code = regs->rax & SYSCALL_NUMBER_MASK;
121 			/*
122 			 * Check for indirect system call... system call number
123 			 * passed as 'arg0'
124 			 */
125 			if (code == 0) {
126 				code = regs->rdi;
127 			}
128 		} else {
129 			code = saved_state32(tagged_regs)->eax & I386_SYSCALL_NUMBER_MASK;
130 
131 			if (code == 0) {
132 				vm_offset_t params = (vm_offset_t) (saved_state32(tagged_regs)->uesp + sizeof(int));
133 				code = fuword(params);
134 			}
135 		}
136 	}
137 #elif defined(__arm64__)
138 	{
139 		/*
140 		 * On arm64, syscall numbers depend on a flavor (indirect or not)
141 		 * ... and for u32 can be in either r0 or r12
142 		 * ... and for u64 can be in either x0 or x16
143 		 */
144 
145 		/* see bsd/dev/arm/systemcalls.c:arm_get_syscall_number */
146 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
147 
148 		if (is_saved_state32(arm_regs)) {
149 			/* Check for indirect system call */
150 			if (saved_state32(arm_regs)->r[12] != 0) {
151 				code = saved_state32(arm_regs)->r[12];
152 			} else {
153 				code = saved_state32(arm_regs)->r[0];
154 			}
155 		} else {
156 			/* Check for indirect system call */
157 			if (saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM] != 0) {
158 				code = saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
159 			} else {
160 				code = saved_state64(arm_regs)->x[0];
161 			}
162 		}
163 	}
164 #else
165 #error Unknown Architecture
166 #endif
167 
168 	// Bounds "check" the value of code a la unix_syscall
169 	sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
170 
171 	systrace_args(code, ip, uargs);
172 
173 	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
174 		uthread_t uthread = current_uthread();
175 		if (uthread) {
176 			uthread->t_dtrace_syscall_args = uargs;
177 		}
178 
179 		static_assert(SYSTRACE_NARGS >= 5, "not enough system call arguments");
180 		(*systrace_probe)(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]);
181 
182 		if (uthread) {
183 			uthread->t_dtrace_syscall_args = NULL;
184 		}
185 	}
186 
187 
188 
189 #if 0 /* XXX */
190 	/*
191 	 * APPLE NOTE: Not implemented.
192 	 * We want to explicitly allow DTrace consumers to stop a process
193 	 * before it actually executes the meat of the syscall.
194 	 */
195 	p = ttoproc(curthread);
196 	mutex_enter(&p->p_lock);
197 	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
198 		curthread->t_dtrace_stop = 0;
199 		stop(PR_REQUESTED, 0);
200 	}
201 	mutex_exit(&p->p_lock);
202 #endif
203 
204 	rval = (*sy->stsy_underlying)(pp, uap, rv);
205 
206 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
207 		uint64_t munged_rv0, munged_rv1;
208 		uthread_t uthread = current_uthread();
209 
210 		if (uthread) {
211 			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
212 		}
213 		/*
214 		 * "Decode" rv for use in the call to dtrace_probe()
215 		 */
216 		if (rval == ERESTART) {
217 			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
218 			munged_rv1 = -1LL;
219 		} else if (rval != EJUSTRETURN) {
220 			if (rval) {
221 				munged_rv0 = -1LL; /* Mimic what libc will do. */
222 				munged_rv1 = -1LL;
223 			} else {
224 				switch (sy->stsy_return_type) {
225 				case _SYSCALL_RET_INT_T:
226 					munged_rv0 = rv[0];
227 					munged_rv1 = rv[1];
228 					break;
229 				case _SYSCALL_RET_UINT_T:
230 					munged_rv0 = ((u_int)rv[0]);
231 					munged_rv1 = ((u_int)rv[1]);
232 					break;
233 				case _SYSCALL_RET_OFF_T:
234 				case _SYSCALL_RET_UINT64_T:
235 					munged_rv0 = *(u_int64_t *)rv;
236 					munged_rv1 = 0LL;
237 					break;
238 				case _SYSCALL_RET_ADDR_T:
239 				case _SYSCALL_RET_SIZE_T:
240 				case _SYSCALL_RET_SSIZE_T:
241 					munged_rv0 = *(user_addr_t *)rv;
242 					munged_rv1 = 0LL;
243 					break;
244 				case _SYSCALL_RET_NONE:
245 					munged_rv0 = 0LL;
246 					munged_rv1 = 0LL;
247 					break;
248 				default:
249 					munged_rv0 = 0LL;
250 					munged_rv1 = 0LL;
251 					break;
252 				}
253 			}
254 		} else {
255 			munged_rv0 = 0LL;
256 			munged_rv1 = 0LL;
257 		}
258 
259 		/*
260 		 * <http://mail.opensolaris.org/pipermail/dtrace-discuss/2007-January/003276.html> says:
261 		 *
262 		 * "This is a bit of an historical artifact. At first, the syscall provider just
263 		 * had its return value in arg0, and the fbt and pid providers had their return
264 		 * values in arg1 (so that we could use arg0 for the offset of the return site).
265 		 *
266 		 * We inevitably started writing scripts where we wanted to see the return
267 		 * values from probes in all three providers, and we made this script easier
268 		 * to write by replicating the syscall return values in arg1 to match fbt and
269 		 * pid. We debated briefly about removing the return value from arg0, but
270 		 * decided that it would be less confusing to have the same data in two places
271 		 * than to have some non-helpful, non-intuitive value in arg0.
272 		 *
273 		 * This change was made 4/23/2003 according to the DTrace project's putback log."
274 		 */
275 		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
276 	}
277 
278 	return rval;
279 }
280 
281 void
dtrace_systrace_syscall_return(unsigned short code,int rval,int * rv)282 dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
283 {
284 	systrace_sysent_t *sy;
285 	dtrace_id_t id;
286 
287 	// Bounds "check" the value of code a la unix_syscall_return
288 	sy = (code >= nsysent) ? &systrace_sysent[SYS_invalid] : &systrace_sysent[code];
289 
290 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
291 		uint64_t munged_rv0, munged_rv1;
292 		uthread_t uthread = current_uthread();
293 
294 		if (uthread) {
295 			uthread->t_dtrace_errno = rval; /* Establish t_dtrace_errno now in case this enabling refers to it. */
296 		}
297 		/*
298 		 * "Decode" rv for use in the call to dtrace_probe()
299 		 */
300 		if (rval == ERESTART) {
301 			munged_rv0 = -1LL; /* System call will be reissued in user mode. Make DTrace report a -1 return. */
302 			munged_rv1 = -1LL;
303 		} else if (rval != EJUSTRETURN) {
304 			if (rval) {
305 				munged_rv0 = -1LL; /* Mimic what libc will do. */
306 				munged_rv1 = -1LL;
307 			} else {
308 				switch (sy->stsy_return_type) {
309 				case _SYSCALL_RET_INT_T:
310 					munged_rv0 = rv[0];
311 					munged_rv1 = rv[1];
312 					break;
313 				case _SYSCALL_RET_UINT_T:
314 					munged_rv0 = ((u_int)rv[0]);
315 					munged_rv1 = ((u_int)rv[1]);
316 					break;
317 				case _SYSCALL_RET_OFF_T:
318 				case _SYSCALL_RET_UINT64_T:
319 					munged_rv0 = *(u_int64_t *)rv;
320 					munged_rv1 = 0LL;
321 					break;
322 				case _SYSCALL_RET_ADDR_T:
323 				case _SYSCALL_RET_SIZE_T:
324 				case _SYSCALL_RET_SSIZE_T:
325 					munged_rv0 = *(user_addr_t *)rv;
326 					munged_rv1 = 0LL;
327 					break;
328 				case _SYSCALL_RET_NONE:
329 					munged_rv0 = 0LL;
330 					munged_rv1 = 0LL;
331 					break;
332 				default:
333 					munged_rv0 = 0LL;
334 					munged_rv1 = 0LL;
335 					break;
336 				}
337 			}
338 		} else {
339 			munged_rv0 = 0LL;
340 			munged_rv1 = 0LL;
341 		}
342 
343 		(*systrace_probe)(id, munged_rv0, munged_rv0, munged_rv1, (uint64_t)rval, 0);
344 	}
345 }
346 
/*
 * Encoding of the per-probe cookie ("parg") handed to dtrace_probe_create():
 * the low SYSTRACE_SHIFT bits carry the system call number, and the bit at
 * position SYSTRACE_SHIFT is set for "entry" probes and clear for "return"
 * probes.
 */
#define SYSTRACE_SHIFT                  16
#define SYSTRACE_ISENTRY(x)             ((int)(x) >> SYSTRACE_SHIFT)
#define SYSTRACE_SYSNUM(x)              ((int)(x) & ((1 << SYSTRACE_SHIFT) - 1))
#define SYSTRACE_ENTRY(id)              ((1 << SYSTRACE_SHIFT) | (id))
#define SYSTRACE_RETURN(id)             (id)

/*
 * NOTE(review): NSYSCALL expands to nsysent at this point; if nsysent is a
 * variable rather than a preprocessor constant, it evaluates to 0 inside #if
 * and this check can never fire -- confirm.
 */
#if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
#error 1 << SYSTRACE_SHIFT must exceed number of system calls
#endif
356 
357 static dtrace_provider_id_t systrace_id;
358 
359 /*
360  * APPLE NOTE: Avoid name clash with Darwin automagic conf symbol.
361  * See balanced undef below.
362  */
363 #define systrace_init _systrace_init
364 
365 static void
systrace_init(const struct sysent * actual,systrace_sysent_t ** interposed)366 systrace_init(const struct sysent *actual, systrace_sysent_t **interposed)
367 {
368 	systrace_sysent_t *ssysent = *interposed;  /* Avoid sysent shadow warning
369 	                                            *       from bsd/sys/sysent.h */
370 	unsigned int i;
371 
372 	if (ssysent == NULL) {
373 		*interposed = ssysent = kmem_zalloc(sizeof(systrace_sysent_t) *
374 		    NSYSCALL, KM_SLEEP);
375 	}
376 
377 	for (i = 0; i < NSYSCALL; i++) {
378 		/* Use of volatile protects the if statement below from being optimized away */
379 		const volatile struct sysent *a = &actual[i];
380 		systrace_sysent_t *s = &ssysent[i];
381 
382 		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
383 			continue;
384 		}
385 
386 		if (a->sy_callc == dtrace_systrace_syscall) {
387 			continue;
388 		}
389 
390 		s->stsy_underlying = a->sy_callc;
391 		s->stsy_return_type = a->sy_return_type;
392 	}
393 }
394 
395 
396 /*ARGSUSED*/
397 static void
systrace_provide(void * arg,const dtrace_probedesc_t * desc)398 systrace_provide(void *arg, const dtrace_probedesc_t *desc)
399 {
400 #pragma unused(arg) /* __APPLE__ */
401 	unsigned int i;
402 
403 	if (desc != NULL) {
404 		return;
405 	}
406 
407 	systrace_init(sysent, &systrace_sysent);
408 
409 	for (i = 0; i < NSYSCALL; i++) {
410 		if (systrace_sysent[i].stsy_underlying == NULL) {
411 			continue;
412 		}
413 
414 		if (dtrace_probe_lookup(systrace_id, NULL,
415 		    syscallnames[i], "entry") != 0) {
416 			continue;
417 		}
418 
419 		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
420 		    "entry", SYSTRACE_ARTIFICIAL_FRAMES,
421 		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
422 		(void) dtrace_probe_create(systrace_id, NULL, syscallnames[i],
423 		    "return", SYSTRACE_ARTIFICIAL_FRAMES,
424 		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
425 
426 		systrace_sysent[i].stsy_entry = DTRACE_IDNONE;
427 		systrace_sysent[i].stsy_return = DTRACE_IDNONE;
428 	}
429 }
430 #undef systrace_init
431 
432 /*ARGSUSED*/
433 static void
systrace_destroy(void * arg,dtrace_id_t id,void * parg)434 systrace_destroy(void *arg, dtrace_id_t id, void *parg)
435 {
436 #pragma unused(arg,id) /* __APPLE__ */
437 
438 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
439 
440 #pragma unused(sysnum)  /* __APPLE__ */
441 	/*
442 	 * There's nothing to do here but assert that we have actually been
443 	 * disabled.
444 	 */
445 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
446 		ASSERT(systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
447 	} else {
448 		ASSERT(systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
449 	}
450 }
451 
452 /*ARGSUSED*/
453 static int
systrace_enable(void * arg,dtrace_id_t id,void * parg)454 systrace_enable(void *arg, dtrace_id_t id, void *parg)
455 {
456 #pragma unused(arg) /* __APPLE__ */
457 
458 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
459 	int enabled = (systrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
460 	    systrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
461 
462 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
463 		systrace_sysent[sysnum].stsy_entry = id;
464 	} else {
465 		systrace_sysent[sysnum].stsy_return = id;
466 	}
467 
468 	if (enabled) {
469 		ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall);
470 		return 0;
471 	}
472 
473 	lck_mtx_lock(&dtrace_systrace_lock);
474 	if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
475 		/* It is not possible to write to sysent[] directly because it is const. */
476 		vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall);
477 		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t));
478 	}
479 	lck_mtx_unlock(&dtrace_systrace_lock);
480 
481 	return 0;
482 }
483 
484 /*ARGSUSED*/
485 static void
systrace_disable(void * arg,dtrace_id_t id,void * parg)486 systrace_disable(void *arg, dtrace_id_t id, void *parg)
487 {
488 #pragma unused(arg,id) /* __APPLE__ */
489 
490 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
491 	int disable = (systrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
492 	    systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
493 
494 	if (disable) {
495 		/*
496 		 * Usage of volatile protects the if statement below from being optimized away.
497 		 *
498 		 * Compilers are clever and know that const array values can't change in time
499 		 * and the if below is always false. That is because it can't see that DTrace
500 		 * injects dtrace_systrace_syscall dynamically and violates constness of the
501 		 * array.
502 		 */
503 		volatile const struct sysent *syscallent = &sysent[sysnum];
504 
505 		lck_mtx_lock(&dtrace_systrace_lock);
506 		if (syscallent->sy_callc == dtrace_systrace_syscall) {
507 			ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying,
508 			    (vm_offset_t)&syscallent->sy_callc, sizeof(vm_offset_t));
509 		}
510 		lck_mtx_unlock(&dtrace_systrace_lock);
511 	}
512 
513 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
514 		systrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
515 	} else {
516 		systrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
517 	}
518 }
519 
/*
 * Stability attributes passed to dtrace_register() for the "syscall"
 * provider.
 * NOTE(review): the five rows presumably follow the member order of
 * dtrace_pattr_t -- confirm against sys/dtrace.h.
 */
static dtrace_pattr_t systrace_attr = {
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
527 
/*
 * Provider operations vector passed to dtrace_register() for the "syscall"
 * provider.  Operations left NULL have no handler in this file.
 */
static dtrace_pops_t systrace_pops = {
	.dtps_provide =         systrace_provide,
	.dtps_provide_module =  NULL,
	.dtps_enable =          systrace_enable,
	.dtps_disable =         systrace_disable,
	.dtps_suspend =         NULL,
	.dtps_resume =          NULL,
	.dtps_getargdesc =      systrace_getargdesc,
	.dtps_getargval =       systrace_getargval,
	.dtps_usermode =        NULL,
	.dtps_destroy =         systrace_destroy
};
540 
541 static int
systrace_attach(dev_info_t * devi)542 systrace_attach(dev_info_t *devi)
543 {
544 	systrace_probe = (void*)&dtrace_probe;
545 	membar_enter();
546 
547 	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
548 	    DDI_PSEUDO, 0) == DDI_FAILURE ||
549 	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
550 	    &systrace_pops, NULL, &systrace_id) != 0) {
551 		systrace_probe = systrace_stub;
552 		ddi_remove_minor_node(devi, NULL);
553 		return DDI_FAILURE;
554 	}
555 
556 	return DDI_SUCCESS;
557 }
558 
559 
560 /*
561  * APPLE NOTE:  systrace_detach not implemented
562  */
563 #if !defined(__APPLE__)
564 static int
systrace_detach(dev_info_t * devi,ddi_detach_cmd_t cmd)565 systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
566 {
567 	switch (cmd) {
568 	case DDI_DETACH:
569 		break;
570 	case DDI_SUSPEND:
571 		return DDI_SUCCESS;
572 	default:
573 		return DDI_FAILURE;
574 	}
575 
576 	if (dtrace_unregister(systrace_id) != 0) {
577 		return DDI_FAILURE;
578 	}
579 
580 	ddi_remove_minor_node(devi, NULL);
581 	systrace_probe = systrace_stub;
582 	return DDI_SUCCESS;
583 }
584 #endif /* __APPLE__ */
585 
586 
/* Signature of a Mach trap handler: takes a pointer to its argument block. */
typedef kern_return_t (*mach_call_t)(void *);

/* APPLE NOTE: From #include <kern/syscall_sw.h> which may be changed for 64 bit! */
#if CONFIG_REQUIRES_U32_MUNGING
typedef void mach_munge_t(void *);
#elif __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
typedef int mach_munge_t(const void *, void *);
#endif

/*
 * Local copy of the Mach trap table entry layout, needed to index
 * mach_trap_table[] from this file.
 * NOTE(review): must stay in sync with the definition in kern/syscall_sw.h.
 */
typedef struct {
	unsigned char           mach_trap_arg_count; /* Number of trap arguments (Arch independent) */
	unsigned char           mach_trap_u32_words; /* number of 32-bit words to copyin for U32 */
	unsigned char           mach_trap_returns_port;
	unsigned char           __mach_trap_padding;
	kern_return_t         (*mach_trap_function)(void *); /* handler; the slot machtrace_enable()/machtrace_disable() rewrite */
#if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4))
	mach_munge_t           *mach_trap_arg_munge32; /* system call argument munger routine for 32-bit */
#endif
#if MACH_ASSERT
	const char             *mach_trap_name;
#endif /* MACH_ASSERT */
} mach_trap_t;
609 
610 
/* Declared size of the Mach trap table and of the trap name table. */
#define MACH_TRAP_TABLE_COUNT   128

/* Defined elsewhere in the kernel; mach_trap_count bounds the loops over the table in this file. */
extern const mach_trap_t        mach_trap_table[MACH_TRAP_TABLE_COUNT];
extern const int                mach_trap_count;
/* Trap names, used as the function component of the mach_trap probes. */
extern const char * const       mach_syscall_name_table[MACH_TRAP_TABLE_COUNT];
616 
617 
/*
 * XXX From osfmk/i386/bsd_i386.c
 *
 * Argument block handed to a Mach trap handler: nine consecutive
 * syscall_arg_t slots.  This file also walks it as a plain syscall_arg_t
 * array, and MACHTRACE_NARGS is derived from its size.
 */
struct mach_call_args {
	syscall_arg_t arg1;
	syscall_arg_t arg2;
	syscall_arg_t arg3;
	syscall_arg_t arg4;
	syscall_arg_t arg5;
	syscall_arg_t arg6;
	syscall_arg_t arg7;
	syscall_arg_t arg8;
	syscall_arg_t arg9;
};
630 
631 #undef NSYSCALL
632 #define NSYSCALL mach_trap_count
633 
634 #if ((1 << SYSTRACE_SHIFT) <= NSYSCALL)
635 #error 1 << SYSTRACE_SHIFT must exceed number of Mach traps
636 #endif
637 
/* One shadow entry per Mach trap, mirroring the role of systrace_sysent_t. */
typedef struct machtrace_sysent {
	dtrace_id_t     stsy_entry;                     /* id of the enabled "entry" probe, or DTRACE_IDNONE */
	dtrace_id_t     stsy_return;                    /* id of the enabled "return" probe, or DTRACE_IDNONE */
	kern_return_t   (*stsy_underlying)(void *);     /* real trap handler recorded by machtrace_init() */
	int32_t         stsy_return_type;               /* not written by machtrace_init(); stays zero from kmem_zalloc */
} machtrace_sysent_t;
644 
/* Shadow of mach_trap_table[]; allocated on first use by machtrace_provide(). */
static machtrace_sysent_t *machtrace_sysent = NULL;

/* Probe entry point used by the mach_trap provider; assigned in machtrace_attach(). */
void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
    uint64_t, uint64_t, uint64_t);

/* Forward declaration for the dtps_getargval handler in machtrace_pops. */
static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);

/* Provider id filled in by dtrace_register() in machtrace_attach(). */
static dtrace_provider_id_t machtrace_id;
653 
654 static kern_return_t
dtrace_machtrace_syscall(struct mach_call_args * args)655 dtrace_machtrace_syscall(struct mach_call_args *args)
656 {
657 	int code;       /* The mach call number */
658 
659 	machtrace_sysent_t *sy;
660 	dtrace_id_t id;
661 	kern_return_t rval;
662 #if 0 /* XXX */
663 	proc_t *p;
664 #endif
665 	syscall_arg_t *ip = (syscall_arg_t *)args;
666 	mach_call_t mach_call;
667 
668 #if defined (__x86_64__)
669 	{
670 		pal_register_cache_state(current_thread(), VALID);
671 		x86_saved_state_t   *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread());
672 
673 		if (is_saved_state64(tagged_regs)) {
674 			code = saved_state64(tagged_regs)->rax & SYSCALL_NUMBER_MASK;
675 		} else {
676 			code = -saved_state32(tagged_regs)->eax;
677 		}
678 	}
679 #elif defined(__arm64__)
680 	{
681 		/* From arm/thread_status.h:get_saved_state_svc_number */
682 		arm_saved_state_t *arm_regs = (arm_saved_state_t *) find_user_regs(current_thread());
683 		if (is_saved_state32(arm_regs)) {
684 			code = (int)saved_state32(arm_regs)->r[12];
685 		} else {
686 			code = (int)saved_state64(arm_regs)->x[ARM64_SYSCALL_CODE_REG_NUM];
687 		}
688 
689 		/* From bsd/arm64.c:mach_syscall */
690 		ASSERT(code < 0);    /* Otherwise it would be a Unix syscall */
691 		code = -code;
692 	}
693 #else
694 #error Unknown Architecture
695 #endif
696 
697 	sy = &machtrace_sysent[code];
698 
699 	if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
700 		uthread_t uthread = current_uthread();
701 
702 		if (uthread) {
703 			uthread->t_dtrace_syscall_args = (void *)ip;
704 		}
705 
706 		(*machtrace_probe)(id, *ip, *(ip + 1), *(ip + 2), *(ip + 3), *(ip + 4));
707 
708 		if (uthread) {
709 			uthread->t_dtrace_syscall_args = (void *)0;
710 		}
711 	}
712 
713 #if 0 /* XXX */
714 	/*
715 	 * APPLE NOTE:  Not implemented.
716 	 * We want to explicitly allow DTrace consumers to stop a process
717 	 * before it actually executes the meat of the syscall.
718 	 */
719 	p = ttoproc(curthread);
720 	mutex_enter(&p->p_lock);
721 	if (curthread->t_dtrace_stop && !curthread->t_lwp->lwp_nostop) {
722 		curthread->t_dtrace_stop = 0;
723 		stop(PR_REQUESTED, 0);
724 	}
725 	mutex_exit(&p->p_lock);
726 #endif
727 
728 	mach_call = (mach_call_t)(*sy->stsy_underlying);
729 	rval = mach_call(args);
730 
731 	if ((id = sy->stsy_return) != DTRACE_IDNONE) {
732 		(*machtrace_probe)(id, (uint64_t)rval, 0, 0, 0, 0);
733 	}
734 
735 	return rval;
736 }
737 
738 static void
machtrace_init(const mach_trap_t * actual,machtrace_sysent_t ** interposed)739 machtrace_init(const mach_trap_t *actual, machtrace_sysent_t **interposed)
740 {
741 	machtrace_sysent_t *msysent = *interposed;
742 	int i;
743 
744 	if (msysent == NULL) {
745 		*interposed = msysent = kmem_zalloc(sizeof(machtrace_sysent_t) *
746 		    NSYSCALL, KM_SLEEP);
747 	}
748 
749 	for (i = 0; i < NSYSCALL; i++) {
750 		const volatile mach_trap_t *a = &actual[i];
751 		machtrace_sysent_t *s = &msysent[i];
752 
753 		if (LOADABLE_SYSCALL(a) && !LOADED_SYSCALL(a)) {
754 			continue;
755 		}
756 
757 		if (a->mach_trap_function == (mach_call_t)(dtrace_machtrace_syscall)) {
758 			continue;
759 		}
760 
761 		s->stsy_underlying = a->mach_trap_function;
762 	}
763 }
764 
765 /*ARGSUSED*/
766 static void
machtrace_provide(void * arg,const dtrace_probedesc_t * desc)767 machtrace_provide(void *arg, const dtrace_probedesc_t *desc)
768 {
769 #pragma unused(arg) /* __APPLE__ */
770 
771 	int i;
772 
773 	if (desc != NULL) {
774 		return;
775 	}
776 
777 	machtrace_init(mach_trap_table, &machtrace_sysent);
778 
779 	for (i = 0; i < NSYSCALL; i++) {
780 		if (machtrace_sysent[i].stsy_underlying == NULL) {
781 			continue;
782 		}
783 
784 		if (dtrace_probe_lookup(machtrace_id, NULL,
785 		    mach_syscall_name_table[i], "entry") != 0) {
786 			continue;
787 		}
788 
789 		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
790 		    "entry", MACHTRACE_ARTIFICIAL_FRAMES,
791 		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
792 		(void) dtrace_probe_create(machtrace_id, NULL, mach_syscall_name_table[i],
793 		    "return", MACHTRACE_ARTIFICIAL_FRAMES,
794 		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
795 
796 		machtrace_sysent[i].stsy_entry = DTRACE_IDNONE;
797 		machtrace_sysent[i].stsy_return = DTRACE_IDNONE;
798 	}
799 }
800 
801 /*ARGSUSED*/
802 static void
machtrace_destroy(void * arg,dtrace_id_t id,void * parg)803 machtrace_destroy(void *arg, dtrace_id_t id, void *parg)
804 {
805 #pragma unused(arg,id) /* __APPLE__ */
806 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
807 
808 #pragma unused(sysnum) /* __APPLE__ */
809 
810 	/*
811 	 * There's nothing to do here but assert that we have actually been
812 	 * disabled.
813 	 */
814 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
815 		ASSERT(machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE);
816 	} else {
817 		ASSERT(machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
818 	}
819 }
820 
821 /*ARGSUSED*/
822 static int
machtrace_enable(void * arg,dtrace_id_t id,void * parg)823 machtrace_enable(void *arg, dtrace_id_t id, void *parg)
824 {
825 #pragma unused(arg) /* __APPLE__ */
826 
827 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
828 	int enabled = (machtrace_sysent[sysnum].stsy_entry != DTRACE_IDNONE ||
829 	    machtrace_sysent[sysnum].stsy_return != DTRACE_IDNONE);
830 
831 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
832 		machtrace_sysent[sysnum].stsy_entry = id;
833 	} else {
834 		machtrace_sysent[sysnum].stsy_return = id;
835 	}
836 
837 	if (enabled) {
838 		ASSERT(mach_trap_table[sysnum].mach_trap_function == (void *)dtrace_machtrace_syscall);
839 		return 0;
840 	}
841 
842 	lck_mtx_lock(&dtrace_systrace_lock);
843 
844 	if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
845 		/* It is not possible to write to mach_trap_table[] directly because it is const. */
846 		vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall);
847 		ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
848 	}
849 
850 	lck_mtx_unlock(&dtrace_systrace_lock);
851 
852 	return 0;
853 }
854 
855 /*ARGSUSED*/
856 static void
machtrace_disable(void * arg,dtrace_id_t id,void * parg)857 machtrace_disable(void *arg, dtrace_id_t id, void *parg)
858 {
859 #pragma unused(arg,id) /* __APPLE__ */
860 
861 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
862 	int disable = (machtrace_sysent[sysnum].stsy_entry == DTRACE_IDNONE ||
863 	    machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
864 
865 	if (disable) {
866 		/*
867 		 * Usage of volatile protects the if statement below from being optimized away.
868 		 *
869 		 * Compilers are clever and know that const array values can't change in time
870 		 * and the if below is always false. That is because it can't see that DTrace
871 		 * injects dtrace_machtrace_syscall dynamically and violates constness of the
872 		 * array.
873 		 */
874 		volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum];
875 
876 		lck_mtx_lock(&dtrace_systrace_lock);
877 		if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
878 			ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying,
879 			    (vm_offset_t)&machtrap->mach_trap_function, sizeof(vm_offset_t));
880 		}
881 		lck_mtx_unlock(&dtrace_systrace_lock);
882 	}
883 
884 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
885 		machtrace_sysent[sysnum].stsy_entry = DTRACE_IDNONE;
886 	} else {
887 		machtrace_sysent[sysnum].stsy_return = DTRACE_IDNONE;
888 	}
889 }
890 
/*
 * Stability attributes passed to dtrace_register() for the "mach_trap"
 * provider; the values are the same as in systrace_attr.
 * NOTE(review): the five rows presumably follow the member order of
 * dtrace_pattr_t -- confirm against sys/dtrace.h.
 */
static dtrace_pattr_t machtrace_attr = {
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
	{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
	{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
898 
/*
 * Provider operations vector passed to dtrace_register() for the "mach_trap"
 * provider.  Operations left NULL have no handler in this file; unlike the
 * syscall provider, no dtps_getargdesc handler is supplied.
 */
static dtrace_pops_t machtrace_pops = {
	.dtps_provide =         machtrace_provide,
	.dtps_provide_module =  NULL,
	.dtps_enable =          machtrace_enable,
	.dtps_disable =         machtrace_disable,
	.dtps_suspend =         NULL,
	.dtps_resume =          NULL,
	.dtps_getargdesc =      NULL,
	.dtps_getargval =       machtrace_getarg,
	.dtps_usermode =        NULL,
	.dtps_destroy =         machtrace_destroy
};
911 
912 static int
machtrace_attach(dev_info_t * devi)913 machtrace_attach(dev_info_t *devi)
914 {
915 	machtrace_probe = dtrace_probe;
916 	membar_enter();
917 
918 	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
919 	    DDI_PSEUDO, 0) == DDI_FAILURE ||
920 	    dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
921 	    &machtrace_pops, NULL, &machtrace_id) != 0) {
922 		machtrace_probe = (void*)&systrace_stub;
923 		ddi_remove_minor_node(devi, NULL);
924 		return DDI_FAILURE;
925 	}
926 
927 	return DDI_SUCCESS;
928 }
929 
930 d_open_t _systrace_open;
931 
/*
 * Open entry point of the systrace character device.
 * Ignores every argument and always reports success (0).
 */
int
_systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)dev;
	(void)flags;
	(void)devtype;
	(void)p;

	return 0;
}
938 
939 #define SYSTRACE_MAJOR  -24 /* let the kernel pick the device number */
940 
/*
 * Character device switch registered by systrace_init() via cdevsw_add().
 * Only open is implemented locally (_systrace_open); every other entry point
 * is wired to the corresponding eno_* routine.
 */
static struct cdevsw systrace_cdevsw =
{
	.d_open = _systrace_open,
	.d_close = eno_opcl,
	.d_read = eno_rdwrt,
	.d_write = eno_rdwrt,
	.d_ioctl = eno_ioctl,
	.d_stop = eno_stop,
	.d_reset = eno_reset,
	.d_select = eno_select,
	.d_mmap = eno_mmap,
	.d_strategy = eno_strat,
	.d_reserved_1 = eno_getc,
	.d_reserved_2 = eno_putc,
};
956 
957 void systrace_init( void );
958 
959 void
systrace_init(void)960 systrace_init( void )
961 {
962 	if (dtrace_sdt_probes_restricted()) {
963 		return;
964 	}
965 
966 	int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
967 
968 	if (majdevno < 0) {
969 		printf("systrace_init: failed to allocate a major number!\n");
970 		return;
971 	}
972 
973 	systrace_attach((dev_info_t*)(uintptr_t)majdevno);
974 	machtrace_attach((dev_info_t*)(uintptr_t)majdevno);
975 }
976 #undef SYSTRACE_MAJOR
977 
978 static uint64_t
systrace_getargval(void * arg,dtrace_id_t id,void * parg,int argno,int aframes)979 systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
980 {
981 #pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
982 	uint64_t val = 0;
983 	uint64_t *uargs = NULL;
984 
985 	uthread_t uthread = current_uthread();
986 
987 	if (uthread) {
988 		uargs = uthread->t_dtrace_syscall_args;
989 	}
990 	if (!uargs) {
991 		return 0;
992 	}
993 	if (argno < 0 || argno >= SYSTRACE_NARGS) {
994 		return 0;
995 	}
996 
997 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
998 	val = uargs[argno];
999 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1000 	return val;
1001 }
1002 
1003 static void
systrace_getargdesc(void * arg,dtrace_id_t id,void * parg,dtrace_argdesc_t * desc)1004 systrace_getargdesc(void *arg, dtrace_id_t id, void *parg,
1005     dtrace_argdesc_t *desc)
1006 {
1007 #pragma unused(arg, id)
1008 	int sysnum = SYSTRACE_SYSNUM(parg);
1009 	uthread_t uthread = current_uthread();
1010 	uint64_t *uargs = NULL;
1011 
1012 	if (!uthread) {
1013 		desc->dtargd_ndx = DTRACE_ARGNONE;
1014 		return;
1015 	}
1016 
1017 	uargs = uthread->t_dtrace_syscall_args;
1018 
1019 	if (SYSTRACE_ISENTRY((uintptr_t)parg)) {
1020 		systrace_entry_setargdesc(sysnum, desc->dtargd_ndx,
1021 		    desc->dtargd_native, sizeof(desc->dtargd_native));
1022 	} else {
1023 		systrace_return_setargdesc(sysnum, desc->dtargd_ndx,
1024 		    desc->dtargd_native, sizeof(desc->dtargd_native));
1025 	}
1026 
1027 	if (desc->dtargd_native[0] == '\0') {
1028 		desc->dtargd_ndx = DTRACE_ARGNONE;
1029 	}
1030 }
1031 
1032 static uint64_t
machtrace_getarg(void * arg,dtrace_id_t id,void * parg,int argno,int aframes)1033 machtrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
1034 {
1035 #pragma unused(arg,id,parg,aframes)     /* __APPLE__ */
1036 	uint64_t val = 0;
1037 	syscall_arg_t *stack = (syscall_arg_t *)NULL;
1038 
1039 	uthread_t uthread = current_uthread();
1040 
1041 	if (uthread) {
1042 		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;
1043 	}
1044 
1045 	if (!stack) {
1046 		return 0;
1047 	}
1048 
1049 	if (argno < 0 || argno >= MACHTRACE_NARGS) {
1050 		return 0;
1051 	}
1052 
1053 	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
1054 	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
1055 	val = (uint64_t)*(stack + argno);
1056 	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
1057 	return val;
1058 }
1059