xref: /linux-6.15/kernel/trace/trace_syscalls.c (revision 1ae4a971)
1 #include <trace/syscall.h>
2 #include <trace/events/syscalls.h>
3 #include <linux/kernel.h>
4 #include <linux/ftrace.h>
5 #include <linux/perf_event.h>
6 #include <asm/syscall.h>
7 
8 #include "trace_output.h"
9 #include "trace.h"
10 
11 static DEFINE_MUTEX(syscall_trace_lock);
12 static int sys_refcount_enter;
13 static int sys_refcount_exit;
14 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 
17 extern unsigned long __start_syscalls_metadata[];
18 extern unsigned long __stop_syscalls_metadata[];
19 
20 static struct syscall_metadata **syscalls_metadata;
21 
22 static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
23 {
24 	struct syscall_metadata *start;
25 	struct syscall_metadata *stop;
26 	char str[KSYM_SYMBOL_LEN];
27 
28 
29 	start = (struct syscall_metadata *)__start_syscalls_metadata;
30 	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
31 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
32 
33 	for ( ; start < stop; start++) {
34 		/*
35 		 * Only compare after the "sys" prefix. Archs that use
36 		 * syscall wrappers may have syscalls symbols aliases prefixed
37 		 * with "SyS" instead of "sys", leading to an unwanted
38 		 * mismatch.
39 		 */
40 		if (start->name && !strcmp(start->name + 3, str + 3))
41 			return start;
42 	}
43 	return NULL;
44 }
45 
46 static struct syscall_metadata *syscall_nr_to_meta(int nr)
47 {
48 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
49 		return NULL;
50 
51 	return syscalls_metadata[nr];
52 }
53 
54 int syscall_name_to_nr(char *name)
55 {
56 	int i;
57 
58 	if (!syscalls_metadata)
59 		return -1;
60 
61 	for (i = 0; i < NR_syscalls; i++) {
62 		if (syscalls_metadata[i]) {
63 			if (!strcmp(syscalls_metadata[i]->name, name))
64 				return i;
65 		}
66 	}
67 	return -1;
68 }
69 
/*
 * Record the registered trace event id for syscall @num's entry event.
 * NOTE(review): no bounds/NULL check on syscalls_metadata[num] — callers
 * presumably only pass numbers validated via syscall_name_to_nr(); confirm.
 */
void set_syscall_enter_id(int num, int id)
{
	syscalls_metadata[num]->enter_id = id;
}
74 
/*
 * Record the registered trace event id for syscall @num's exit event.
 * NOTE(review): no bounds/NULL check on syscalls_metadata[num] — callers
 * presumably only pass numbers validated via syscall_name_to_nr(); confirm.
 */
void set_syscall_exit_id(int num, int id)
{
	syscalls_metadata[num]->exit_id = id;
}
79 
/*
 * Pretty-print a raw syscall-entry entry from the ring buffer as
 * "name(arg: value, ...)" into the iterator's seq buffer.  With the
 * verbose flag set, each argument value is prefixed with its C type.
 *
 * Returns TRACE_TYPE_HANDLED on success, or TRACE_TYPE_PARTIAL_LINE if
 * the seq buffer ran out of room mid-line (trace_seq_* return 0 then).
 */
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall: just terminate the output line */
	if (!entry)
		goto end;

	/* The entry's event type must match the id registered for it */
	if (entry->enter_id != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}
131 
/*
 * Pretty-print a raw syscall-exit entry from the ring buffer as
 * "name -> 0xretval" into the iterator's seq buffer.
 *
 * Returns TRACE_TYPE_HANDLED on success, TRACE_TYPE_UNHANDLED on an
 * event-id mismatch, or TRACE_TYPE_PARTIAL_LINE if the seq buffer
 * filled up.
 */
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall: emit an empty line and carry on */
	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	/* The entry's event type must match the id registered for it */
	if (entry->exit_id != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}
163 
/* Never defined: referencing it forces a link error on a size mismatch */
extern char *__bad_type_size(void);

/*
 * Expand to the "type-string, name-string, offset, size, signedness"
 * argument list expected by the field helpers, for the field @name of
 * the local variable `trace`.  If sizeof(type) does not match the actual
 * field size, the __bad_type_size() reference breaks the build.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
171 
/*
 * Write the "format" file contents for a syscall entry event: one
 * field line per argument plus the syscall nr, followed by a print fmt
 * string so userspace parsers can render the raw record.
 *
 * Returns non-zero on success, 0 if the seq buffer overflowed or the
 * syscall has no metadata.
 */
int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
{
	int i;
	int nr;
	int ret;
	struct syscall_metadata *entry;
	struct syscall_trace_enter trace;
	int offset = offsetof(struct syscall_trace_enter, args);

	nr = syscall_name_to_nr(call->data);
	entry = syscall_nr_to_meta(nr);

	if (!entry)
		return 0;

	ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
			       "\tsigned:%u;\n",
			       SYSCALL_FIELD(int, nr));
	if (!ret)
		return 0;

	/* each syscall argument is stored as one unsigned long */
	for (i = 0; i < entry->nb_args; i++) {
		ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
				        entry->args[i]);
		if (!ret)
			return 0;
		ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
				       "\tsigned:%u;\n", offset,
				       sizeof(unsigned long),
				       is_signed_type(unsigned long));
		if (!ret)
			return 0;
		offset += sizeof(unsigned long);
	}

	/* the quoted printf-style format string */
	trace_seq_puts(s, "\nprint fmt: \"");
	for (i = 0; i < entry->nb_args; i++) {
		ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
				        sizeof(unsigned long),
					i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return 0;
	}
	trace_seq_putc(s, '"');

	/* followed by the argument list referencing the record fields */
	for (i = 0; i < entry->nb_args; i++) {
		ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
				       entry->args[i]);
		if (!ret)
			return 0;
	}

	return trace_seq_putc(s, '\n');
}
226 
/*
 * Write the "format" file contents for a syscall exit event: the
 * syscall nr and return value fields, plus the print fmt string.
 *
 * Returns non-zero on success, 0 if the seq buffer overflowed.
 */
int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
{
	int ret;
	struct syscall_trace_exit trace;

	ret = trace_seq_printf(s,
			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
			       "\tsigned:%u;\n"
			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
			       "\tsigned:%u;\n",
			       SYSCALL_FIELD(int, nr),
			       SYSCALL_FIELD(long, ret));
	if (!ret)
		return 0;

	return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
}
244 
245 int syscall_enter_define_fields(struct ftrace_event_call *call)
246 {
247 	struct syscall_trace_enter trace;
248 	struct syscall_metadata *meta;
249 	int ret;
250 	int nr;
251 	int i;
252 	int offset = offsetof(typeof(trace), args);
253 
254 	nr = syscall_name_to_nr(call->data);
255 	meta = syscall_nr_to_meta(nr);
256 
257 	if (!meta)
258 		return 0;
259 
260 	ret = trace_define_common_fields(call);
261 	if (ret)
262 		return ret;
263 
264 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
265 	if (ret)
266 		return ret;
267 
268 	for (i = 0; i < meta->nb_args; i++) {
269 		ret = trace_define_field(call, meta->types[i],
270 					 meta->args[i], offset,
271 					 sizeof(unsigned long), 0,
272 					 FILTER_OTHER);
273 		offset += sizeof(unsigned long);
274 	}
275 
276 	return ret;
277 }
278 
279 int syscall_exit_define_fields(struct ftrace_event_call *call)
280 {
281 	struct syscall_trace_exit trace;
282 	int ret;
283 
284 	ret = trace_define_common_fields(call);
285 	if (ret)
286 		return ret;
287 
288 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
289 	if (ret)
290 		return ret;
291 
292 	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
293 				 FILTER_OTHER);
294 
295 	return ret;
296 }
297 
/*
 * Tracepoint probe fired on syscall entry: if tracing is enabled for
 * this syscall, record its number and arguments into the ftrace ring
 * buffer.
 */
void ftrace_syscall_enter(struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	/* syscall_get_nr() can return a negative value on non-syscall paths */
	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* variable-size record: one unsigned long per syscall argument */
	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
						  size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	/* commit unless the event filter discards the record */
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
332 
/*
 * Tracepoint probe fired on syscall exit: if tracing is enabled for
 * this syscall, record its number and return value into the ftrace
 * ring buffer.
 */
void ftrace_syscall_exit(struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	/* syscall_get_nr() can return a negative value on non-syscall paths */
	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
				sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	/* commit unless the event filter discards the record */
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
364 
365 int reg_event_syscall_enter(struct ftrace_event_call *call)
366 {
367 	int ret = 0;
368 	int num;
369 	char *name;
370 
371 	name = (char *)call->data;
372 	num = syscall_name_to_nr(name);
373 	if (num < 0 || num >= NR_syscalls)
374 		return -ENOSYS;
375 	mutex_lock(&syscall_trace_lock);
376 	if (!sys_refcount_enter)
377 		ret = register_trace_sys_enter(ftrace_syscall_enter);
378 	if (ret) {
379 		pr_info("event trace: Could not activate"
380 				"syscall entry trace point");
381 	} else {
382 		set_bit(num, enabled_enter_syscalls);
383 		sys_refcount_enter++;
384 	}
385 	mutex_unlock(&syscall_trace_lock);
386 	return ret;
387 }
388 
389 void unreg_event_syscall_enter(struct ftrace_event_call *call)
390 {
391 	int num;
392 	char *name;
393 
394 	name = (char *)call->data;
395 	num = syscall_name_to_nr(name);
396 	if (num < 0 || num >= NR_syscalls)
397 		return;
398 	mutex_lock(&syscall_trace_lock);
399 	sys_refcount_enter--;
400 	clear_bit(num, enabled_enter_syscalls);
401 	if (!sys_refcount_enter)
402 		unregister_trace_sys_enter(ftrace_syscall_enter);
403 	mutex_unlock(&syscall_trace_lock);
404 }
405 
406 int reg_event_syscall_exit(struct ftrace_event_call *call)
407 {
408 	int ret = 0;
409 	int num;
410 	char *name;
411 
412 	name = call->data;
413 	num = syscall_name_to_nr(name);
414 	if (num < 0 || num >= NR_syscalls)
415 		return -ENOSYS;
416 	mutex_lock(&syscall_trace_lock);
417 	if (!sys_refcount_exit)
418 		ret = register_trace_sys_exit(ftrace_syscall_exit);
419 	if (ret) {
420 		pr_info("event trace: Could not activate"
421 				"syscall exit trace point");
422 	} else {
423 		set_bit(num, enabled_exit_syscalls);
424 		sys_refcount_exit++;
425 	}
426 	mutex_unlock(&syscall_trace_lock);
427 	return ret;
428 }
429 
430 void unreg_event_syscall_exit(struct ftrace_event_call *call)
431 {
432 	int num;
433 	char *name;
434 
435 	name = call->data;
436 	num = syscall_name_to_nr(name);
437 	if (num < 0 || num >= NR_syscalls)
438 		return;
439 	mutex_lock(&syscall_trace_lock);
440 	sys_refcount_exit--;
441 	clear_bit(num, enabled_exit_syscalls);
442 	if (!sys_refcount_exit)
443 		unregister_trace_sys_exit(ftrace_syscall_exit);
444 	mutex_unlock(&syscall_trace_lock);
445 }
446 
/* Output handler used to render raw syscall entry events */
struct trace_event event_syscall_enter = {
	.trace			= print_syscall_enter,
};

/* Output handler used to render raw syscall exit events */
struct trace_event event_syscall_exit = {
	.trace			= print_syscall_exit,
};
454 
455 int __init init_ftrace_syscalls(void)
456 {
457 	struct syscall_metadata *meta;
458 	unsigned long addr;
459 	int i;
460 
461 	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
462 					NR_syscalls, GFP_KERNEL);
463 	if (!syscalls_metadata) {
464 		WARN_ON(1);
465 		return -ENOMEM;
466 	}
467 
468 	for (i = 0; i < NR_syscalls; i++) {
469 		addr = arch_syscall_addr(i);
470 		meta = find_syscall_meta(addr);
471 		syscalls_metadata[i] = meta;
472 	}
473 
474 	return 0;
475 }
476 core_initcall(init_ftrace_syscalls);
477 
478 #ifdef CONFIG_EVENT_PROFILE
479 
480 static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
481 static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
482 static int sys_prof_refcount_enter;
483 static int sys_prof_refcount_exit;
484 
485 static void prof_syscall_enter(struct pt_regs *regs, long id)
486 {
487 	struct syscall_metadata *sys_data;
488 	struct syscall_trace_enter *rec;
489 	unsigned long flags;
490 	char *trace_buf;
491 	char *raw_data;
492 	int syscall_nr;
493 	int rctx;
494 	int size;
495 	int cpu;
496 
497 	syscall_nr = syscall_get_nr(current, regs);
498 	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
499 		return;
500 
501 	sys_data = syscall_nr_to_meta(syscall_nr);
502 	if (!sys_data)
503 		return;
504 
505 	/* get the size after alignment with the u32 buffer size field */
506 	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
507 	size = ALIGN(size + sizeof(u32), sizeof(u64));
508 	size -= sizeof(u32);
509 
510 	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
511 		      "profile buffer not large enough"))
512 		return;
513 
514 	/* Protect the per cpu buffer, begin the rcu read side */
515 	local_irq_save(flags);
516 
517 	rctx = perf_swevent_get_recursion_context();
518 	if (rctx < 0)
519 		goto end_recursion;
520 
521 	cpu = smp_processor_id();
522 
523 	trace_buf = rcu_dereference(perf_trace_buf);
524 
525 	if (!trace_buf)
526 		goto end;
527 
528 	raw_data = per_cpu_ptr(trace_buf, cpu);
529 
530 	/* zero the dead bytes from align to not leak stack to user */
531 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
532 
533 	rec = (struct syscall_trace_enter *) raw_data;
534 	tracing_generic_entry_update(&rec->ent, 0, 0);
535 	rec->ent.type = sys_data->enter_id;
536 	rec->nr = syscall_nr;
537 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
538 			       (unsigned long *)&rec->args);
539 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
540 
541 end:
542 	perf_swevent_put_recursion_context(rctx);
543 end_recursion:
544 	local_irq_restore(flags);
545 }
546 
547 int reg_prof_syscall_enter(char *name)
548 {
549 	int ret = 0;
550 	int num;
551 
552 	num = syscall_name_to_nr(name);
553 	if (num < 0 || num >= NR_syscalls)
554 		return -ENOSYS;
555 
556 	mutex_lock(&syscall_trace_lock);
557 	if (!sys_prof_refcount_enter)
558 		ret = register_trace_sys_enter(prof_syscall_enter);
559 	if (ret) {
560 		pr_info("event trace: Could not activate"
561 				"syscall entry trace point");
562 	} else {
563 		set_bit(num, enabled_prof_enter_syscalls);
564 		sys_prof_refcount_enter++;
565 	}
566 	mutex_unlock(&syscall_trace_lock);
567 	return ret;
568 }
569 
570 void unreg_prof_syscall_enter(char *name)
571 {
572 	int num;
573 
574 	num = syscall_name_to_nr(name);
575 	if (num < 0 || num >= NR_syscalls)
576 		return;
577 
578 	mutex_lock(&syscall_trace_lock);
579 	sys_prof_refcount_enter--;
580 	clear_bit(num, enabled_prof_enter_syscalls);
581 	if (!sys_prof_refcount_enter)
582 		unregister_trace_sys_enter(prof_syscall_enter);
583 	mutex_unlock(&syscall_trace_lock);
584 }
585 
586 static void prof_syscall_exit(struct pt_regs *regs, long ret)
587 {
588 	struct syscall_metadata *sys_data;
589 	struct syscall_trace_exit *rec;
590 	unsigned long flags;
591 	int syscall_nr;
592 	char *trace_buf;
593 	char *raw_data;
594 	int rctx;
595 	int size;
596 	int cpu;
597 
598 	syscall_nr = syscall_get_nr(current, regs);
599 	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
600 		return;
601 
602 	sys_data = syscall_nr_to_meta(syscall_nr);
603 	if (!sys_data)
604 		return;
605 
606 	/* We can probably do that at build time */
607 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
608 	size -= sizeof(u32);
609 
610 	/*
611 	 * Impossible, but be paranoid with the future
612 	 * How to put this check outside runtime?
613 	 */
614 	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
615 		"exit event has grown above profile buffer size"))
616 		return;
617 
618 	/* Protect the per cpu buffer, begin the rcu read side */
619 	local_irq_save(flags);
620 
621 	rctx = perf_swevent_get_recursion_context();
622 	if (rctx < 0)
623 		goto end_recursion;
624 
625 	cpu = smp_processor_id();
626 
627 	trace_buf = rcu_dereference(perf_trace_buf);
628 
629 	if (!trace_buf)
630 		goto end;
631 
632 	raw_data = per_cpu_ptr(trace_buf, cpu);
633 
634 	/* zero the dead bytes from align to not leak stack to user */
635 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
636 
637 	rec = (struct syscall_trace_exit *)raw_data;
638 
639 	tracing_generic_entry_update(&rec->ent, 0, 0);
640 	rec->ent.type = sys_data->exit_id;
641 	rec->nr = syscall_nr;
642 	rec->ret = syscall_get_return_value(current, regs);
643 
644 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
645 
646 end:
647 	perf_swevent_put_recursion_context(rctx);
648 end_recursion:
649 	local_irq_restore(flags);
650 }
651 
652 int reg_prof_syscall_exit(char *name)
653 {
654 	int ret = 0;
655 	int num;
656 
657 	num = syscall_name_to_nr(name);
658 	if (num < 0 || num >= NR_syscalls)
659 		return -ENOSYS;
660 
661 	mutex_lock(&syscall_trace_lock);
662 	if (!sys_prof_refcount_exit)
663 		ret = register_trace_sys_exit(prof_syscall_exit);
664 	if (ret) {
665 		pr_info("event trace: Could not activate"
666 				"syscall entry trace point");
667 	} else {
668 		set_bit(num, enabled_prof_exit_syscalls);
669 		sys_prof_refcount_exit++;
670 	}
671 	mutex_unlock(&syscall_trace_lock);
672 	return ret;
673 }
674 
675 void unreg_prof_syscall_exit(char *name)
676 {
677 	int num;
678 
679 	num = syscall_name_to_nr(name);
680 	if (num < 0 || num >= NR_syscalls)
681 		return;
682 
683 	mutex_lock(&syscall_trace_lock);
684 	sys_prof_refcount_exit--;
685 	clear_bit(num, enabled_prof_exit_syscalls);
686 	if (!sys_prof_refcount_exit)
687 		unregister_trace_sys_exit(prof_syscall_exit);
688 	mutex_unlock(&syscall_trace_lock);
689 }
690 
691 #endif
692 
693 
694