1 // Copyright (c) 2023 Apple Inc.  All rights reserved.
2 
#include <assert.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <stdatomic.h>
#include <sys/work_interval.h>
#include <ktrace.h>
#include <sys/kdebug.h>
16 
17 #include <darwintest.h>
18 #include <darwintest_utils.h>
19 #include "test_utils.h"
20 #include "sched_test_utils.h"
21 
22 T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"),
23     T_META_RADAR_COMPONENT_NAME("xnu"),
24     T_META_RADAR_COMPONENT_VERSION("scheduler"),
25     T_META_TAG_VM_NOT_ELIGIBLE,
26     T_META_REQUIRES_SYSCTL_NE("debug.sched_hygiene_debug_available", 0));
27 
/* Test parameters; assigned at the top of the T_DECL body before any threads start */
static int BACKGROUND_PRI;
static int NUM_THREADS;
static uint64_t SLEEP_SECONDS;
static uint64_t INTERRUPT_DISABLE_TIMEOUT_NS;

/* mach_absolute_time() snapshot taken just before thread creation; spin/yield deadlines are offsets from it */
static uint64_t start_timestamp = 0ULL;
static volatile int sum = 0; // Keeps the spin-loop below from compiling-out
35 
/*
 * Worker thread body: create and join a unique thread group, then stay
 * runnable long enough for the scheduler's maintenance scan to see all of
 * the groups at once.
 */
static void *
make_tg_and_spin(__unused void *arg)
{
	int ret;
	/* We wind down ~2 seconds early below, so the budget must exceed that */
	assert(SLEEP_SECONDS > 2);

	/* Create and join a new thread group (TG) */
	work_interval_t wi_handle;
	ret = work_interval_create(&wi_handle, WORK_INTERVAL_FLAG_GROUP);
	/* Fix: the message is the format argument; previously a stray 0 was passed as the fmt */
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_create");

	/* Allow other threads a chance to get on-core and create/join their own TGs */
	uint64_t yield_deadline = start_timestamp + nanos_to_abs(1 * NSEC_PER_SEC);
	while (mach_absolute_time() < yield_deadline) {
		sched_yield();
	}

	/*
	 * Remain runnable long enough for the sched_maintenance_thread to scan the
	 * many created TGs all at the same time in one scheduler tick.
	 */
	uint64_t spin_deadline = start_timestamp + nanos_to_abs((SLEEP_SECONDS - 2) * NSEC_PER_SEC);
	while (mach_absolute_time() < spin_deadline) {
		sum++;
	}

	/* Release the work interval so the handle does not leak when the thread exits */
	ret = work_interval_destroy(wi_handle);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_destroy");

	/*
	 * Terminate with about a second to spare of SLEEP_SECONDS, so that we have
	 * time to bring down the number of runnable thread groups before the test
	 * case reenables the previous kern.interrupt_masked_debug_mode value.
	 * Otherwise, a system failing this test could panic.
	 */
	return NULL;
}
70 
71 static void
start_threads(pthread_t * threads,void * (* start_routine)(void *),int priority,int num_threads)72 start_threads(pthread_t *threads, void *(*start_routine)(void *), int priority, int num_threads)
73 {
74 	int rv;
75 	pthread_attr_t attr;
76 
77 	rv = pthread_attr_init(&attr);
78 	T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_init");
79 
80 	for (int i = 0; i < num_threads; i++) {
81 		struct sched_param param = { .sched_priority = priority };
82 
83 		rv = pthread_attr_setschedparam(&attr, &param);
84 		T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setschedparam");
85 
86 		rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
87 		T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setdetachstate");
88 
89 		/* Make the thread stacks smaller, so pthread will let us make more */
90 		rv = pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
91 		T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setstacksize");
92 
93 		rv = pthread_create(&threads[i], &attr, start_routine, NULL);
94 		T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_create");
95 	}
96 
97 	rv = pthread_attr_destroy(&attr);
98 	T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_destroy");
99 }
100 
/* Saved value of kern.sched_preemption_disable_debug_mode, captured before the test overrides it */
static uint64_t old_preemption_debug_mode = 0;
static size_t old_preemption_debug_mode_size = sizeof(old_preemption_debug_mode);

/* T_ATEND handler: write the saved preemption-disable debug mode back to the kernel */
static void
restore_preemption_disable_debug_mode(void)
{
	int rc;

	rc = sysctlbyname("kern.sched_preemption_disable_debug_mode", NULL, NULL,
	    &old_preemption_debug_mode, old_preemption_debug_mode_size);
	T_QUIET; T_ASSERT_POSIX_ZERO(rc, "kern.sched_preemption_disable_debug_mode");
	T_LOG("kern.sched_preemption_disable_debug_mode restored to previous value: %llu", old_preemption_debug_mode);
}
112 
/* Saved value of kern.interrupt_masked_debug_mode, captured before the test overrides it */
static uint64_t old_interrupt_debug_mode = 0;
static size_t old_interrupt_debug_mode_size = sizeof(old_interrupt_debug_mode);

/* T_ATEND handler: write the saved interrupt-masked debug mode back to the kernel */
static void
restore_interrupt_disable_debug_mode(void)
{
	int rc;

	rc = sysctlbyname("kern.interrupt_masked_debug_mode", NULL, NULL,
	    &old_interrupt_debug_mode, old_interrupt_debug_mode_size);
	T_QUIET; T_ASSERT_POSIX_ZERO(rc, "kern.interrupt_masked_debug_mode");
	T_LOG("kern.interrupt_masked_debug_mode restored to previous value: %llu", old_interrupt_debug_mode);
}
124 
/* Saved value of kern.interrupt_masked_threshold_mt, captured before the test overrides it */
static uint64_t old_interrupt_disable_timeout = 0;
static size_t old_interrupt_disable_timeout_size = sizeof(old_interrupt_disable_timeout);

/* T_ATEND handler: write the saved interrupt-masked threshold back to the kernel */
static void
restore_interrupt_disable_timeout(void)
{
	int rc;

	rc = sysctlbyname("kern.interrupt_masked_threshold_mt", NULL, NULL,
	    &old_interrupt_disable_timeout, old_interrupt_disable_timeout_size);
	T_QUIET; T_ASSERT_POSIX_ZERO(rc, "kern.interrupt_masked_threshold_mt");
	T_LOG("kern.interrupt_masked_threshold_mt restored to previous value: %llu", old_interrupt_disable_timeout);
}
136 
137 static char *trace_location = NULL;
138 
139 static void
delete_trace_file(void)140 delete_trace_file(void)
141 {
142 	int ret;
143 	/* Delete trace file in order to reclaim disk space on the test device */
144 	ret = remove(trace_location);
145 	T_QUIET; T_WITH_ERRNO; T_ASSERT_POSIX_SUCCESS(ret, "remove trace file");
146 }
147 
148 static const char *ktrace_file_short_name = "overload_runqueue_with_thread_groups.ktrace";
149 
150 static void
save_collected_ktrace(char * trace_path)151 save_collected_ktrace(char *trace_path)
152 {
153 	int ret;
154 
155 	T_LOG("ktrace file saved at \"%s\"", trace_path);
156 	ret = chmod(trace_path, 0666);
157 	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "chmod");
158 
159 	char compressed_path[MAXPATHLEN];
160 	snprintf(compressed_path, MAXPATHLEN, "%s.tar.gz", ktrace_file_short_name);
161 	ret = dt_resultfile(compressed_path, sizeof(compressed_path));
162 	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dt_resultfile marking \"%s\" for collection", compressed_path);
163 	T_LOG("\"%s\" marked for upload", compressed_path);
164 
165 	char *tar_args[] = {"/usr/bin/tar", "-czvf", compressed_path, trace_path, NULL};
166 	pid_t tar_pid = dt_launch_tool_pipe(tar_args, false, NULL,
167 	    ^bool (__unused char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) {
168 		return true;
169 	},
170 	    ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) {
171 		T_LOG("[tar] Error msg: %s", data);
172 		return true;
173 	},
174 	    BUFFER_PATTERN_LINE, NULL);
175 
176 	T_QUIET; T_ASSERT_TRUE(tar_pid, "[tar] pid %d", tar_pid);
177 	ret = dt_waitpid(tar_pid, NULL, NULL, 0);
178 	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dt_waitpid");
179 
180 	ret = chmod(compressed_path, 0666);
181 	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "chmod");
182 }
183 
184 /*
185  * Parse the recorded ktrace file to see if we crossed the set interrupt-disabled
186  * timeout and thus failed the test.
187  */
188 static void
search_for_interrupt_disable_timeout_tracepoint(char * trace_path)189 search_for_interrupt_disable_timeout_tracepoint(char *trace_path)
190 {
191 	__block int ret;
192 	trace_location = trace_path;
193 	T_ATEND(delete_trace_file);
194 	ktrace_session_t read_session = ktrace_session_create();
195 	ret = ktrace_set_file(read_session, trace_path);
196 	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_set_file");
197 	__block const char *offending_thread = NULL;
198 
199 	ktrace_events_single(read_session, MACHDBG_CODE(DBG_MACH_SCHED, MACH_INT_MASKED_EXPIRED), ^(ktrace_event_t e) {
200 		if (offending_thread == NULL) {
201 		        T_LOG("Interrupts were held disabled for %llu ns, crossing the %llu ns threshold:", abs_to_nanos(e->arg1), INTERRUPT_DISABLE_TIMEOUT_NS);
202 		        ret = ktrace_print_trace_point(stdout, read_session, e, KTP_KIND_CSV,
203 		        KTP_FLAG_WALLTIME | KTP_FLAG_THREADNAME | KTP_FLAG_PID | KTP_FLAG_EVENTNAME | KTP_FLAG_EXECNAME);
204 		        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "ktrace_print_trace_point output");
205 		        printf("\n"); // Flush output from ktrace_print_trace_point
206 		        offending_thread = ktrace_get_name_for_thread(read_session, e->threadid);
207 		        ktrace_end(read_session, 0);
208 		}
209 	});
210 
211 	ktrace_set_completion_handler(read_session, ^{
212 		if (offending_thread == NULL) {
213 		        T_PASS("Scheduler survived %d simulatenously runnable thread groups without disabling interrupts for more than %llu ns!", NUM_THREADS, INTERRUPT_DISABLE_TIMEOUT_NS);
214 		} else {
215 		        save_collected_ktrace(trace_path);
216 		        T_FAIL("Interrupts held disabled for more than %llu ns by thread \"%s\"", INTERRUPT_DISABLE_TIMEOUT_NS, offending_thread);
217 		}
218 		T_END;
219 	});
220 
221 	ret = ktrace_start(read_session, dispatch_get_main_queue());
222 	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start");
223 }
224 
T_DECL(overload_runqueue_with_thread_groups,
    "Overload the runqueue with distinct thread groups to verify that the scheduler"
    "does not trip an interrupts-disabled timeout whenever it scans the runqueue",
    T_META_ASROOT(true),
    XNU_T_META_SOC_SPECIFIC,
    T_META_ENABLED(TARGET_OS_IOS))
{
	if (platform_is_virtual_machine()) {
		T_SKIP("Test not supposed to run on virtual machine. rdar://132930927");
	}

	/* Test parameters: many low-priority threads, each in its own thread group */
	BACKGROUND_PRI = 4;
	NUM_THREADS = 1000;
	SLEEP_SECONDS = 20;
	/* Matches DEFAULT_INTERRUPT_MASKED_TIMEOUT value in XNU */
	INTERRUPT_DISABLE_TIMEOUT_NS = 500 * NSEC_PER_USEC; // 500 microseconds

	__block int ret;

	/*
	 * Configure interrupts-disabled timeout to drop a ktracepoint.
	 * Mode 1 emits a tracepoint instead of panicking on timeout; the debug
	 * modes must be switched BEFORE lowering the threshold below, or a
	 * crossing could panic the device.
	 */
	uint64_t emit_tracepoint_mode = 1;
	ret = sysctlbyname("kern.interrupt_masked_debug_mode",
	    &old_interrupt_debug_mode, &old_interrupt_debug_mode_size,
	    &emit_tracepoint_mode, sizeof(emit_tracepoint_mode));
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "kern.interrupt_masked_debug_mode");
	T_ATEND(restore_interrupt_disable_debug_mode);
	/* Configure the preemption-disabled debug mode as well, to avoid panicking if the test fails */
	ret = sysctlbyname("kern.sched_preemption_disable_debug_mode",
	    &old_preemption_debug_mode, &old_preemption_debug_mode_size,
	    &emit_tracepoint_mode, sizeof(emit_tracepoint_mode));
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "kern.sched_preemption_disable_debug_mode");
	T_ATEND(restore_preemption_disable_debug_mode);

	/* Set interrupts-disabled timeout threshold (in mach timebase units) */
	uint64_t disable_timeout = nanos_to_abs(INTERRUPT_DISABLE_TIMEOUT_NS);
	ret = sysctlbyname("kern.interrupt_masked_threshold_mt",
	    &old_interrupt_disable_timeout, &old_interrupt_disable_timeout_size,
	    &disable_timeout, sizeof(disable_timeout));
	T_QUIET; T_WITH_ERRNO; T_ASSERT_POSIX_ZERO(ret, "kern.interrupt_masked_threshold_mt");
	T_ATEND(restore_interrupt_disable_timeout);

	/* Use ktrace to observe if the interrupt-disable timeout drops a tracepoint */
	ktrace_session_t session = ktrace_session_create();
	T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(session, "ktrace_session_create");
	/* Build <tmpdir>/<short name> and remove any stale file from a prior run */
	char filepath_arr[MAXPATHLEN] = "";
	const char *tmp_dir = dt_tmpdir();
	strlcpy(filepath_arr, tmp_dir, sizeof(filepath_arr));
	strlcat(filepath_arr, "/", sizeof(filepath_arr));
	strlcat(filepath_arr, ktrace_file_short_name, sizeof(filepath_arr));
	ret = remove(filepath_arr);
	/* ENOENT is fine: no stale trace file existed */
	T_QUIET; T_WITH_ERRNO; T_ASSERT_TRUE((ret == 0) || (errno == ENOENT), "remove");
	char *filepath = filepath_arr;
	ret = ktrace_events_filter(session, "C0x01", ^(__unused ktrace_event_t event){}); // records scheduler tracepoints
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_events_filter");
	/* Once recording ends, scan the file for the timeout tracepoint (calls T_END) */
	ktrace_set_completion_handler(session, ^{
		search_for_interrupt_disable_timeout_tracepoint(filepath);
	});
	ret = ktrace_start_writing_path(session, filepath, 0);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start_writing_path");

	/* Spin up lots of threads, each creating and joining its own thread group */
	T_LOG("Creating %d threads at pri %d, each with a unique thread group", NUM_THREADS, BACKGROUND_PRI);
	start_timestamp = mach_absolute_time();
	pthread_t *bg_threads = malloc(sizeof(pthread_t) * (size_t)NUM_THREADS);
	start_threads(bg_threads, make_tg_and_spin, BACKGROUND_PRI, NUM_THREADS);

	/* Workers self-terminate ~2s before this fires, so the trace ends on a quiet system */
	T_LOG("Waiting %llu seconds to see if the scheduler can handle it...", SLEEP_SECONDS);
	dispatch_after(dispatch_time(DISPATCH_TIME_NOW, (int64_t)(SLEEP_SECONDS * NSEC_PER_SEC)), dispatch_get_main_queue(), ^{
		ktrace_end(session, 0);
	});
	dispatch_main();
}
297