xref: /xnu-11215/tests/setitimer.c (revision 8d741a5d)
1 #include <darwintest.h>
2 
3 #include <assert.h>
4 #include <mach/clock_types.h>
5 #include <unistd.h>
6 #include <stdlib.h>
7 #include <stdio.h>
8 #include <errno.h>
9 #include <err.h>
10 #include <sys/time.h>
11 #include <mach/mach.h>
12 #include <mach/mach_time.h>
13 #include <pthread.h>
14 #include <perfdata/perfdata.h>
15 #include <sys/sysctl.h>
16 #include <sys/stat.h>
17 #include <sys/mount.h>
18 #include <stdbool.h>
19 #include <signal.h>
20 #include <sys/resource.h>
21 #include <sys/resource_private.h>
22 #include <time.h>
23 #include <os/atomic_private.h>
24 #include <libproc.h>
25 #include <TargetConditionals.h>
26 
27 #if __has_include(<mach/mach_time_private.h>)
28 #include <mach/mach_time_private.h>
29 #else
30 kern_return_t           mach_get_times(uint64_t* absolute_time,
31     uint64_t* continuous_time,
32     struct timespec *tp);
33 #endif
34 
35 /*
36  * This test program creates up to 8 worker threads performing
37  * mixed workloads of system calls (which contribute to both
38  * user and system time), as well as spins in userspace (which
39  * only contribute to user time).
40  *
41  * setitimer(2) is used to program timers that fire signals
42  * after various thresholds. The signal handler detects
43  * which thread the signal was delivered on by matching the
44  * stack pointer to ranges for each thread.
45  *
46  * After the test scenario is complete, the distribution of
47  * threads which received interrupts is evaluated to match
48  * expected heuristics.
49  */
50 
51 T_GLOBAL_META(
52 	T_META_RUN_CONCURRENTLY(false),
53 	T_META_CHECK_LEAKS(false),
54 	T_META_ALL_VALID_ARCHS(true),
55 	T_META_RADAR_COMPONENT_NAME("xnu"),
56 	T_META_RADAR_COMPONENT_VERSION("scheduler"),
57 	T_META_OWNER("chimene"),
58 	T_META_ENABLED(TARGET_OS_OSX),
59 	T_META_TAG_VM_NOT_ELIGIBLE
60 	);
61 
62 static void *stat_thread(void *arg);
63 static void *statfs_thread(void *arg);
64 
65 static void alrm_handler(int, struct __siginfo *, void *);
66 
67 static semaphore_t gMainWaitForWorkers;
68 static semaphore_t gWorkersStart;
69 
70 static pthread_mutex_t gShouldExitMutex = PTHREAD_MUTEX_INITIALIZER;
71 static pthread_cond_t  gShouldExitCondition = PTHREAD_COND_INITIALIZER;
72 
73 static _Atomic bool gShouldExit = false;
74 
75 static const uint32_t max_threads = 9;
76 
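/*
 * Each entry is aligned to 128 bytes, presumably so the per-thread signal
 * counters updated from the signal handler don't share cache lines between
 * threads (avoiding false sharing).
 */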
77 static struct threadentry {
78 	pthread_t thread;
79 	uint64_t tid;
80 	void* stack_addr;
81 	size_t stack_size;
82 	bool expect_cpu_usage;
83 	uint32_t alrm_count;
84 	uint32_t vtalrm_count;
85 	uint32_t prof_count;
86 	uint32_t xcpu_count;
87 	struct thsc_time_cpi self_stats;
88 } __attribute__((aligned(128))) gThreadList[max_threads];
89 
90 static uint32_t nworkers;
91 static uint32_t nthreads;
92 
93 static double offcore_time_percent_threshold = 75.0;
94 
95 static bool is_rosetta = false;
96 
97 static mach_timebase_info_data_t timebase_info;
98 
99 static uint64_t
100 abs_to_nanos(uint64_t abs)
101 {
102 	return abs * timebase_info.numer / timebase_info.denom;
103 }
104 
105 /* Some statistics APIs return host abstime instead of Rosetta-translated abstime */
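/* The hardcoded 125/3 is assumed to be the native Apple Silicon timebase (24 MHz ticks);
 * under Rosetta, mach_timebase_info() reports the translated 1/1 (nanosecond) timebase. */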
106 static uint64_t
107 abs_to_nanos_host(uint64_t abstime)
108 {
109 	if (is_rosetta) {
110 		return abstime * 125 / 3;
111 	} else {
112 		return abs_to_nanos(abstime);
113 	}
114 }
115 
116 static int
117 processIsTranslated(void)
118 {
119 	int ret = 0;
120 	size_t size = sizeof(ret);
121 	if (sysctlbyname("sysctl.proc_translated", &ret, &size, NULL, 0) == -1) {
122 		if (errno == ENOENT) {
123 			return 0;
124 		} else {
125 			return -1;
126 		}
127 	}
128 	return ret;
129 }
130 
131 static void
132 fill_thread_stats(uint32_t i)
133 {
134 	struct threadentry *entry = &gThreadList[i];
135 
136 	int rv = thread_selfcounts(THSC_TIME_CPI, &entry->self_stats, sizeof(entry->self_stats));
137 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "thread_selfcounts(THSC_TIME_CPI)");
138 }
139 
140 T_DECL(setitimer,
141     "Test various setitimer delivered signals to CPU-burning threads")
142 {
143 	int rv;
144 	kern_return_t kr;
145 	uint32_t ncpu;
146 	size_t ncpu_size = sizeof(ncpu);
147 
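	/*
	 * Run the main thread at a fixed priority above the workers (priority 37
	 * below) so that, as noted in the SIGALRM check at the end, it is rarely
	 * delayed long enough for SIGALRM to be delivered elsewhere.
	 */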
148 	struct sched_param self_param = {.sched_priority = 47};
149 
150 	rv = pthread_setschedparam(pthread_self(), SCHED_FIFO, &self_param);
151 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_setschedparam");
152 
153 	kr = mach_timebase_info(&timebase_info);
154 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info");
155 
156 	is_rosetta = processIsTranslated();
157 
158 	rv = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
159 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)");
160 
161 	if (ncpu < 2) {
162 		T_SKIP("%d CPUs not supported for test, returning success", ncpu);
163 	}
164 
165 	nworkers = MIN(max_threads - 1, ncpu);
166 	nthreads = nworkers + 1;
167 
168 	T_LOG("rosetta = %d\n", is_rosetta);
169 	T_LOG("hw.ncpu = %d\n", ncpu);
170 	T_LOG("nworkers = %d\n", nworkers);
171 	T_LOG("nthreads = %d\n", nthreads);
172 
173 	kr = semaphore_create(mach_task_self(), &gMainWaitForWorkers, SYNC_POLICY_FIFO, 0);
174 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create()");
175 
176 	kr = semaphore_create(mach_task_self(), &gWorkersStart, SYNC_POLICY_FIFO, 0);
177 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create()");
178 
179 	pthread_attr_t attr;
180 
181 	rv = pthread_attr_init(&attr);
182 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_init");
183 
184 	struct sched_param child_param = {.sched_priority = 37};
185 
186 	rv = pthread_attr_setschedparam(&attr, &child_param);
187 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setschedparam");
188 
189 	for (uint32_t i = 0; i < nthreads; i++) {
190 		if (i == 0) {
191 			gThreadList[i].thread = pthread_self();
192 		} else {
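			/*
			 * Odd-numbered workers run stat_thread (purely syscall-bound);
			 * even-numbered workers run statfs_thread, which also spins in
			 * userspace, so only those are expected to accrue heavy user time.
			 */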
193 			rv = pthread_create(&gThreadList[i].thread, &attr,
194 			    i % 2 ? stat_thread : statfs_thread,
195 			    (void *)(uintptr_t)i);
196 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
197 			gThreadList[i].expect_cpu_usage = i % 2 == 0 ? true : false;
198 		}
199 
200 		rv = pthread_threadid_np(gThreadList[i].thread, &gThreadList[i].tid);
201 		T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_threadid_np");
202 
203 		gThreadList[i].stack_addr = pthread_get_stackaddr_np(gThreadList[i].thread);
204 		gThreadList[i].stack_size = pthread_get_stacksize_np(gThreadList[i].thread);
205 	}
206 
207 	rv = pthread_attr_destroy(&attr);
208 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_destroy");
209 
210 	for (uint32_t i = 1; i < nthreads; i++) {
211 		kr = semaphore_wait(gMainWaitForWorkers);
212 		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait()");
213 	}
214 
215 	for (uint32_t i = 0; i < nthreads; i++) {
216 		T_LOG("Thread %p (0x%llx) checked in, stack %p/%p\n",
217 		    (void*)gThreadList[i].thread,
218 		    gThreadList[i].tid,
219 		    gThreadList[i].stack_addr,
220 		    (void *)gThreadList[i].stack_size);
221 	}
222 
223 	sigset_t sigmk;
224 	sigemptyset(&sigmk);
225 
226 	struct sigaction sigact = {
227 		.sa_sigaction = alrm_handler,
228 		.sa_mask = sigmk,
229 		.sa_flags = SA_SIGINFO,
230 	};
231 
232 	rv = sigaction(SIGALRM, &sigact, NULL);
233 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGALRM)");
234 
235 	rv = sigaction(SIGVTALRM, &sigact, NULL);
236 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGVTALRM)");
237 
238 	rv = sigaction(SIGPROF, &sigact, NULL);
239 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGPROF)");
240 
241 	rv = sigaction(SIGXCPU, &sigact, NULL);
242 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGXCPU)");
243 
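	/*
	 * One interval-timer setting is reused for all three timers below:
	 * ITIMER_REAL counts wall-clock time and delivers SIGALRM,
	 * ITIMER_VIRTUAL counts user time and delivers SIGVTALRM, and
	 * ITIMER_PROF counts user+system time and delivers SIGPROF.
	 * it_value arms the first expiration (10us, i.e. nearly immediately)
	 * and it_interval re-arms the timer every 10ms thereafter.
	 */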
244 	struct itimerval itime = {
245 		.it_interval.tv_sec = 0,
246 		.it_interval.tv_usec = 10000,
247 		.it_value.tv_sec = 0,
248 		.it_value.tv_usec = 10,  /* immediately */
249 	};
250 
251 	rv = setitimer(ITIMER_REAL, &itime, NULL);
252 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)");
253 
254 	rv = setitimer(ITIMER_VIRTUAL, &itime, NULL);
255 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_VIRTUAL)");
256 
257 	rv = setitimer(ITIMER_PROF, &itime, NULL);
258 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)");
259 
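	/*
	 * Lower the RLIMIT_CPU soft limit to 1 second so the kernel sends
	 * SIGXCPU once the process has consumed that much CPU time.
	 */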
260 	struct rlimit rlim = {};
261 
262 	rv = getrlimit(RLIMIT_CPU, &rlim);
263 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrlimit(RLIMIT_CPU)");
264 
265 	rlim.rlim_cur = 1;
266 	rv = setrlimit(RLIMIT_CPU, &rlim);
267 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setrlimit(RLIMIT_CPU)");
268 
269 	rv = pthread_mutex_lock(&gShouldExitMutex);
270 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
271 
272 	kr = semaphore_signal_all(gWorkersStart);
273 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all()");
274 
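	/*
	 * mach_get_times() samples the mach absolute time and the wall-clock
	 * time together, so the condvar deadline (wall clock) and the duration
	 * measurement (abstime) share the same starting instant.
	 */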
275 	struct timespec timenow = {};
276 	uint64_t time_start;
277 
278 	kr = mach_get_times(&time_start, NULL, &timenow);
279 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_get_times()");
280 
281 	struct timespec timeout = {
282 		.tv_sec = timenow.tv_sec + 10,
283 		.tv_nsec = timenow.tv_nsec,
284 	};
285 
286 	uint64_t time_end = 0;
287 
288 	do {
289 		assert(os_atomic_load(&gShouldExit, relaxed) == false);
290 
291 		rv = pthread_cond_timedwait(&gShouldExitCondition, &gShouldExitMutex, &timeout);
292 		if (rv == ETIMEDOUT) {
293 			os_atomic_store(&gShouldExit, true, relaxed);
294 
295 			time_end = mach_absolute_time();
296 
297 			struct itimerval itime_stop = {
298 				.it_interval.tv_sec = 0,
299 				.it_interval.tv_usec = 0,
300 				.it_value.tv_sec = 0,
301 				.it_value.tv_usec = 0,  /* stop immediately */
302 			};
303 
304 			rv = setitimer(ITIMER_REAL, &itime_stop, NULL);
305 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)");
306 
307 			rv = setitimer(ITIMER_VIRTUAL, &itime_stop, NULL);
308 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_VIRTUAL)");
309 
310 			rv = setitimer(ITIMER_PROF, &itime_stop, NULL);
311 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)");
312 
313 			break;
314 		} else {
315 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_cond_timedwait(&gShouldExitCondition, ...)");
316 		}
317 	} while (true);
318 
319 	rv = pthread_mutex_unlock(&gShouldExitMutex);
320 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
321 
322 	for (uint32_t i = 1; i < nthreads; i++) {
323 		rv = pthread_join(gThreadList[i].thread, NULL);
324 		T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_join");
325 	}
326 
327 	uint64_t test_duration = time_end - time_start;
328 	uint64_t test_duration_ns = abs_to_nanos(test_duration);
329 
330 	double elapsed_secs = (double) test_duration_ns / (uint64_t)NSEC_PER_SEC;
331 
332 	T_LOG("test duration %3.3f seconds\n", elapsed_secs);
333 
334 	fill_thread_stats(0);
335 
336 	struct rusage_info_v6 ru = {};
337 	rv = proc_pid_rusage(getpid(), RUSAGE_INFO_V6, (rusage_info_t *)&ru);
338 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "proc_pid_rusage");
339 
340 	uint64_t total_user_time_ns = abs_to_nanos_host(ru.ri_user_time);
341 	double total_user_time_s = (double)total_user_time_ns / (uint64_t)NSEC_PER_SEC;
342 
343 	uint64_t total_system_time_ns = abs_to_nanos_host(ru.ri_system_time);
344 	double total_system_time_s = (double)total_system_time_ns / (uint64_t)NSEC_PER_SEC;
345 
346 	uint64_t total_time_ns = (total_user_time_ns + total_system_time_ns);
347 	double total_time_s = (double)total_time_ns / (uint64_t)NSEC_PER_SEC;
348 
349 	uint64_t total_runnable_time_ns = abs_to_nanos_host(ru.ri_runnable_time);
350 	double total_runnable_time_s = (double)total_runnable_time_ns / (uint64_t)NSEC_PER_SEC;
351 
352 	uint64_t total_pending_time_ns = total_runnable_time_ns - (total_time_ns);
353 	double total_pending_time_s = (double)total_pending_time_ns / (uint64_t)NSEC_PER_SEC;
354 
355 	uint64_t total_p_time_ns = abs_to_nanos_host(ru.ri_user_ptime + ru.ri_system_ptime);
356 	double total_p_time_s = (double)total_p_time_ns / (uint64_t)NSEC_PER_SEC;
357 
358 	T_LOG("total usage: time: %3.3f user: %3.3f kernel: %3.3f runnable: %3.3f pending: %3.3f pcore: %3.3f\n",
359 	    total_time_s, total_user_time_s, total_system_time_s,
360 	    total_runnable_time_s, total_pending_time_s,
361 	    total_p_time_s);
362 
363 	/*
364 	 * "Good" data looks like:
365 	 *
366 	 * total usage: time: 77.696 user: 16.570 kernel: 61.126 runnable: 79.951 pending: 2.255 pcore: 72.719
367 	 * Thread        ALRM VTALRM   PROF   XCPU      inst        cycle               user                  kernel          offcore  type
368 	 * 0x16f78f000      0    251    811      0  27680301973  28913501188   3706622958 (  38.14%)   6012631083 (  61.86%)    2.81%  statfs
369 	 * 0x16f81b000      0      2    889      0  27962710058  28780576123    439297291 (   4.53%)   9259942583 (  95.47%)    3.01%  stat
370 	 * 0x16f8a7000      0    251    836      0  27558331077  28889228535   3699010000 (  38.08%)   6016015083 (  61.92%)    2.85%  statfs
371 	 * 0x16f933000      0      0    939      0  28078084696  28880195679    443067500 (   4.56%)   9269807666 (  95.44%)    2.87%  stat
372 	 * 0x16f9bf000      0    283    874      0  27691851016  28969873070   3710916750 (  38.16%)   6012783541 (  61.84%)    2.76%  statfs
373 	 * 0x16fa4b000      0      2    908      1  27945063330  28769971396    438583000 (   4.53%)   9252694291 (  95.47%)    3.09%  stat
374 	 * 0x16fad7000      0    262    889      0  27328496429  28772748055   3689245375 (  38.03%)   6011061458 (  61.97%)    3.00%  statfs
375 	 * 0x16fb63000      0      0    914      0  27942195343  28757254100    439690166 (   4.53%)   9256659500 (  95.47%)    3.04%  stat
376 	 * 0x1fe2bb400   1001      0      3      0     72144372    102339334      3532125 (   9.35%)     34249208 (  90.65%)   99.62%  main
377 	 */
378 	uint32_t total_alrm = 0;
379 	uint32_t total_vtalrm = 0;
380 	uint32_t total_prof = 0;
381 	uint32_t total_xcpu = 0;
382 	uint32_t total_vtalrm_in_cpubound = 0;
383 
384 	uint32_t total_threads_not_finding_cpus = 0;
385 
386 	T_LOG("Thread         ALRM VTALRM   PROF   XCPU      "
387 	    "inst        cycle               user                  kernel          "
388 	    "offcore type\n");
389 
390 	for (uint32_t i = 0; i < nthreads; i++) {
391 		uint64_t user_time = abs_to_nanos_host(gThreadList[i].self_stats.ttci_user_time_mach);
392 		uint64_t system_time = abs_to_nanos_host(gThreadList[i].self_stats.ttci_system_time_mach);
393 
394 
395 		uint64_t total_time = user_time + system_time;
396 
397 		double percentage_user = (double)user_time / (double) total_time * 100;
398 		double percentage_system = (double)system_time / (double) total_time * 100;
399 		double percentage_not_running = (double)(test_duration_ns - total_time) / (double) test_duration_ns * 100;
400 
401 		char* thread_type_str = "";
402 		char* warning_str = "";
403 
404 		if (i == 0) {
405 			thread_type_str = "main ";
406 		} else {
407 			thread_type_str = i % 2 ? "stat   " : "statfs ";
408 
409 			if (percentage_not_running > offcore_time_percent_threshold) {
410 				total_threads_not_finding_cpus++;
411 				warning_str = "** too much offcore time **";
412 			}
413 		}
414 
415 		T_LOG("0x%010llx %6d %6d %6d %6d %12lld %12lld %12lld (%7.2f%%) %12lld (%7.2f%%) %7.2f%% %s%s\n",
416 		    gThreadList[i].tid,
417 		    gThreadList[i].alrm_count,
418 		    gThreadList[i].vtalrm_count,
419 		    gThreadList[i].prof_count,
420 		    gThreadList[i].xcpu_count,
421 		    gThreadList[i].self_stats.ttci_instructions,
422 		    gThreadList[i].self_stats.ttci_cycles,
423 		    user_time, percentage_user,
424 		    system_time, percentage_system,
425 		    percentage_not_running,
426 		    thread_type_str, warning_str);
427 
428 		total_alrm += gThreadList[i].alrm_count;
429 		total_vtalrm += gThreadList[i].vtalrm_count;
430 		total_prof += gThreadList[i].prof_count;
431 		total_xcpu += gThreadList[i].xcpu_count;
432 
433 		if (gThreadList[i].expect_cpu_usage) {
434 			total_vtalrm_in_cpubound += gThreadList[i].vtalrm_count;
435 		}
436 	}
437 
438 	/*
439 	 * We expect all SIGALRM to go to the main thread, because it is the
440 	 * first thread in the process with the signal unmasked, and we
441 	 * never expect the signal handler itself to take >10ms
442 	 *
443 	 * This can happen if the main thread is preempted for the entire 10ms duration, though.
444 	 * Being high priority, it shouldn't be delayed for more than 10ms too often.
445 	 * Allow up to 10% to deliver to other threads.
446 	 */
447 	if ((double)gThreadList[0].alrm_count * 100 / total_alrm < 90.0) {
448 		T_FAIL("SIGALRM delivered to non-main thread more than 10%% of the time (%d of %d)",
449 		    gThreadList[0].alrm_count,
450 		    total_alrm);
451 	}
452 
453 	/* We expect all worker threads to find CPUs of their own for most of the test */
454 	if (total_threads_not_finding_cpus != 0) {
455 		T_FAIL("%d worker threads spent more than %2.0f%% of time off-core",
456 		    total_threads_not_finding_cpus, offcore_time_percent_threshold);
457 	}
458 
459 	/*
460 	 * SIGVTALRM is delivered based on user time, and we expect the busy
461 	 * threads to have an advantage and account for 80% (non-scientific) of events,
462 	 * since the other threads will spend more time in kernel mode.
463 	 */
464 	if (total_vtalrm_in_cpubound * 100 / total_vtalrm < 80) {
465 		T_FAIL("SIGVTALRM delivered to threads without extra userspace spin (only %d of %d)",
466 		    total_vtalrm_in_cpubound, total_vtalrm);
467 	}
468 
469 	/*
470 	 * SIGPROF is delivered based on user+system time, and we expect it to be distributed
471 	 * among non-blocked threads (so not the main thread, which only handles SIGALRM).
472 	 */
473 	if (gThreadList[0].prof_count * 100 / total_prof > 1) {
474 		T_FAIL("SIGPROF delivered to main thread more than 1%% (%d of %d)",
475 		    gThreadList[0].prof_count,
476 		    total_prof);
477 	}
478 
479 	/*
480 	 * SIGXCPU should be delivered at least once, after the RLIMIT_CPU soft limit is exceeded.
481 	 */
482 	if (total_xcpu == 0) {
483 		T_FAIL("SIGXCPU delivered %d times (expected at least once)", total_xcpu);
484 	}
485 }
486 
487 static void *
488 stat_thread(void *arg)
489 {
490 	kern_return_t kr;
491 	int rv;
492 
493 	/* This wait can be aborted by one of the signals, so we make sure to wait for the first iteration of main */
494 	kr = semaphore_wait_signal(gWorkersStart, gMainWaitForWorkers);
495 	if (kr != KERN_ABORTED) {
496 		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal()");
497 	}
498 
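	/*
	 * Briefly take and release the mutex: the main thread holds it when it
	 * signals gWorkersStart and only drops it once inside
	 * pthread_cond_timedwait(), so this acts as a start barrier before the
	 * workload loop begins.
	 */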
499 	rv = pthread_mutex_lock(&gShouldExitMutex);
500 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
501 	rv = pthread_mutex_unlock(&gShouldExitMutex);
502 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
503 
504 	do {
505 		struct stat sb;
506 
507 		rv = stat("/", &sb);
508 		if (rv != 0) {
509 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "stat");
510 		}
511 	} while (os_atomic_load(&gShouldExit, relaxed) == false);
512 
513 	fill_thread_stats((uint32_t)(uintptr_t)arg);
514 
515 	return NULL;
516 }
517 
518 static void *
519 statfs_thread(void *arg)
520 {
521 	kern_return_t kr;
522 	uint64_t previous_spin_timestamp;
523 	int iteration = 0;
524 	int rv;
525 
526 	/* This wait can be aborted by one of the signals, so we make sure to wait for the first iteration of main */
527 	kr = semaphore_wait_signal(gWorkersStart, gMainWaitForWorkers);
528 	if (kr != KERN_ABORTED) {
529 		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal()");
530 	}
531 
532 	rv = pthread_mutex_lock(&gShouldExitMutex);
533 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
534 	rv = pthread_mutex_unlock(&gShouldExitMutex);
535 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
536 
537 	previous_spin_timestamp = mach_absolute_time();
538 
539 	do {
540 		struct statfs sf;
541 
542 		/*
543 		 * Every so many system calls, inject a spin in userspace
544 		 * proportional to how much time was spent performing the
545 		 * system calls.
546 		 */
547 #define SYSCALL_ITERATIONS_BETWEEN_SPINS (10000)
548 		if (++iteration % SYSCALL_ITERATIONS_BETWEEN_SPINS == 0) {
549 			uint64_t now = mach_absolute_time();
550 			uint64_t spin_deadline = now + (now - previous_spin_timestamp) / 2;
551 
552 			while (mach_absolute_time() < spin_deadline) {
553 				;
554 			}
555 
556 			previous_spin_timestamp = mach_absolute_time();
557 		}
558 
559 		rv = statfs("/", &sf);
560 		if (rv != 0) {
561 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "statfs");
562 		}
563 	} while (os_atomic_load(&gShouldExit, relaxed) == false);
564 
565 	fill_thread_stats((uint32_t)(uintptr_t)arg);
566 
567 	return NULL;
568 }
569 
570 static void
571 alrm_handler(int signum, struct __siginfo *info __unused, void *uap)
572 {
573 	ucontext_t *context = (ucontext_t *)uap;
574 	struct threadentry *entry = NULL;
575 	void *sp;
576 
577 #if defined(__arm64__)
578 	sp = (void *)__darwin_arm_thread_state64_get_sp((context->uc_mcontext)->__ss);
579 #elif defined(__i386__)
580 	sp = (void *)(context->uc_mcontext)->__ss.__esp;
581 #elif defined(__x86_64__)
582 	sp = (void *)(context->uc_mcontext)->__ss.__rsp;
583 #else
584 #error Unrecognized architecture
585 #endif
586 
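	/*
	 * pthread_get_stackaddr_np() returns the highest address of a thread's
	 * stack (stacks grow down), so the SP of the interrupted thread falls in
	 * [stack_addr - stack_size, stack_addr).
	 */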
587 	for (uint32_t i = 0; i < nworkers + 1; i++) {
588 		struct threadentry *t = &gThreadList[i];
589 		if (((uintptr_t)sp >= ((uintptr_t)t->stack_addr - t->stack_size) &&
590 		    ((uintptr_t)sp < (uintptr_t)t->stack_addr))) {
591 			entry = t;
592 			break;
593 		}
594 	}
595 
596 	if (entry == NULL) {
597 		T_ASSERT_FAIL("Signal %d delivered to unknown thread, SP=%p", signum, sp);
598 	}
599 
600 	switch (signum) {
601 	case SIGALRM:
602 		os_atomic_inc(&entry->alrm_count, relaxed);
603 		break;
604 	case SIGVTALRM:
605 		os_atomic_inc(&entry->vtalrm_count, relaxed);
606 		break;
607 	case SIGPROF:
608 		os_atomic_inc(&entry->prof_count, relaxed);
609 		break;
610 	case SIGXCPU:
611 		os_atomic_inc(&entry->xcpu_count, relaxed);
612 		break;
613 	}
614 }
615 
616 // When the SIGPROF signal was received.
617 static uint64_t sigprof_received_ns = 0;
618 
619 static const uint64_t ITIMER_PROF_SECS = 2;
620 #define PROF_EXTRA_THREAD_COUNT (1)
621 // Include the main thread as participating in consuming CPU.
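// With PROF_EXTRA_THREAD_COUNT spinners plus the main thread all on-CPU, process
// CPU time accrues (PROF_EXTRA_THREAD_COUNT + 1)x faster than wall time, so the
// ITIMER_PROF_SECS timer should expire after roughly
// ITIMER_PROF_SECS / (PROF_EXTRA_THREAD_COUNT + 1) seconds of wall-clock time.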
622 static const uint64_t EXPECTED_PROF_DURATION_SECS =
623     ITIMER_PROF_SECS / (PROF_EXTRA_THREAD_COUNT + 1);
624 static const uint64_t EXTRA_TIMEOUT_SECS = 1;
625 
626 struct cpu_spin_args {
627 	bool spin_while_true;
628 };
629 
630 static void *
631 cpu_thread_main(void *arg)
632 {
633 	bool *spin = arg;
634 	while (*spin) {
635 	}
636 	return NULL;
637 }
638 
639 static void
640 sigprof_received(int __unused sig)
641 {
642 	sigprof_received_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
643 }
644 
645 T_DECL(setitimer_prof,
646     "ensure a single-threaded process doesn't receive early signals",
647     T_META_TAG_VM_PREFERRED)
648 {
649 	T_SETUPBEGIN;
650 	(void)signal(SIGPROF, sigprof_received);
651 
652 	uint64_t start_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
653 	uint64_t expected_end_ns = start_ns + (ITIMER_PROF_SECS * NSEC_PER_SEC);
654 	uint64_t end_timeout_ns = expected_end_ns + (EXTRA_TIMEOUT_SECS * NSEC_PER_SEC);
655 
656 	struct itimerval prof_timer = {
657 		.it_value.tv_sec = ITIMER_PROF_SECS,
658 	};
659 
660 	int ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
661 	T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
662 	    ITIMER_PROF_SECS);
663 	T_SETUPEND;
664 
665 	uint64_t last_ns = 0;
666 	while ((last_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC)) < end_timeout_ns) {
667 		if (sigprof_received_ns) {
668 			break;
669 		}
670 	}
671 
672 	T_EXPECT_LE(last_ns, end_timeout_ns, "received SIGPROF within the timeout (%.9gs < %.9gs)",
673 	    (double)(last_ns - start_ns) / 1e9, (double)(end_timeout_ns - start_ns) / 1e9);
674 	T_LOG("SIGPROF was delivered %+.6gms after expected duration",
675 	    (double)(last_ns - expected_end_ns) / 1e6);
676 	T_EXPECT_GE(last_ns, expected_end_ns, "received SIGPROF after enough time (%.9gs > %.9gs)",
677 	    (double)(last_ns - start_ns) / 1e9, (double)(expected_end_ns - start_ns) / 1e9);
678 }
679 
680 T_DECL(setitimer_prof_multi_threaded,
681     "ensure a multi-threaded process doesn't receive early signals",
682     T_META_TAG_VM_PREFERRED)
683 {
684 	T_SETUPBEGIN;
685 	(void)signal(SIGPROF, sigprof_received);
686 
687 	pthread_t cpu_threads[PROF_EXTRA_THREAD_COUNT] = { 0 };
688 	bool spin_while_true = true;
689 	for (unsigned int i = 0; i < PROF_EXTRA_THREAD_COUNT; i++) {
690 		int error = pthread_create(&cpu_threads[i], NULL, cpu_thread_main, &spin_while_true);
691 		T_QUIET; T_ASSERT_POSIX_ZERO(error, "create thread %d", i);
692 	}
693 	T_LOG("spinning %d threads on CPU", PROF_EXTRA_THREAD_COUNT + 1);
694 
695 	uint64_t start_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
696 	uint64_t expected_end_ns = start_ns + (EXPECTED_PROF_DURATION_SECS * NSEC_PER_SEC);
697 	uint64_t end_timeout_ns = expected_end_ns + (EXTRA_TIMEOUT_SECS * NSEC_PER_SEC);
698 
699 	struct itimerval prof_timer = {
700 		.it_value.tv_sec = ITIMER_PROF_SECS,
701 	};
702 
703 	int ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
704 	T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
705 	    ITIMER_PROF_SECS);
706 	T_SETUPEND;
707 
708 	uint64_t last_ns = 0;
709 	while ((last_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC)) < end_timeout_ns) {
710 		if (sigprof_received_ns) {
711 			break;
712 		}
713 	}
714 
715 	spin_while_true = false;
716 	T_EXPECT_LE(last_ns, end_timeout_ns, "received SIGPROF within the timeout (%.9gs < %.9gs)",
717 	    (double)(last_ns - start_ns) / 1e9, (double)(end_timeout_ns - start_ns) / 1e9);
718 	T_LOG("SIGPROF was delivered %+.6gms after expected duration",
719 	    (double)(last_ns - expected_end_ns) / 1e6);
720 	T_EXPECT_GE(last_ns, expected_end_ns, "received SIGPROF after enough time (%.9gs > %.9gs)",
721 	    (double)(last_ns - start_ns) / 1e9, (double)(expected_end_ns - start_ns) / 1e9);
722 }
723 
724 static uint64_t sigprof_received_usage_mach = 0;
725 extern uint64_t __thread_selfusage(void);
726 static int const ITERATION_COUNT = 50;
727 static uint64_t const ITIMER_PERF_PROF_US = 50000;
728 
729 static void
730 sigprof_received_usage(int __unused sig)
731 {
732 	sigprof_received_usage_mach = __thread_selfusage();
733 }
734 
735 T_DECL(setitimer_prof_latency,
736     "measure the latency of delivering SIGPROF",
737     T_META_TAG_PERF, T_META_TAG_VM_NOT_ELIGIBLE,
738     T_META_TIMEOUT(10))
739 {
740 	T_SETUPBEGIN;
741 	(void)signal(SIGPROF, sigprof_received_usage);
742 
743 	struct itimerval prof_timer = {
744 		.it_value.tv_usec = ITIMER_PERF_PROF_US,
745 	};
746 	mach_timebase_info_data_t timebase = { 0 };
747 	int ret = mach_timebase_info(&timebase);
748 	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "mach_timebase_info");
749 
750 	char pd_path[PATH_MAX] = "";
751 	pdwriter_t pd = pdwriter_open_tmp("xnu", "setitimer_sigprof_latency", 1, 0, pd_path, sizeof(pd_path));
752 	T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(pd, "opened perfdata file");
753 	T_SETUPEND;
754 
755 	T_LOG("running %d iterations, %lluus each", ITERATION_COUNT, ITIMER_PERF_PROF_US);
756 	for (int i = 0; i < ITERATION_COUNT; i++) {
757 		sigprof_received_usage_mach = 0;
758 
759 		uint64_t const start_usage_mach = __thread_selfusage();
760 
761 		ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
762 		T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %lluus)",
763 		    ITIMER_PERF_PROF_US);
764 
765 		while (sigprof_received_usage_mach == 0) {
766 			// XXX ITIMER_PROF only re-evaluates the timers at user/kernel boundaries.
767 			// Issue a trivial syscall so the kernel re-checks the timer; getppid() is
768 			// used because getpid() has a userspace fast path and may not enter the kernel.
769 			(void)getppid();
770 		}
771 
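		// The timer was armed for ITIMER_PERF_PROF_US of user+system time, so the
		// expected firing point (in thread-usage mach units) is the starting usage
		// plus that much time; the latency is how far past that point the handler
		// observed __thread_selfusage().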
772 		uint64_t const end_usage_mach = start_usage_mach +
773 		    (ITIMER_PERF_PROF_US * NSEC_PER_USEC) * timebase.denom / timebase.numer;
774 		uint64_t const latency_mach = sigprof_received_usage_mach - end_usage_mach;
775 		uint64_t const latency_ns = latency_mach * timebase.numer / timebase.denom;
776 
777 		pdwriter_new_value(pd, "sigprof_latency", pdunit_ns, latency_ns);
778 	}
779 
780 	pdwriter_close(pd);
781 	T_LOG("wrote perfdata to %s", pd_path);
782 }
783