1 #include <stdio.h>
2 #include <unistd.h>
3 #include <stdlib.h>
4 #include <errno.h>
5 #include <string.h>
6 #include <assert.h>
7 #include <signal.h>
8 #include <spawn.h>
9 #include <spawn_private.h>
10 #include <stdint.h>
11 #include <sys/sysctl.h>
12 #include <sys/spawn_internal.h>
13 #include <sys/kern_memorystatus.h>
14 #include <mach-o/dyld.h>
15
16 #include <darwintest.h>
17 #include <darwintest_utils.h>
18
19 #include "test_utils.h"
20
21 T_GLOBAL_META(
22 T_META_NAMESPACE("xnu.vm"),
23 T_META_RADAR_COMPONENT_NAME("xnu"),
24 T_META_RADAR_COMPONENT_VERSION("VM"),
25 T_META_CHECK_LEAKS(false)
26 );
27
28 extern char **environ;
29
30 /*
31 * This test file contains two sub-tests which attempt to verify
32 * the allowing or not allowing of a corpse for crashreporter when
33 * a task exceeds its memory allocation limit. vm_map_fork() is the
34 * kernel routine used to generate a corpse task.
35 *
36 * A corpse is allowed to be taken if a task's memory resource limit that
37 * is exceeded is less than 1/4 of the system wide task limit.
 * If the amount exceeds 1/4 of the system-wide limit, then the corpse is disallowed.
39 *
40 * If the device under test is already under pressure, the test
41 * could fail due to jetsam cutting in and killing the parent, child or
42 * other necessary testing processes.
43 */
44
45 /* Test variants */
46 #define TEST_ALLOWED 0x1
47 #define TEST_NOT_ALLOWED 0x2
48
49 /*
50 * Values which the kernel OR's into the PID when a corpse
51 * is either allowed or disallowed for the
52 * kern.memorystatus_vm_map_fork_pidwatch sysctl.
53 */
54 #define MEMORYSTATUS_VM_MAP_FORK_ALLOWED 0x100000000ul
55 #define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000ul
56
57 /*
58 * The memory allocation happens in a child process, this
59 * is stuff to deal with creating and managing the child.
60 * The child will only execute the T_HELPER_DECL.
61 */
62 static char testpath[PATH_MAX];
63 static uint32_t testpath_size = sizeof(testpath);
64 #define LIMIT_DELTA_MB 5 /* an arbitrary limit delta */
65 #define MEGABYTE (1024 * 1024)
66
67 /*
68 * The child process communicates back to parent via an exit() code.
69 */
enum child_exits {
	NORMAL_EXIT = 0,        /* child allocated everything it was asked to */
	NO_MEMSIZE_ARG,         /* helper was not given exactly one argument */
	INVALID_MEMSIZE,        /* argument did not parse as a positive number */
	MALLOC_FAILED,          /* an allocation returned NULL */
	NUM_CHILD_EXIT          /* number of codes above; not itself an exit status */
};

/*
 * Human-readable reason for each exit code, indexed by enum child_exits.
 *
 * Designated initializers keep every string attached to its enumerator even
 * if the enum is reordered, and sizing the array with NUM_CHILD_EXIT makes
 * every index below NUM_CHILD_EXIT a valid element. The strings are literals,
 * so both the pointers and the characters are const.
 */
static const char *const child_exit_why[NUM_CHILD_EXIT] = {
	[NORMAL_EXIT]     = "normal exit",
	[NO_MEMSIZE_ARG]  = "no memsize argument to child",
	[INVALID_MEMSIZE] = "invalid memsize argument to child",
	[MALLOC_FAILED]   = "malloc() failed",
};
83
84 /*
85 * Set/Get the sysctl used to determine if corpse collection occurs.
86 * This is done by the kernel checking for a specific PID.
87 */
88 static void
set_memorystatus_vm_map_fork_pidwatch(pid_t pid)89 set_memorystatus_vm_map_fork_pidwatch(pid_t pid)
90 {
91 uint64_t new_value = (uint64_t)pid;
92 size_t new_len = sizeof(new_value);
93 int err;
94
95 err = sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch", NULL, NULL, &new_value, new_len);
96 T_QUIET;
97 T_ASSERT_POSIX_SUCCESS(err, "set sysctlbyname(kern.memorystatus_vm_map_fork_pidwatch...) failed");
98 return;
99 }
100
101 static uint64_t
get_memorystatus_vm_map_fork_pidwatch()102 get_memorystatus_vm_map_fork_pidwatch()
103 {
104 uint64_t value = 0;
105 size_t val_len = sizeof(value);
106 int err;
107
108 err = sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch", &value, &val_len, NULL, 0);
109 T_QUIET;
110 T_ASSERT_POSIX_SUCCESS(err, "get sysctlbyname(kern.memorystatus_vm_map_fork_pidwatch...) failed");
111
112 return value;
113 }
114
115 /*
116 * We want to avoid jetsam giving us bad results, if possible. So check if there's
117 * enough memory for the test to run, waiting briefly for some to free up.
118 */
119 static void
wait_for_free_mem(int need_mb)120 wait_for_free_mem(int need_mb)
121 {
122 int64_t memsize;
123 int memorystatus_level;
124 size_t size;
125 int64_t avail;
126 int err;
127 int try;
128
129 /*
130 * get amount of memory in the machine
131 */
132 size = sizeof(memsize);
133 err = sysctlbyname("hw.memsize", &memsize, &size, NULL, 0);
134 T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname(hw.memsize...) failed");
135
136 /*
137 * Use a loop to briefly sleep and recheck if short on memory.
138 */
139 try = 1;
140 for (;;) {
141 /*
142 * memorystatus_level is a percentage of memory available. For example 20 means 1/5 of memory.
143 * It currently doesn't exist on macOS but neither does jetsam, so pass the test there.
144 */
145 size = sizeof(memorystatus_level);
146 if (sysctlbyname("kern.memorystatus_level", &memorystatus_level, &size, NULL, 0) != 0) {
147 return;
148 }
149 T_QUIET; T_ASSERT_LE(memorystatus_level, 100, "memorystatus_level too high");
150 T_QUIET; T_ASSERT_GT(memorystatus_level, 0, "memorystatus_level negative");
151
152 /*
153 * jetsam kicks in at memory status level of 15%, so subtract that much out of what's available.
154 */
155 avail = MAX(0, (memsize * (memorystatus_level - 15)) / 100);
156
157 /*
158 * We're good to go if there's more than enough available.
159 */
160 if ((int64_t)need_mb * MEGABYTE < avail) {
161 return;
162 }
163
164 /*
165 * issue a message to log and sleep briefly to see if we can get more memory
166 */
167 if (try-- == 0) {
168 break;
169 }
170 T_LOG("Need %d MB, only %d MB available. sleeping 5 seconds for more to free. memorystatus_level %d",
171 need_mb, (int)(avail / MEGABYTE), memorystatus_level);
172 sleep(5);
173 }
174 T_SKIP("Needed %d MB, but only %d MB available. Skipping test to avoid jetsam issues.",
175 need_mb, (int)(avail / MEGABYTE));
176 }
177
178
179 /*
180 * The main test calls this to spawn child process which will run and
181 * exceed some memory limit. The child is initially suspended so that
182 * we can do the sysctl calls before it runs.
183 * Since this is a libdarwintest, the "-n" names the T_HELPER_DECL() that
184 * we want to run. The arguments specific to the test follow a "--".
185 */
186 static pid_t
spawn_child_process(char * const executable,char * const memlimit,short flags,int priority,int active_limit_mb,int inactive_limit_mb)187 spawn_child_process(
188 char * const executable,
189 char * const memlimit,
190 short flags,
191 int priority,
192 int active_limit_mb,
193 int inactive_limit_mb)
194 {
195 posix_spawnattr_t spawn_attrs;
196 int err;
197 pid_t child_pid;
198 char * const argv_child[] = { executable, "-n", "child_process", "--", memlimit, NULL };
199
200 err = posix_spawnattr_init(&spawn_attrs);
201 T_QUIET; T_ASSERT_POSIX_SUCCESS(err, " posix_spawnattr_init() failed");
202
203 err = posix_spawnattr_setflags(&spawn_attrs, POSIX_SPAWN_START_SUSPENDED);
204 T_QUIET; T_ASSERT_POSIX_SUCCESS(err, " posix_spawnattr_setflags() failed");
205
206 err = posix_spawnattr_setjetsam_ext(&spawn_attrs, flags, priority, active_limit_mb, inactive_limit_mb);
207 T_QUIET; T_ASSERT_POSIX_SUCCESS(err, " posix_spawnattr_setjetsam_ext() failed");
208
209 err = posix_spawn(&child_pid, executable, NULL, &spawn_attrs, argv_child, environ);
210 T_QUIET; T_ASSERT_POSIX_SUCCESS(err, " posix_spawn() failed");
211
212 return child_pid;
213 }
214
215
216 /*
217 * The parent calls this to continue the suspended child, then wait for its result.
 * We collect its resource usage to verify the expected amount allocated.
219 */
220 static void
test_child_process(pid_t child_pid,int * status,struct rusage * ru)221 test_child_process(pid_t child_pid, int *status, struct rusage *ru)
222 {
223 int err = 0;
224 pid_t got_pid;
225
226 T_LOG(" continuing child[%d]\n", child_pid);
227
228 err = kill(child_pid, SIGCONT);
229 T_QUIET; T_ASSERT_POSIX_SUCCESS(err, " kill(%d, SIGCONT) failed", child_pid);
230
231 T_LOG(" waiting for child[%d] to exit", child_pid);
232
233 got_pid = wait4(child_pid, status, 0, ru);
234 T_QUIET; T_ASSERT_EQ(child_pid, got_pid, " wait4(%d, ...) returned %d", child_pid, got_pid);
235 }
236
237 /*
 * The child process executes this code. Given the darwintest infrastructure,
 * the easiest way for it to return information to the parent is via its exit status.
240 */
241 T_HELPER_DECL(child_process, "child allocates memory to failure")
242 {
243 #define BYTESPERALLOC MEGABYTE
244 #define BYTESINEXCESS (2 * MEGABYTE) /* 2 MB - arbitrary */
245 char *limit;
246 long limit_mb = 0;
247 long max_bytes_to_munch, bytes_remaining, bytes_this_munch;
248 void *mem = NULL;
249
250 /*
251 * This helper is run in a child process. The helper sees one argument
252 * as a string which is the amount of memory in megabytes to allocate.
253 */
254 if (argc != 1) {
255 exit(NO_MEMSIZE_ARG);
256 }
257
258 limit = argv[0];
259 errno = 0;
260 limit_mb = strtol(limit, NULL, 10);
261 if (errno != 0 || limit_mb <= 0) {
262 exit(INVALID_MEMSIZE);
263 }
264
265 /* Compute in excess of assigned limit */
266 max_bytes_to_munch = limit_mb * MEGABYTE;
267 max_bytes_to_munch += BYTESINEXCESS;
268
269 for (bytes_remaining = max_bytes_to_munch; bytes_remaining > 0; bytes_remaining -= bytes_this_munch) {
270 bytes_this_munch = MIN(bytes_remaining, BYTESPERALLOC);
271
272 mem = malloc((size_t)bytes_this_munch);
273 if (mem == NULL) {
274 exit(MALLOC_FAILED);
275 }
276 arc4random_buf(mem, (size_t)bytes_this_munch);
277 }
278
279 /* We chewed up all the memory we were asked to. */
280 exit(NORMAL_EXIT);
281 }
282
283
284 /*
285 * Actual test body.
286 */
287 static void
memorystatus_vm_map_fork_parent(int test_variant)288 memorystatus_vm_map_fork_parent(int test_variant)
289 {
290 int max_task_pmem = 0; /* MB */
291 size_t size = 0;
292 int active_limit_mb = 0;
293 int inactive_limit_mb = 0;
294 short flags = 0;
295 char memlimit_str[16];
296 pid_t child_pid;
297 int child_status;
298 uint64_t kernel_pidwatch_val;
299 uint64_t expected_pidwatch_val;
300 int ret;
301 struct rusage ru;
302 enum child_exits exit_val;
303
304 /*
305 * The code to set/get the pidwatch sysctl is only in
306 * development kernels. Skip the test if not on one.
307 */
308 if (!is_development_kernel()) {
309 T_SKIP("Can't test on release kernel");
310 }
311
312 /*
313 * Determine a memory limit based on system having one or not.
314 */
315 size = sizeof(max_task_pmem);
316 (void)sysctlbyname("kern.max_task_pmem", &max_task_pmem, &size, NULL, 0);
317 if (max_task_pmem <= 0) {
318 max_task_pmem = 0;
319 }
320
321 /* default limit is 1/4 of max task phys memory value */
322 active_limit_mb = max_task_pmem / 4;
323
324 #if TARGET_OS_WATCH
325
326 /*
327 * Larger memory watches have a raised corpse size limit.
328 * One coprse of 300Meg is allowed, others are 200M.
329 * We pick 300 or 200 based on which test is being done.
330 */
331 uint64_t hw_memsize = 0;
332 size = sizeof(hw_memsize);
333 T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.memsize", &hw_memsize, &size, NULL, 0), "read hw.memsize");
334 if (hw_memsize > 1024 * 1024 * 1024) {
335 if (test_variant == TEST_ALLOWED) {
336 active_limit_mb = MAX(active_limit_mb, 200);
337 } else {
338 active_limit_mb = MAX(active_limit_mb, 300);
339 }
340 }
341
342 #endif /* TARGET_OS_WATCH */
343
344 if (test_variant == TEST_ALLOWED) {
345 /*
346 * Tell the child to allocate less than 1/4 the system wide limit.
347 */
348 if (active_limit_mb <= LIMIT_DELTA_MB) {
349 active_limit_mb = LIMIT_DELTA_MB;
350 } else {
351 active_limit_mb -= LIMIT_DELTA_MB;
352 }
353 expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
354 } else { /* TEST_NOT_ALLOWED */
355 /*
356 * Tell the child to allocate more than 1/4 the system wide limit.
357 */
358 active_limit_mb += LIMIT_DELTA_MB;
359 if (max_task_pmem == 0) {
360 expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
361 } else {
362 expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED;
363 }
364 }
365 inactive_limit_mb = active_limit_mb;
366 T_LOG("using limit of %d Meg", active_limit_mb);
367
368 /*
369 * When run as part of a larger suite, a previous test
370 * may have left the system temporarily with too little
371 * memory to run this test. We try to detect if there is
372 * enough free memory to proceed, waiting a little bit
373 * for memory to free up.
374 */
375 wait_for_free_mem(active_limit_mb);
376
377 #if TARGET_OS_OSX
378 /*
379 * vm_map_fork() is always allowed on desktop.
380 */
381 expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
382 #endif
383
384 /*
385 * Prepare the arguments needed to spawn the child process.
386 */
387 memset(memlimit_str, 0, sizeof(memlimit_str));
388 (void)sprintf(memlimit_str, "%d", active_limit_mb);
389
390 ret = _NSGetExecutablePath(testpath, &testpath_size);
391 T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "_NSGetExecutablePath(%s, ...)", testpath);
392
393 /*
394 * We put the child process in FOREGROUND to try and keep jetsam's hands off it.
395 */
396 child_pid = spawn_child_process(testpath, memlimit_str, flags,
397 JETSAM_PRIORITY_FOREGROUND, active_limit_mb, inactive_limit_mb);
398
399 expected_pidwatch_val |= (uint64_t)child_pid;
400
401 /*
402 * We only reach here if parent successfully spawned child process.
403 */
404 T_LOG(" spawned child_pid[%d] with memlimit %s (%d)MB\n",
405 child_pid, memlimit_str, active_limit_mb);
406
407 /*
408 * Set the kernel's pidwatch to look for the child.
409 */
410 (void)set_memorystatus_vm_map_fork_pidwatch((pid_t)0);
411 (void)set_memorystatus_vm_map_fork_pidwatch(child_pid);
412
413 /*
414 * Let the child run and wait for it to finish.
415 */
416 test_child_process(child_pid, &child_status, &ru);
417 T_LOG("Child exited with max_rss of %ld", ru.ru_maxrss);
418
419 /*
420 * Retrieve the kernel's pidwatch value. This should now indicate
421 * if the corpse was allowed or not.
422 */
423 kernel_pidwatch_val = get_memorystatus_vm_map_fork_pidwatch();
424 (void)set_memorystatus_vm_map_fork_pidwatch((pid_t)0);
425
426 /*
427 * If the child died abnormally, the test is invalid.
428 */
429 if (!WIFEXITED(child_status)) {
430 if (WIFSIGNALED(child_status)) {
431 /* jetsam kills a process with SIGKILL */
432 if (WTERMSIG(child_status) == SIGKILL) {
433 T_LOG("Child appears to have been a jetsam victim");
434 }
435 T_SKIP("Child terminated by signal %d test result invalid", WTERMSIG(child_status));
436 }
437 T_SKIP("child did not exit normally (status=%d) test result invalid", child_status);
438 }
439
440 /*
441 * We don't expect the child to exit for any other reason than success
442 */
443 exit_val = (enum child_exits)WEXITSTATUS(child_status);
444 T_QUIET; T_ASSERT_EQ(exit_val, NORMAL_EXIT, "child exit due to: %s",
445 (0 < exit_val && exit_val < NUM_CHILD_EXIT) ? child_exit_why[exit_val] : "unknown");
446
447 /*
448 * If the kernel aborted generating a corpse for other reasons, the test is invalid.
449 */
450 if (kernel_pidwatch_val == -1ull) {
451 T_SKIP("corpse generation was aborted by kernel");
452 }
453
454 /*
455 * We should always have made it through the vm_map_fork() checks in the kernel for this test.
456 */
457 T_QUIET; T_ASSERT_NE_ULLONG(kernel_pidwatch_val, (uint64_t)child_pid, "child didn't trigger corpse generation");
458
459 T_EXPECT_EQ(kernel_pidwatch_val, expected_pidwatch_val, "kernel value 0x%llx - expected 0x%llx",
460 kernel_pidwatch_val, expected_pidwatch_val);
461 }
462
463 /*
 * The order of these 2 test functions is important. They will be executed by the test framework in order.
 *
 * We test "not allowed" first, then "allowed". If it were the other way around, the corpse from the "allowed"
 * test would likely cause memory pressure and jetsam would likely kill the "not allowed" test.
468 */
469 T_DECL(memorystatus_vm_map_fork_test_not_allowed,
470 "test that corpse generation was not allowed",
471 T_META_ASROOT(true),
472 T_META_TAG_VM_PREFERRED,
473 T_META_ENABLED(false /* rdar://133953771 */))
474 {
475 memorystatus_vm_map_fork_parent(TEST_NOT_ALLOWED);
476 }
477
478 T_DECL(memorystatus_vm_map_fork_test_allowed,
479 "test corpse generation allowed",
480 T_META_ASROOT(true),
481 T_META_TAG_VM_PREFERRED,
482 T_META_ENABLED(false /* rdar://133953771 */))
483 {
484 memorystatus_vm_map_fork_parent(TEST_ALLOWED);
485 }
486