xref: /linux-6.15/lib/stackdepot.c (revision a6cd9570)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stack depot - a stack trace storage that avoids duplication.
4  *
5  * Internally, stack depot maintains a hash table of unique stacktraces. The
6  * stack traces themselves are stored contiguously one after another in a set
7  * of separate page allocations.
8  *
9  * Author: Alexander Potapenko <[email protected]>
10  * Copyright (C) 2016 Google, Inc.
11  *
12  * Based on the code by Dmitry Chernenkov.
13  */
14 
15 #define pr_fmt(fmt) "stackdepot: " fmt
16 
17 #include <linux/gfp.h>
18 #include <linux/jhash.h>
19 #include <linux/kernel.h>
20 #include <linux/kmsan.h>
21 #include <linux/mm.h>
22 #include <linux/mutex.h>
23 #include <linux/percpu.h>
24 #include <linux/printk.h>
25 #include <linux/slab.h>
26 #include <linux/spinlock.h>
27 #include <linux/stacktrace.h>
28 #include <linux/stackdepot.h>
29 #include <linux/string.h>
30 #include <linux/types.h>
31 #include <linux/memblock.h>
32 #include <linux/kasan-enabled.h>
33 
/* Total number of bits in a stack depot handle. */
#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
/* Size of a single pool in bytes: 2^DEPOT_POOL_ORDER pages. */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
/* log2 of the stack record alignment; handle offsets are stored shifted by it. */
#define DEPOT_STACK_ALIGN 4
/* Handle bits needed to encode any aligned offset within a pool. */
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
/* Handle bits left for the pool index after the offset and extra bits. */
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
			       STACK_DEPOT_EXTRA_BITS)
/* Hard cap on the number of pools, whatever the index bits could address. */
#define DEPOT_POOLS_CAP 8192
/* Maximum number of pools: the smaller of 2^DEPOT_POOL_INDEX_BITS and the cap. */
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
46 
/*
 * Compact structure that stores a reference to a stack. The same storage is
 * viewed either as the opaque handle value or as its packed bit-fields.
 */
union handle_parts {
	depot_stack_handle_t handle;	/* Whole handle value */
	struct {
		/* Index of the pool in stack_pools */
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		/* Byte offset of the record within the pool >> DEPOT_STACK_ALIGN */
		u32 offset	: DEPOT_OFFSET_BITS;
		/* Bits set and read via the stack_depot_*_extra_bits() helpers */
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};
56 
/*
 * A single stored stack trace. Records are carved out of pools at a fixed
 * stride of DEPOT_STACK_RECORD_SIZE bytes and are linked either into the
 * freelist (next_stack) or into a hash table bucket via @next.
 */
struct stack_record {
	struct stack_record *next;	/* Link in hash table or freelist */
	u32 hash;			/* Hash in hash table */
	u32 size;			/* Number of stored frames */
	union handle_parts handle;	/* Filled in by depot_init_pool() */
	unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
};

/* Size of a stack record rounded up to the 2^DEPOT_STACK_ALIGN alignment. */
#define DEPOT_STACK_RECORD_SIZE \
	ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
67 
/*
 * True when stack depot is turned off: set from the "stack_depot_disable"
 * boot parameter or when the hash table allocation fails.
 */
static bool stack_depot_disabled;
/* Whether stack_depot_early_init() should allocate the hash table. */
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
/* Set once stack_depot_early_init() has been entered. */
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c
79 
/*
 * Hash table of pointers to stored stack traces. Each bucket heads a chain of
 * records linked through stack_record.next. Allocated by
 * stack_depot_early_init() or stack_depot_init().
 */
static struct stack_record **stack_table;
/*
 * Fixed order of the number of table buckets. Used when KASAN is enabled.
 * When left at 0, the number of buckets is scaled automatically.
 */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack traces. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. Incremented by depot_init_pool(). */
static int pools_num;
/* Next stack in the freelist of stack records within stack_pools. */
static struct stack_record *next_stack;
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool. This flag marks whether this extra pool
 * needs to be allocated. It is false when either an extra pool is already
 * allocated or the limit on the number of pools is reached.
 */
static bool new_pool_required = true;
/*
 * Lock that protects the variables above. Functions that modify the pools or
 * the freelist assert that it is held for writing; depot_fetch_stack()
 * asserts that it is held for reading.
 */
static DEFINE_RWLOCK(pool_rwlock);
104 
105 static int __init disable_stack_depot(char *str)
106 {
107 	return kstrtobool(str, &stack_depot_disabled);
108 }
109 early_param("stack_depot_disable", disable_stack_depot);
110 
111 void __init stack_depot_request_early_init(void)
112 {
113 	/* Too late to request early init now. */
114 	WARN_ON(__stack_depot_early_init_passed);
115 
116 	__stack_depot_early_init_requested = true;
117 }
118 
119 /* Allocates a hash table via memblock. Can only be used during early boot. */
120 int __init stack_depot_early_init(void)
121 {
122 	unsigned long entries = 0;
123 
124 	/* This function must be called only once, from mm_init(). */
125 	if (WARN_ON(__stack_depot_early_init_passed))
126 		return 0;
127 	__stack_depot_early_init_passed = true;
128 
129 	/*
130 	 * Print disabled message even if early init has not been requested:
131 	 * stack_depot_init() will not print one.
132 	 */
133 	if (stack_depot_disabled) {
134 		pr_info("disabled\n");
135 		return 0;
136 	}
137 
138 	/*
139 	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
140 	 * in fuzzing scenarios, which leads to a large number of different
141 	 * stack traces being stored in stack depot.
142 	 */
143 	if (kasan_enabled() && !stack_bucket_number_order)
144 		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;
145 
146 	/*
147 	 * Check if early init has been requested after setting
148 	 * stack_bucket_number_order: stack_depot_init() uses its value.
149 	 */
150 	if (!__stack_depot_early_init_requested)
151 		return 0;
152 
153 	/*
154 	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
155 	 * on the automatic calculations performed by alloc_large_system_hash.
156 	 */
157 	if (stack_bucket_number_order)
158 		entries = 1UL << stack_bucket_number_order;
159 	pr_info("allocating hash table via alloc_large_system_hash\n");
160 	stack_table = alloc_large_system_hash("stackdepot",
161 						sizeof(struct stack_record *),
162 						entries,
163 						STACK_HASH_TABLE_SCALE,
164 						HASH_EARLY | HASH_ZERO,
165 						NULL,
166 						&stack_hash_mask,
167 						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
168 						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
169 	if (!stack_table) {
170 		pr_err("hash table allocation failed, disabling\n");
171 		stack_depot_disabled = true;
172 		return -ENOMEM;
173 	}
174 
175 	return 0;
176 }
177 
178 /* Allocates a hash table via kvcalloc. Can be used after boot. */
179 int stack_depot_init(void)
180 {
181 	static DEFINE_MUTEX(stack_depot_init_mutex);
182 	unsigned long entries;
183 	int ret = 0;
184 
185 	mutex_lock(&stack_depot_init_mutex);
186 
187 	if (stack_depot_disabled || stack_table)
188 		goto out_unlock;
189 
190 	/*
191 	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
192 	 * if assigned, and rely on automatic scaling otherwise.
193 	 */
194 	if (stack_bucket_number_order) {
195 		entries = 1UL << stack_bucket_number_order;
196 	} else {
197 		int scale = STACK_HASH_TABLE_SCALE;
198 
199 		entries = nr_free_buffer_pages();
200 		entries = roundup_pow_of_two(entries);
201 
202 		if (scale > PAGE_SHIFT)
203 			entries >>= (scale - PAGE_SHIFT);
204 		else
205 			entries <<= (PAGE_SHIFT - scale);
206 	}
207 
208 	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
209 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
210 	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
211 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;
212 
213 	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
214 	stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL);
215 	if (!stack_table) {
216 		pr_err("hash table allocation failed, disabling\n");
217 		stack_depot_disabled = true;
218 		ret = -ENOMEM;
219 		goto out_unlock;
220 	}
221 	stack_hash_mask = entries - 1;
222 
223 out_unlock:
224 	mutex_unlock(&stack_depot_init_mutex);
225 
226 	return ret;
227 }
228 EXPORT_SYMBOL_GPL(stack_depot_init);
229 
230 /* Initializes a stack depol pool. */
231 static void depot_init_pool(void *pool)
232 {
233 	const int records_in_pool = DEPOT_POOL_SIZE / DEPOT_STACK_RECORD_SIZE;
234 	int i, offset;
235 
236 	lockdep_assert_held_write(&pool_rwlock);
237 
238 	/* Initialize handles and link stack records to each other. */
239 	for (i = 0, offset = 0;
240 	     offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
241 	     i++, offset += DEPOT_STACK_RECORD_SIZE) {
242 		struct stack_record *stack = pool + offset;
243 
244 		stack->handle.pool_index = pools_num;
245 		stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
246 		stack->handle.extra = 0;
247 
248 		if (i < records_in_pool - 1)
249 			stack->next = (void *)stack + DEPOT_STACK_RECORD_SIZE;
250 		else
251 			stack->next = NULL;
252 	}
253 
254 	/* Link stack records into the freelist. */
255 	WARN_ON(next_stack);
256 	next_stack = pool;
257 
258 	/* Save reference to the pool to be used by depot_fetch_stack(). */
259 	stack_pools[pools_num] = pool;
260 	pools_num++;
261 }
262 
263 /* Keeps the preallocated memory to be used for a new stack depot pool. */
264 static void depot_keep_new_pool(void **prealloc)
265 {
266 	lockdep_assert_held_write(&pool_rwlock);
267 
268 	/*
269 	 * If a new pool is already saved or the maximum number of
270 	 * pools is reached, do not use the preallocated memory.
271 	 */
272 	if (!new_pool_required)
273 		return;
274 
275 	/*
276 	 * Use the preallocated memory for the new pool
277 	 * as long as we do not exceed the maximum number of pools.
278 	 */
279 	if (pools_num < DEPOT_MAX_POOLS) {
280 		new_pool = *prealloc;
281 		*prealloc = NULL;
282 	}
283 
284 	/*
285 	 * At this point, either a new pool is kept or the maximum
286 	 * number of pools is reached. In either case, take note that
287 	 * keeping another pool is not required.
288 	 */
289 	new_pool_required = false;
290 }
291 
292 /* Updates references to the current and the next stack depot pools. */
293 static bool depot_update_pools(void **prealloc)
294 {
295 	lockdep_assert_held_write(&pool_rwlock);
296 
297 	/* Check if we still have objects in the freelist. */
298 	if (next_stack)
299 		goto out_keep_prealloc;
300 
301 	/* Check if we have a new pool saved and use it. */
302 	if (new_pool) {
303 		depot_init_pool(new_pool);
304 		new_pool = NULL;
305 
306 		/* Take note that we might need a new new_pool. */
307 		if (pools_num < DEPOT_MAX_POOLS)
308 			new_pool_required = true;
309 
310 		/* Try keeping the preallocated memory for new_pool. */
311 		goto out_keep_prealloc;
312 	}
313 
314 	/* Bail out if we reached the pool limit. */
315 	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
316 		WARN_ONCE(1, "Stack depot reached limit capacity");
317 		return false;
318 	}
319 
320 	/* Check if we have preallocated memory and use it. */
321 	if (*prealloc) {
322 		depot_init_pool(*prealloc);
323 		*prealloc = NULL;
324 		return true;
325 	}
326 
327 	return false;
328 
329 out_keep_prealloc:
330 	/* Keep the preallocated memory for a new pool if required. */
331 	if (*prealloc)
332 		depot_keep_new_pool(prealloc);
333 	return true;
334 }
335 
336 /* Allocates a new stack in a stack depot pool. */
337 static struct stack_record *
338 depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
339 {
340 	struct stack_record *stack;
341 
342 	lockdep_assert_held_write(&pool_rwlock);
343 
344 	/* Update current and new pools if required and possible. */
345 	if (!depot_update_pools(prealloc))
346 		return NULL;
347 
348 	/* Check if we have a stack record to save the stack trace. */
349 	stack = next_stack;
350 	if (!stack)
351 		return NULL;
352 
353 	/* Advance the freelist. */
354 	next_stack = stack->next;
355 
356 	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
357 	if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
358 		size = CONFIG_STACKDEPOT_MAX_FRAMES;
359 
360 	/* Save the stack trace. */
361 	stack->next = NULL;
362 	stack->hash = hash;
363 	stack->size = size;
364 	/* stack->handle is already filled in by depot_init_pool(). */
365 	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
366 
367 	/*
368 	 * Let KMSAN know the stored stack record is initialized. This shall
369 	 * prevent false positive reports if instrumented code accesses it.
370 	 */
371 	kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
372 
373 	return stack;
374 }
375 
376 static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
377 {
378 	union handle_parts parts = { .handle = handle };
379 	void *pool;
380 	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
381 	struct stack_record *stack;
382 
383 	lockdep_assert_held_read(&pool_rwlock);
384 
385 	if (parts.pool_index > pools_num) {
386 		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
387 		     parts.pool_index, pools_num, handle);
388 		return NULL;
389 	}
390 
391 	pool = stack_pools[parts.pool_index];
392 	if (!pool)
393 		return NULL;
394 
395 	stack = pool + offset;
396 	return stack;
397 }
398 
399 /* Calculates the hash for a stack. */
400 static inline u32 hash_stack(unsigned long *entries, unsigned int size)
401 {
402 	return jhash2((u32 *)entries,
403 		      array_size(size,  sizeof(*entries)) / sizeof(u32),
404 		      STACK_HASH_SEED);
405 }
406 
/*
 * Non-instrumented replacement for memcmp() that works on arrays of
 * unsigned long. It only reports equality: returns 0 if the first @n
 * elements of @u1 and @u2 match and 1 otherwise, with no ordering implied.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (u1[i] != u2[i])
			return 1;
	}
	return 0;
}
421 
422 /* Finds a stack in a bucket of the hash table. */
423 static inline struct stack_record *find_stack(struct stack_record *bucket,
424 					     unsigned long *entries, int size,
425 					     u32 hash)
426 {
427 	struct stack_record *found;
428 
429 	lockdep_assert_held(&pool_rwlock);
430 
431 	for (found = bucket; found; found = found->next) {
432 		if (found->hash == hash &&
433 		    found->size == size &&
434 		    !stackdepot_memcmp(entries, found->entries, size))
435 			return found;
436 	}
437 	return NULL;
438 }
439 
440 depot_stack_handle_t __stack_depot_save(unsigned long *entries,
441 					unsigned int nr_entries,
442 					gfp_t alloc_flags, bool can_alloc)
443 {
444 	struct stack_record *found = NULL, **bucket;
445 	depot_stack_handle_t handle = 0;
446 	struct page *page = NULL;
447 	void *prealloc = NULL;
448 	bool need_alloc = false;
449 	unsigned long flags;
450 	u32 hash;
451 
452 	/*
453 	 * If this stack trace is from an interrupt, including anything before
454 	 * interrupt entry usually leads to unbounded stack depot growth.
455 	 *
456 	 * Since use of filter_irq_stacks() is a requirement to ensure stack
457 	 * depot can efficiently deduplicate interrupt stacks, always
458 	 * filter_irq_stacks() to simplify all callers' use of stack depot.
459 	 */
460 	nr_entries = filter_irq_stacks(entries, nr_entries);
461 
462 	if (unlikely(nr_entries == 0) || stack_depot_disabled)
463 		return 0;
464 
465 	hash = hash_stack(entries, nr_entries);
466 	bucket = &stack_table[hash & stack_hash_mask];
467 
468 	read_lock_irqsave(&pool_rwlock, flags);
469 
470 	/* Fast path: look the stack trace up without full locking. */
471 	found = find_stack(*bucket, entries, nr_entries, hash);
472 	if (found) {
473 		read_unlock_irqrestore(&pool_rwlock, flags);
474 		goto exit;
475 	}
476 
477 	/* Take note if another stack pool needs to be allocated. */
478 	if (new_pool_required)
479 		need_alloc = true;
480 
481 	read_unlock_irqrestore(&pool_rwlock, flags);
482 
483 	/*
484 	 * Allocate memory for a new pool if required now:
485 	 * we won't be able to do that under the lock.
486 	 */
487 	if (unlikely(can_alloc && need_alloc)) {
488 		/*
489 		 * Zero out zone modifiers, as we don't have specific zone
490 		 * requirements. Keep the flags related to allocation in atomic
491 		 * contexts and I/O.
492 		 */
493 		alloc_flags &= ~GFP_ZONEMASK;
494 		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
495 		alloc_flags |= __GFP_NOWARN;
496 		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
497 		if (page)
498 			prealloc = page_address(page);
499 	}
500 
501 	write_lock_irqsave(&pool_rwlock, flags);
502 
503 	found = find_stack(*bucket, entries, nr_entries, hash);
504 	if (!found) {
505 		struct stack_record *new =
506 			depot_alloc_stack(entries, nr_entries, hash, &prealloc);
507 
508 		if (new) {
509 			new->next = *bucket;
510 			*bucket = new;
511 			found = new;
512 		}
513 	} else if (prealloc) {
514 		/*
515 		 * Stack depot already contains this stack trace, but let's
516 		 * keep the preallocated memory for future.
517 		 */
518 		depot_keep_new_pool(&prealloc);
519 	}
520 
521 	write_unlock_irqrestore(&pool_rwlock, flags);
522 exit:
523 	if (prealloc) {
524 		/* Stack depot didn't use this memory, free it. */
525 		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
526 	}
527 	if (found)
528 		handle = found->handle.handle;
529 	return handle;
530 }
531 EXPORT_SYMBOL_GPL(__stack_depot_save);
532 
533 depot_stack_handle_t stack_depot_save(unsigned long *entries,
534 				      unsigned int nr_entries,
535 				      gfp_t alloc_flags)
536 {
537 	return __stack_depot_save(entries, nr_entries, alloc_flags, true);
538 }
539 EXPORT_SYMBOL_GPL(stack_depot_save);
540 
541 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
542 			       unsigned long **entries)
543 {
544 	struct stack_record *stack;
545 	unsigned long flags;
546 
547 	*entries = NULL;
548 	/*
549 	 * Let KMSAN know *entries is initialized. This shall prevent false
550 	 * positive reports if instrumented code accesses it.
551 	 */
552 	kmsan_unpoison_memory(entries, sizeof(*entries));
553 
554 	if (!handle || stack_depot_disabled)
555 		return 0;
556 
557 	read_lock_irqsave(&pool_rwlock, flags);
558 
559 	stack = depot_fetch_stack(handle);
560 
561 	read_unlock_irqrestore(&pool_rwlock, flags);
562 
563 	*entries = stack->entries;
564 	return stack->size;
565 }
566 EXPORT_SYMBOL_GPL(stack_depot_fetch);
567 
568 void stack_depot_print(depot_stack_handle_t stack)
569 {
570 	unsigned long *entries;
571 	unsigned int nr_entries;
572 
573 	nr_entries = stack_depot_fetch(stack, &entries);
574 	if (nr_entries > 0)
575 		stack_trace_print(entries, nr_entries, 0);
576 }
577 EXPORT_SYMBOL_GPL(stack_depot_print);
578 
579 int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
580 		       int spaces)
581 {
582 	unsigned long *entries;
583 	unsigned int nr_entries;
584 
585 	nr_entries = stack_depot_fetch(handle, &entries);
586 	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
587 						spaces) : 0;
588 }
589 EXPORT_SYMBOL_GPL(stack_depot_snprint);
590 
591 depot_stack_handle_t __must_check stack_depot_set_extra_bits(
592 			depot_stack_handle_t handle, unsigned int extra_bits)
593 {
594 	union handle_parts parts = { .handle = handle };
595 
596 	/* Don't set extra bits on empty handles. */
597 	if (!handle)
598 		return 0;
599 
600 	parts.extra = extra_bits;
601 	return parts.handle;
602 }
603 EXPORT_SYMBOL(stack_depot_set_extra_bits);
604 
605 unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
606 {
607 	union handle_parts parts = { .handle = handle };
608 
609 	return parts.extra;
610 }
611 EXPORT_SYMBOL(stack_depot_get_extra_bits);
612