xref: /linux-6.15/lib/stackdepot.c (revision 3ee34eab)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stack depot - a stack trace storage that avoids duplication.
4  *
5  * Internally, stack depot maintains a hash table of unique stacktraces. The
6  * stack traces themselves are stored contiguously one after another in a set
7  * of separate page allocations.
8  *
9  * Author: Alexander Potapenko <[email protected]>
10  * Copyright (C) 2016 Google, Inc.
11  *
12  * Based on the code by Dmitry Chernenkov.
13  */
14 
15 #define pr_fmt(fmt) "stackdepot: " fmt
16 
17 #include <linux/debugfs.h>
18 #include <linux/gfp.h>
19 #include <linux/jhash.h>
20 #include <linux/kernel.h>
21 #include <linux/kmsan.h>
22 #include <linux/list.h>
23 #include <linux/mm.h>
24 #include <linux/mutex.h>
25 #include <linux/poison.h>
26 #include <linux/printk.h>
27 #include <linux/rculist.h>
28 #include <linux/rcupdate.h>
29 #include <linux/refcount.h>
30 #include <linux/slab.h>
31 #include <linux/spinlock.h>
32 #include <linux/stacktrace.h>
33 #include <linux/stackdepot.h>
34 #include <linux/string.h>
35 #include <linux/types.h>
36 #include <linux/memblock.h>
37 #include <linux/kasan-enabled.h>
38 
39 #define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)
40 
41 #define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
42 #define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
43 #define DEPOT_STACK_ALIGN 4
44 #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
45 #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
46 			       STACK_DEPOT_EXTRA_BITS)
47 #define DEPOT_POOLS_CAP 8192
48 /* The pool_index is offset by 1 so the first record does not have a 0 handle. */
49 #define DEPOT_MAX_POOLS \
50 	(((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \
51 	 (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP)
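/*
 * For orientation (a worked example, assuming 4 KB pages, a 32-bit
 * depot_stack_handle_t and the default STACK_DEPOT_EXTRA_BITS of 5):
 *
 *	DEPOT_POOL_SIZE       = 1 << (12 + 2)       = 16 KB per pool
 *	DEPOT_OFFSET_BITS     = 2 + 12 - 4          = 10
 *	DEPOT_POOL_INDEX_BITS = 32 - 10 - 5         = 17
 *	DEPOT_MAX_POOLS       = min(2^17 - 1, 8192) = 8192
 *
 * so a handle can address up to 8192 pools of 16 KB each (roughly 128 MB of
 * stored records), with each record start aligned to
 * 1 << DEPOT_STACK_ALIGN = 16 bytes.
 */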
52 
53 /* Compact structure that stores a reference to a stack. */
54 union handle_parts {
55 	depot_stack_handle_t handle;
56 	struct {
57 		u32 pool_index	: DEPOT_POOL_INDEX_BITS; /* pool_index is offset by 1 */
58 		u32 offset	: DEPOT_OFFSET_BITS;
59 		u32 extra	: STACK_DEPOT_EXTRA_BITS;
60 	};
61 };
62 
63 struct stack_record {
64 	struct list_head hash_list;	/* Links in the hash table */
65 	u32 hash;			/* Hash in hash table */
66 	u32 size;			/* Number of stored frames */
67 	union handle_parts handle;	/* Constant after initialization */
68 	refcount_t count;
69 	union {
70 		unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
71 		struct {
72 			/*
73 			 * An important invariant of the implementation is that a
74 			 * stack record is only placed onto the freelist if its
75 			 * refcount is zero. Because stack records with a zero
76 			 * refcount are never considered as valid, it is safe to
77 			 * union @entries and freelist management state below.
78 			 * Conversely, as soon as an entry is off the freelist
79 			 * and its refcount becomes non-zero, the below must not
80 			 * be accessed until being placed back on the freelist.
81 			 */
82 			struct list_head free_list;	/* Links in the freelist */
83 			unsigned long rcu_state;	/* RCU cookie */
84 		};
85 	};
86 };
87 
88 static bool stack_depot_disabled;
89 static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
90 static bool __stack_depot_early_init_passed __initdata;
91 
92 /* Use one hash table bucket per 16 KB of memory. */
93 #define STACK_HASH_TABLE_SCALE 14
94 /* Limit the number of buckets between 4K and 1M. */
95 #define STACK_BUCKET_NUMBER_ORDER_MIN 12
96 #define STACK_BUCKET_NUMBER_ORDER_MAX 20
97 /* Initial seed for jhash2. */
98 #define STACK_HASH_SEED 0x9747b28c
99 
100 /* Hash table of stored stack records. */
101 static struct list_head *stack_table;
102 /* Fixed order of the number of table buckets. Used when KASAN is enabled. */
103 static unsigned int stack_bucket_number_order;
104 /* Hash mask for indexing the table. */
105 static unsigned int stack_hash_mask;
106 
107 /* Array of memory regions that store stack records. */
108 static void *stack_pools[DEPOT_MAX_POOLS];
109 /* Newly allocated pool that is not yet added to stack_pools. */
110 static void *new_pool;
111 /* Number of pools in stack_pools. */
112 static int pools_num;
113 /* Offset to the unused space in the currently used pool. */
114 static size_t pool_offset = DEPOT_POOL_SIZE;
115 /* Freelist of stack records within stack_pools. */
116 static LIST_HEAD(free_stacks);
117 /* The lock must be held when performing pool or freelist modifications. */
118 static DEFINE_RAW_SPINLOCK(pool_lock);
119 
120 /* Statistics counters for debugfs. */
121 enum depot_counter_id {
122 	DEPOT_COUNTER_REFD_ALLOCS,
123 	DEPOT_COUNTER_REFD_FREES,
124 	DEPOT_COUNTER_REFD_INUSE,
125 	DEPOT_COUNTER_FREELIST_SIZE,
126 	DEPOT_COUNTER_PERSIST_COUNT,
127 	DEPOT_COUNTER_PERSIST_BYTES,
128 	DEPOT_COUNTER_COUNT,
129 };
130 static long counters[DEPOT_COUNTER_COUNT];
131 static const char *const counter_names[] = {
132 	[DEPOT_COUNTER_REFD_ALLOCS]	= "refcounted_allocations",
133 	[DEPOT_COUNTER_REFD_FREES]	= "refcounted_frees",
134 	[DEPOT_COUNTER_REFD_INUSE]	= "refcounted_in_use",
135 	[DEPOT_COUNTER_FREELIST_SIZE]	= "freelist_size",
136 	[DEPOT_COUNTER_PERSIST_COUNT]	= "persistent_count",
137 	[DEPOT_COUNTER_PERSIST_BYTES]	= "persistent_bytes",
138 };
139 static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
140 
141 static int __init disable_stack_depot(char *str)
142 {
143 	return kstrtobool(str, &stack_depot_disabled);
144 }
145 early_param("stack_depot_disable", disable_stack_depot);
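/*
 * Stack depot can be turned off from the kernel command line, e.g.:
 *
 *	stack_depot_disable=1	(parsed by kstrtobool(), so "1" or "on" both work)
 *
 * When disabled, stack_depot_early_init() prints "disabled" once and the
 * save/fetch/put entry points below all return early.
 */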
146 
147 void __init stack_depot_request_early_init(void)
148 {
149 	/* Too late to request early init now. */
150 	WARN_ON(__stack_depot_early_init_passed);
151 
152 	__stack_depot_early_init_requested = true;
153 }
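/*
 * A sketch of the intended use (hypothetical caller): a debugging feature
 * whose own early_param handler runs before mm_init() can request the
 * memblock-backed hash table up front, for example:
 *
 *	static int __init my_debug_setup(char *str)
 *	{
 *		my_debug_enabled = true;
 *		stack_depot_request_early_init();
 *		return 0;
 *	}
 *	early_param("my_debug", my_debug_setup);
 *
 * (The my_debug_* names are illustrative only.)
 */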
154 
155 /* Initialize list_head's within the hash table. */
156 static void init_stack_table(unsigned long entries)
157 {
158 	unsigned long i;
159 
160 	for (i = 0; i < entries; i++)
161 		INIT_LIST_HEAD(&stack_table[i]);
162 }
163 
164 /* Allocates a hash table via memblock. Can only be used during early boot. */
165 int __init stack_depot_early_init(void)
166 {
167 	unsigned long entries = 0;
168 
169 	/* This function must be called only once, from mm_init(). */
170 	if (WARN_ON(__stack_depot_early_init_passed))
171 		return 0;
172 	__stack_depot_early_init_passed = true;
173 
174 	/*
175 	 * Print disabled message even if early init has not been requested:
176 	 * stack_depot_init() will not print one.
177 	 */
178 	if (stack_depot_disabled) {
179 		pr_info("disabled\n");
180 		return 0;
181 	}
182 
183 	/*
184 	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
185 	 * in fuzzing scenarios, which leads to a large number of different
186 	 * stack traces being stored in stack depot.
187 	 */
188 	if (kasan_enabled() && !stack_bucket_number_order)
189 		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;
190 
191 	/*
192 	 * Check if early init has been requested after setting
193 	 * stack_bucket_number_order: stack_depot_init() uses its value.
194 	 */
195 	if (!__stack_depot_early_init_requested)
196 		return 0;
197 
198 	/*
199 	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
200 	 * on the automatic calculations performed by alloc_large_system_hash().
201 	 */
202 	if (stack_bucket_number_order)
203 		entries = 1UL << stack_bucket_number_order;
204 	pr_info("allocating hash table via alloc_large_system_hash\n");
205 	stack_table = alloc_large_system_hash("stackdepot",
206 						sizeof(struct list_head),
207 						entries,
208 						STACK_HASH_TABLE_SCALE,
209 						HASH_EARLY,
210 						NULL,
211 						&stack_hash_mask,
212 						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
213 						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
214 	if (!stack_table) {
215 		pr_err("hash table allocation failed, disabling\n");
216 		stack_depot_disabled = true;
217 		return -ENOMEM;
218 	}
219 	if (!entries) {
220 		/*
221 		 * Obtain the number of entries that was calculated by
222 		 * alloc_large_system_hash().
223 		 */
224 		entries = stack_hash_mask + 1;
225 	}
226 	init_stack_table(entries);
227 
228 	return 0;
229 }
230 
231 /* Allocates a hash table via kvcalloc. Can be used after boot. */
232 int stack_depot_init(void)
233 {
234 	static DEFINE_MUTEX(stack_depot_init_mutex);
235 	unsigned long entries;
236 	int ret = 0;
237 
238 	mutex_lock(&stack_depot_init_mutex);
239 
240 	if (stack_depot_disabled || stack_table)
241 		goto out_unlock;
242 
243 	/*
244 	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
245 	 * if assigned, and rely on automatic scaling otherwise.
246 	 */
247 	if (stack_bucket_number_order) {
248 		entries = 1UL << stack_bucket_number_order;
249 	} else {
250 		int scale = STACK_HASH_TABLE_SCALE;
251 
252 		entries = nr_free_buffer_pages();
253 		entries = roundup_pow_of_two(entries);
254 
255 		if (scale > PAGE_SHIFT)
256 			entries >>= (scale - PAGE_SHIFT);
257 		else
258 			entries <<= (PAGE_SHIFT - scale);
259 	}
260 
261 	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
262 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
263 	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
264 		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;
265 
266 	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
267 	stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
268 	if (!stack_table) {
269 		pr_err("hash table allocation failed, disabling\n");
270 		stack_depot_disabled = true;
271 		ret = -ENOMEM;
272 		goto out_unlock;
273 	}
274 	stack_hash_mask = entries - 1;
275 	init_stack_table(entries);
276 
277 out_unlock:
278 	mutex_unlock(&stack_depot_init_mutex);
279 
280 	return ret;
281 }
282 EXPORT_SYMBOL_GPL(stack_depot_init);
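/*
 * For orientation: STACK_HASH_TABLE_SCALE of 14 means roughly one bucket per
 * 16 KB of memory. Assuming 4 KB pages, a machine with about 8 GB of free
 * buffer pages ends up with 2^33 / 2^14 = 2^19 = 524288 buckets, comfortably
 * inside the [2^12, 2^20] clamp applied above; smaller and larger machines
 * hit the lower and upper bound respectively.
 */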
283 
284 /*
285  * Initializes new stack pool, and updates the list of pools.
286  */
287 static bool depot_init_pool(void **prealloc)
288 {
289 	lockdep_assert_held(&pool_lock);
290 
291 	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
292 		/* Bail out if we reached the pool limit. */
293 		WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
294 		WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
295 		WARN_ONCE(1, "Stack depot reached limit capacity");
296 		return false;
297 	}
298 
299 	if (!new_pool && *prealloc) {
300 		/* We have preallocated memory, use it. */
301 		WRITE_ONCE(new_pool, *prealloc);
302 		*prealloc = NULL;
303 	}
304 
305 	if (!new_pool)
306 		return false; /* new_pool and *prealloc are NULL */
307 
308 	/* Save reference to the pool to be used by depot_fetch_stack(). */
309 	stack_pools[pools_num] = new_pool;
310 
311 	/*
312 	 * Stack depot tries to keep an extra pool allocated even before it runs
313 	 * out of space in the currently used pool.
314 	 *
315 	 * To indicate that a new preallocation is needed, new_pool is reset to
316 	 * NULL; do not reset to NULL if we have reached the maximum number of
317 	 * pools.
318 	 */
319 	if (pools_num < DEPOT_MAX_POOLS)
320 		WRITE_ONCE(new_pool, NULL);
321 	else
322 		WRITE_ONCE(new_pool, STACK_DEPOT_POISON);
323 
324 	/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
325 	WRITE_ONCE(pools_num, pools_num + 1);
326 	ASSERT_EXCLUSIVE_WRITER(pools_num);
327 
328 	pool_offset = 0;
329 
330 	return true;
331 }
332 
333 /* Keeps the preallocated memory to be used for a new stack depot pool. */
334 static void depot_keep_new_pool(void **prealloc)
335 {
336 	lockdep_assert_held(&pool_lock);
337 
338 	/*
339 	 * If a new pool is already saved or the maximum number of
340 	 * pools is reached, do not use the preallocated memory.
341 	 */
342 	if (new_pool)
343 		return;
344 
345 	WRITE_ONCE(new_pool, *prealloc);
346 	*prealloc = NULL;
347 }
348 
349 /*
350  * Try to initialize a new stack record from the current pool, a cached pool, or
351  * the current pre-allocation.
352  */
353 static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
354 {
355 	struct stack_record *stack;
356 	void *current_pool;
357 	u32 pool_index;
358 
359 	lockdep_assert_held(&pool_lock);
360 
361 	if (pool_offset + size > DEPOT_POOL_SIZE) {
362 		if (!depot_init_pool(prealloc))
363 			return NULL;
364 	}
365 
366 	if (WARN_ON_ONCE(pools_num < 1))
367 		return NULL;
368 	pool_index = pools_num - 1;
369 	current_pool = stack_pools[pool_index];
370 	if (WARN_ON_ONCE(!current_pool))
371 		return NULL;
372 
373 	stack = current_pool + pool_offset;
374 
375 	/* Pre-initialize handle once. */
376 	stack->handle.pool_index = pool_index + 1;
377 	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
378 	stack->handle.extra = 0;
379 	INIT_LIST_HEAD(&stack->hash_list);
380 
381 	pool_offset += size;
382 
383 	return stack;
384 }
385 
386 /* Try to find next free usable entry from the freelist. */
387 static struct stack_record *depot_pop_free(void)
388 {
389 	struct stack_record *stack;
390 
391 	lockdep_assert_held(&pool_lock);
392 
393 	if (list_empty(&free_stacks))
394 		return NULL;
395 
396 	/*
397 	 * We maintain the invariant that the elements in front are least
398 	 * recently used, and are therefore more likely to be associated with an
399 	 * RCU grace period in the past. Consequently it is sufficient to only
400 	 * check the first entry.
401 	 */
402 	stack = list_first_entry(&free_stacks, struct stack_record, free_list);
403 	if (!poll_state_synchronize_rcu(stack->rcu_state))
404 		return NULL;
405 
406 	list_del(&stack->free_list);
407 	counters[DEPOT_COUNTER_FREELIST_SIZE]--;
408 
409 	return stack;
410 }
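/*
 * The rcu_state cookie consulted above follows the kernel's polled RCU
 * pattern; a minimal sketch of that pattern, independent of stack depot
 * (reuse_object() is a placeholder):
 *
 *	unsigned long cookie = get_state_synchronize_rcu();
 *	...
 *	if (poll_state_synchronize_rcu(cookie))
 *		reuse_object();
 *
 * poll_state_synchronize_rcu() only returns true once a full grace period has
 * elapsed since the cookie was taken, so no RCU reader can still hold a
 * reference obtained before it. This lets depot_pop_free() recycle records
 * without ever sleeping in synchronize_rcu(); the cookie itself is written in
 * depot_free_stack() below.
 */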
411 
412 static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
413 {
414 	const size_t used = flex_array_size(s, entries, nr_entries);
415 	const size_t unused = sizeof(s->entries) - used;
416 
417 	WARN_ON_ONCE(sizeof(s->entries) < used);
418 
419 	return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
420 }
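/*
 * Worked example (assuming a 64-bit arch and the default
 * CONFIG_STACKDEPOT_MAX_FRAMES of 64): sizeof(s->entries) is 64 * 8 = 512
 * bytes. A 4-frame trace needs only 32 of them, so 480 unused bytes are
 * trimmed off the end of struct stack_record and the result is rounded up to
 * the 16-byte (1 << DEPOT_STACK_ALIGN) boundary. Evictable records skip this
 * trimming and always reserve the full-size record, see depot_alloc_stack()
 * below.
 */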
421 
422 /* Allocates a new stack in a stack depot pool. */
423 static struct stack_record *
424 depot_alloc_stack(unsigned long *entries, unsigned int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
425 {
426 	struct stack_record *stack = NULL;
427 	size_t record_size;
428 
429 	lockdep_assert_held(&pool_lock);
430 
431 	/* This should already be checked by public API entry points. */
432 	if (WARN_ON_ONCE(!nr_entries))
433 		return NULL;
434 
435 	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
436 	if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
437 		nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;
438 
439 	if (flags & STACK_DEPOT_FLAG_GET) {
440 		/*
441 		 * Evictable entries have to allocate the max. size so they may
442 		 * safely be re-used by differently sized allocations.
443 		 */
444 		record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
445 		stack = depot_pop_free();
446 	} else {
447 		record_size = depot_stack_record_size(stack, nr_entries);
448 	}
449 
450 	if (!stack) {
451 		stack = depot_pop_free_pool(prealloc, record_size);
452 		if (!stack)
453 			return NULL;
454 	}
455 
456 	/* Save the stack trace. */
457 	stack->hash = hash;
458 	stack->size = nr_entries;
459 	/* stack->handle is already filled in by depot_pop_free_pool(). */
460 	memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));
461 
462 	if (flags & STACK_DEPOT_FLAG_GET) {
463 		refcount_set(&stack->count, 1);
464 		counters[DEPOT_COUNTER_REFD_ALLOCS]++;
465 		counters[DEPOT_COUNTER_REFD_INUSE]++;
466 	} else {
467 		/* Warn on attempts to switch to refcounting this entry. */
468 		refcount_set(&stack->count, REFCOUNT_SATURATED);
469 		counters[DEPOT_COUNTER_PERSIST_COUNT]++;
470 		counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
471 	}
472 
473 	/*
474 	 * Let KMSAN know the stored stack record is initialized. This shall
475 	 * prevent false positive reports if instrumented code accesses it.
476 	 */
477 	kmsan_unpoison_memory(stack, record_size);
478 
479 	return stack;
480 }
481 
482 static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
483 {
484 	const int pools_num_cached = READ_ONCE(pools_num);
485 	union handle_parts parts = { .handle = handle };
486 	void *pool;
487 	u32 pool_index = parts.pool_index - 1;
488 	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
489 	struct stack_record *stack;
490 
491 	lockdep_assert_not_held(&pool_lock);
492 
493 	if (pool_index >= pools_num_cached) {
494 		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
495 		     pool_index, pools_num_cached, handle);
496 		return NULL;
497 	}
498 
499 	pool = stack_pools[pool_index];
500 	if (WARN_ON(!pool))
501 		return NULL;
502 
503 	stack = pool + offset;
504 	if (WARN_ON(!refcount_read(&stack->count)))
505 		return NULL;
506 
507 	return stack;
508 }
509 
510 /* Links stack into the freelist. */
511 static void depot_free_stack(struct stack_record *stack)
512 {
513 	unsigned long flags;
514 
515 	lockdep_assert_not_held(&pool_lock);
516 
517 	raw_spin_lock_irqsave(&pool_lock, flags);
518 	printk_deferred_enter();
519 
520 	/*
521 	 * Remove the entry from the hash list. Concurrent list traversal may
522 	 * still observe the entry, but since the refcount is zero, this entry
523 	 * will no longer be considered as valid.
524 	 */
525 	list_del_rcu(&stack->hash_list);
526 
527 	/*
528 	 * Due to being used from constrained contexts such as the allocators,
529 	 * NMI, or even RCU itself, stack depot cannot rely on primitives that
530 	 * would sleep (such as synchronize_rcu()) or recursively call into
531 	 * stack depot again (such as call_rcu()).
532 	 *
533 	 * Instead, get an RCU cookie, so that we can ensure this entry isn't
534 	 * moved onto another list until the next grace period, and concurrent
535 	 * RCU list traversal remains safe.
536 	 */
537 	stack->rcu_state = get_state_synchronize_rcu();
538 
539 	/*
540 	 * Add the entry to the freelist tail, so that older entries are
541 	 * considered first - their RCU cookie is more likely to no longer be
542 	 * associated with the current grace period.
543 	 */
544 	list_add_tail(&stack->free_list, &free_stacks);
545 
546 	counters[DEPOT_COUNTER_FREELIST_SIZE]++;
547 	counters[DEPOT_COUNTER_REFD_FREES]++;
548 	counters[DEPOT_COUNTER_REFD_INUSE]--;
549 
550 	printk_deferred_exit();
551 	raw_spin_unlock_irqrestore(&pool_lock, flags);
552 }
553 
554 /* Calculates the hash for a stack. */
555 static inline u32 hash_stack(unsigned long *entries, unsigned int size)
556 {
557 	return jhash2((u32 *)entries,
558 		      array_size(size, sizeof(*entries)) / sizeof(u32),
559 		      STACK_HASH_SEED);
560 }
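/*
 * Note that jhash2() consumes u32 words: on a 64-bit kernel each stack entry
 * contributes two words, so a trace of N frames is hashed as 2 * N 32-bit
 * words.
 */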
561 
562 /*
563  * Non-instrumented version of memcmp().
564  * Does not check the lexicographical order, only the equality.
565  */
566 static inline
567 int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
568 			unsigned int n)
569 {
570 	for ( ; n-- ; u1++, u2++) {
571 		if (*u1 != *u2)
572 			return 1;
573 	}
574 	return 0;
575 }
576 
577 /* Finds a stack in a bucket of the hash table. */
578 static inline struct stack_record *find_stack(struct list_head *bucket,
579 					      unsigned long *entries, int size,
580 					      u32 hash, depot_flags_t flags)
581 {
582 	struct stack_record *stack, *ret = NULL;
583 
584 	/*
585 	 * Stack depot may be used from instrumentation that instruments RCU or
586 	 * tracing itself; use variant that does not call into RCU and cannot be
587 	 * traced.
588 	 *
589 	 * Note: Such use cases must take care when using refcounting to evict
590 	 * unused entries, because the stack record free-then-reuse code paths
591 	 * do call into RCU.
592 	 */
593 	rcu_read_lock_sched_notrace();
594 
595 	list_for_each_entry_rcu(stack, bucket, hash_list) {
596 		if (stack->hash != hash || stack->size != size)
597 			continue;
598 
599 		/*
600 		 * This may race with depot_free_stack() accessing the freelist
601 		 * management state unioned with @entries. The refcount is zero
602 		 * in that case and the below refcount_inc_not_zero() will fail.
603 		 */
604 		if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
605 			continue;
606 
607 		/*
608 		 * Try to increment refcount. If this succeeds, the stack record
609 		 * is valid and has not yet been freed.
610 		 *
611 		 * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
612 		 * to then call stack_depot_put() later, and we can assume that
613 		 * a stack record is never placed back on the freelist.
614 		 */
615 		if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
616 			continue;
617 
618 		ret = stack;
619 		break;
620 	}
621 
622 	rcu_read_unlock_sched_notrace();
623 
624 	return ret;
625 }
626 
627 depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
628 					    unsigned int nr_entries,
629 					    gfp_t alloc_flags,
630 					    depot_flags_t depot_flags)
631 {
632 	struct list_head *bucket;
633 	struct stack_record *found = NULL;
634 	depot_stack_handle_t handle = 0;
635 	struct page *page = NULL;
636 	void *prealloc = NULL;
637 	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
638 	unsigned long flags;
639 	u32 hash;
640 
641 	if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
642 		return 0;
643 
644 	/*
645 	 * If this stack trace is from an interrupt, including anything before
646 	 * interrupt entry usually leads to unbounded stack depot growth.
647 	 *
648 	 * Since use of filter_irq_stacks() is a requirement to ensure stack
649 	 * depot can efficiently deduplicate interrupt stacks, always call
650 	 * filter_irq_stacks() here to simplify all callers' use of stack depot.
651 	 */
652 	nr_entries = filter_irq_stacks(entries, nr_entries);
653 
654 	if (unlikely(nr_entries == 0) || stack_depot_disabled)
655 		return 0;
656 
657 	hash = hash_stack(entries, nr_entries);
658 	bucket = &stack_table[hash & stack_hash_mask];
659 
660 	/* Fast path: look the stack trace up without locking. */
661 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
662 	if (found)
663 		goto exit;
664 
665 	/*
666 	 * Allocate memory for a new pool if required now:
667 	 * we won't be able to do that under the lock.
668 	 */
669 	if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
670 		/*
671 		 * Zero out zone modifiers, as we don't have specific zone
672 		 * requirements. Keep the flags related to allocation in atomic
673 		 * contexts and I/O.
674 		 */
675 		alloc_flags &= ~GFP_ZONEMASK;
676 		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
677 		alloc_flags |= __GFP_NOWARN;
678 		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
679 		if (page)
680 			prealloc = page_address(page);
681 	}
682 
683 	raw_spin_lock_irqsave(&pool_lock, flags);
684 	printk_deferred_enter();
685 
686 	/* Try to find again, to avoid concurrently inserting duplicates. */
687 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
688 	if (!found) {
689 		struct stack_record *new =
690 			depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);
691 
692 		if (new) {
693 			/*
694 			 * This releases the stack record into the bucket and
695 			 * makes it visible to readers in find_stack().
696 			 */
697 			list_add_rcu(&new->hash_list, bucket);
698 			found = new;
699 		}
700 	}
701 
702 	if (prealloc) {
703 		/*
704 		 * Either stack depot already contains this stack trace, or
705 		 * depot_alloc_stack() did not consume the preallocated memory.
706 		 * Try to keep the preallocated memory for future.
707 		 */
708 		depot_keep_new_pool(&prealloc);
709 	}
710 
711 	printk_deferred_exit();
712 	raw_spin_unlock_irqrestore(&pool_lock, flags);
713 exit:
714 	if (prealloc) {
715 		/* Stack depot didn't use this memory, free it. */
716 		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
717 	}
718 	if (found)
719 		handle = found->handle.handle;
720 	return handle;
721 }
722 EXPORT_SYMBOL_GPL(stack_depot_save_flags);
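/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * capture the current stack with stack_trace_save(), deduplicate it via stack
 * depot, and drop the reference when the owning object dies. The
 * track_alloc()/track_free() names are illustrative only.
 *
 *	static depot_stack_handle_t track_alloc(gfp_t gfp)
 *	{
 *		unsigned long entries[16];
 *		unsigned int nr;
 *
 *		nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
 *		return stack_depot_save_flags(entries, nr, gfp,
 *					      STACK_DEPOT_FLAG_CAN_ALLOC |
 *					      STACK_DEPOT_FLAG_GET);
 *	}
 *
 *	static void track_free(depot_stack_handle_t handle)
 *	{
 *		stack_depot_put(handle);
 *	}
 *
 * Callers that never intend to call stack_depot_put() should omit
 * STACK_DEPOT_FLAG_GET so their trace is stored as a cheaper, persistent
 * (non-refcounted) record.
 */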
723 
724 depot_stack_handle_t stack_depot_save(unsigned long *entries,
725 				      unsigned int nr_entries,
726 				      gfp_t alloc_flags)
727 {
728 	return stack_depot_save_flags(entries, nr_entries, alloc_flags,
729 				      STACK_DEPOT_FLAG_CAN_ALLOC);
730 }
731 EXPORT_SYMBOL_GPL(stack_depot_save);
732 
733 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
734 			       unsigned long **entries)
735 {
736 	struct stack_record *stack;
737 
738 	*entries = NULL;
739 	/*
740 	 * Let KMSAN know *entries is initialized. This shall prevent false
741 	 * positive reports if instrumented code accesses it.
742 	 */
743 	kmsan_unpoison_memory(entries, sizeof(*entries));
744 
745 	if (!handle || stack_depot_disabled)
746 		return 0;
747 
748 	stack = depot_fetch_stack(handle);
749 	/*
750 	 * Should never be NULL, otherwise this is a use-after-put (or just a
751 	 * corrupt handle).
752 	 */
753 	if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
754 		return 0;
755 
756 	*entries = stack->entries;
757 	return stack->size;
758 }
759 EXPORT_SYMBOL_GPL(stack_depot_fetch);
760 
761 void stack_depot_put(depot_stack_handle_t handle)
762 {
763 	struct stack_record *stack;
764 
765 	if (!handle || stack_depot_disabled)
766 		return;
767 
768 	stack = depot_fetch_stack(handle);
769 	/*
770 	 * Should always be able to find the stack record, otherwise this is an
771 	 * unbalanced put attempt (or corrupt handle).
772 	 */
773 	if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
774 		return;
775 
776 	if (refcount_dec_and_test(&stack->count))
777 		depot_free_stack(stack);
778 }
779 EXPORT_SYMBOL_GPL(stack_depot_put);
780 
781 void stack_depot_print(depot_stack_handle_t stack)
782 {
783 	unsigned long *entries;
784 	unsigned int nr_entries;
785 
786 	nr_entries = stack_depot_fetch(stack, &entries);
787 	if (nr_entries > 0)
788 		stack_trace_print(entries, nr_entries, 0);
789 }
790 EXPORT_SYMBOL_GPL(stack_depot_print);
791 
792 int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
793 		       int spaces)
794 {
795 	unsigned long *entries;
796 	unsigned int nr_entries;
797 
798 	nr_entries = stack_depot_fetch(handle, &entries);
799 	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
800 						spaces) : 0;
801 }
802 EXPORT_SYMBOL_GPL(stack_depot_snprint);
803 
804 depot_stack_handle_t __must_check stack_depot_set_extra_bits(
805 			depot_stack_handle_t handle, unsigned int extra_bits)
806 {
807 	union handle_parts parts = { .handle = handle };
808 
809 	/* Don't set extra bits on empty handles. */
810 	if (!handle)
811 		return 0;
812 
813 	parts.extra = extra_bits;
814 	return parts.handle;
815 }
816 EXPORT_SYMBOL(stack_depot_set_extra_bits);
817 
818 unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
819 {
820 	union handle_parts parts = { .handle = handle };
821 
822 	return parts.extra;
823 }
824 EXPORT_SYMBOL(stack_depot_get_extra_bits);
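/*
 * A hedged sketch of how the extra bits are meant to be used: they live in a
 * dedicated bitfield of the handle and never influence which record the
 * handle resolves to, so callers can carry a few bits of their own metadata
 * alongside it ("tag" below is illustrative):
 *
 *	handle = stack_depot_set_extra_bits(handle, tag);
 *	...
 *	tag = stack_depot_get_extra_bits(handle);
 *
 * Tagged and untagged handles refer to the same record because
 * depot_fetch_stack() only decodes pool_index and offset.
 */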
825 
826 static int stats_show(struct seq_file *seq, void *v)
827 {
828 	/*
829 	 * data race ok: These are just statistics counters, and approximate
830 	 * statistics are ok for debugging.
831 	 */
832 	seq_printf(seq, "pools: %d\n", data_race(pools_num));
833 	for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
834 		seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
835 
836 	return 0;
837 }
838 DEFINE_SHOW_ATTRIBUTE(stats);
839 
840 static int depot_debugfs_init(void)
841 {
842 	struct dentry *dir;
843 
844 	if (stack_depot_disabled)
845 		return 0;
846 
847 	dir = debugfs_create_dir("stackdepot", NULL);
848 	debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
849 	return 0;
850 }
851 late_initcall(depot_debugfs_init);
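/*
 * With debugfs mounted in the usual place, the counters above are readable
 * from /sys/kernel/debug/stackdepot/stats; the output has the form (values
 * illustrative):
 *
 *	pools: 12
 *	refcounted_allocations: 1024
 *	refcounted_frees: 256
 *	refcounted_in_use: 768
 *	freelist_size: 256
 *	persistent_count: 3000
 *	persistent_bytes: 480000
 */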
852