// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack depot - a stack trace storage that avoids duplication.
 *
 * Internally, stack depot maintains a hash table of unique stacktraces. The
 * stack traces themselves are stored contiguously one after another in a set
 * of separate page allocations.
 *
 * Author: Alexander Potapenko <[email protected]>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on the code by Dmitry Chernenkov.
 */

#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/kmsan.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>

#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
#define DEPOT_STACK_ALIGN 4
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
			       STACK_DEPOT_EXTRA_BITS)
#define DEPOT_POOLS_CAP 8192
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)

/* Compact structure that stores a reference to a stack. */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		u32 offset	: DEPOT_OFFSET_BITS;
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};

struct stack_record {
	struct list_head list;	/* Links in hash table or freelist */
	u32 hash;		/* Hash in hash table */
	u32 size;		/* Number of stored frames */
	union handle_parts handle;
	refcount_t count;
	unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
};

#define DEPOT_STACK_RECORD_SIZE \
	ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
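/*
 * Worked example of the handle layout (a sketch, assuming 4 KB pages and the
 * STACK_DEPOT_EXTRA_BITS value from <linux/stackdepot.h>, typically 5): a
 * pool is 4 pages = 16 KB and records are aligned to 1 << DEPOT_STACK_ALIGN =
 * 16 bytes, so DEPOT_OFFSET_BITS = 2 + 12 - 4 = 10 bits address any record
 * within a pool. The remaining 32 - 10 - 5 = 17 bits index the pool, which is
 * more than the DEPOT_POOLS_CAP of 8192, so DEPOT_MAX_POOLS is capped at 8192
 * pools, or roughly 128 MB of stack records in total.
 */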
static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata =
	IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c

/* Hash table of stored stack records. */
static struct list_head *stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack records. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
static int pools_num;
/* Freelist of stack records within stack_pools. */
static LIST_HEAD(free_stacks);
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool. This flag marks whether this extra pool
 * needs to be allocated. It has the value 0 when either an extra pool is not
 * yet allocated or if the limit on the number of pools is reached.
 */
static bool new_pool_required = true;
/* Lock that protects the variables above. */
static DEFINE_RWLOCK(pool_rwlock);

static int __init disable_stack_depot(char *str)
{
	return kstrtobool(str, &stack_depot_disabled);
}
early_param("stack_depot_disable", disable_stack_depot);

void __init stack_depot_request_early_init(void)
{
	/* Too late to request early init now. */
	WARN_ON(__stack_depot_early_init_passed);

	__stack_depot_early_init_requested = true;
}

/* Initialize list_head's within the hash table. */
static void init_stack_table(unsigned long entries)
{
	unsigned long i;

	for (i = 0; i < entries; i++)
		INIT_LIST_HEAD(&stack_table[i]);
}

/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
	unsigned long entries = 0;

	/* This function must be called only once, from mm_init(). */
	if (WARN_ON(__stack_depot_early_init_passed))
		return 0;
	__stack_depot_early_init_passed = true;

	/*
	 * Print disabled message even if early init has not been requested:
	 * stack_depot_init() will not print one.
	 */
	if (stack_depot_disabled) {
		pr_info("disabled\n");
		return 0;
	}

	/*
	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
	 * in fuzzing scenarios, which leads to a large number of different
	 * stack traces being stored in stack depot.
	 */
	if (kasan_enabled() && !stack_bucket_number_order)
		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

	/*
	 * Check if early init has been requested after setting
	 * stack_bucket_number_order: stack_depot_init() uses its value.
	 */
	if (!__stack_depot_early_init_requested)
		return 0;

	/*
	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
	 * on the automatic calculations performed by alloc_large_system_hash().
	 */
	if (stack_bucket_number_order)
		entries = 1UL << stack_bucket_number_order;
	pr_info("allocating hash table via alloc_large_system_hash\n");
	stack_table = alloc_large_system_hash("stackdepot",
						sizeof(struct list_head),
						entries,
						STACK_HASH_TABLE_SCALE,
						HASH_EARLY,
						NULL,
						&stack_hash_mask,
						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		return -ENOMEM;
	}
	if (!entries) {
		/*
		 * Obtain the number of entries that was calculated by
		 * alloc_large_system_hash().
		 */
		entries = stack_hash_mask + 1;
	}
	init_stack_table(entries);

	return 0;
}
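/*
 * Usage sketch: a boot-time user that wants the hash table allocated from
 * memblock must ask for it before mm_init() runs, typically from an
 * early_param() handler. The "foo" names below are hypothetical:
 *
 *	static int __init early_foo_param(char *buf)
 *	{
 *		foo_enabled = true;
 *		stack_depot_request_early_init();
 *		return 0;
 *	}
 *	early_param("foo", early_foo_param);
 *
 * Users that only decide after boot can instead call stack_depot_init() from
 * regular context and let it allocate the table via kvcalloc().
 */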
/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
	static DEFINE_MUTEX(stack_depot_init_mutex);
	unsigned long entries;
	int ret = 0;

	mutex_lock(&stack_depot_init_mutex);

	if (stack_depot_disabled || stack_table)
		goto out_unlock;

	/*
	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
	 * if assigned, and rely on automatic scaling otherwise.
	 */
	if (stack_bucket_number_order) {
		entries = 1UL << stack_bucket_number_order;
	} else {
		int scale = STACK_HASH_TABLE_SCALE;

		entries = nr_free_buffer_pages();
		entries = roundup_pow_of_two(entries);

		if (scale > PAGE_SHIFT)
			entries >>= (scale - PAGE_SHIFT);
		else
			entries <<= (PAGE_SHIFT - scale);
	}

	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
	stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		ret = -ENOMEM;
		goto out_unlock;
	}
	stack_hash_mask = entries - 1;
	init_stack_table(entries);

out_unlock:
	mutex_unlock(&stack_depot_init_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);

/* Initializes a stack depot pool. */
static void depot_init_pool(void *pool)
{
	int offset;

	lockdep_assert_held_write(&pool_rwlock);

	WARN_ON(!list_empty(&free_stacks));

	/* Initialize handles and link stack records into the freelist. */
	for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
	     offset += DEPOT_STACK_RECORD_SIZE) {
		struct stack_record *stack = pool + offset;

		stack->handle.pool_index = pools_num;
		stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
		stack->handle.extra = 0;

		list_add(&stack->list, &free_stacks);
	}

	/* Save reference to the pool to be used by depot_fetch_stack(). */
	stack_pools[pools_num] = pool;
	pools_num++;
}
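/*
 * Back-of-the-envelope sizing (a sketch, assuming 4 KB pages, a 64-bit build
 * and the default CONFIG_STACKDEPOT_MAX_FRAMES of 64): sizeof(struct
 * stack_record) is roughly 16 + 4 + 4 + 4 + 4 + 64 * 8 = 544 bytes, which is
 * already a multiple of 1 << DEPOT_STACK_ALIGN, so each 16 KB pool is carved
 * into about 30 records and the small remainder at the end stays unused.
 */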
/* Keeps the preallocated memory to be used for a new stack depot pool. */
static void depot_keep_new_pool(void **prealloc)
{
	lockdep_assert_held_write(&pool_rwlock);

	/*
	 * If a new pool is already saved or the maximum number of
	 * pools is reached, do not use the preallocated memory.
	 */
	if (!new_pool_required)
		return;

	/*
	 * Use the preallocated memory for the new pool
	 * as long as we do not exceed the maximum number of pools.
	 */
	if (pools_num < DEPOT_MAX_POOLS) {
		new_pool = *prealloc;
		*prealloc = NULL;
	}

	/*
	 * At this point, either a new pool is kept or the maximum
	 * number of pools is reached. In either case, take note that
	 * keeping another pool is not required.
	 */
	new_pool_required = false;
}

/* Updates references to the current and the next stack depot pools. */
static bool depot_update_pools(void **prealloc)
{
	lockdep_assert_held_write(&pool_rwlock);

	/* Check if we still have objects in the freelist. */
	if (!list_empty(&free_stacks))
		goto out_keep_prealloc;

	/* Check if we have a new pool saved and use it. */
	if (new_pool) {
		depot_init_pool(new_pool);
		new_pool = NULL;

		/* Take note that we might need a new new_pool. */
		if (pools_num < DEPOT_MAX_POOLS)
			new_pool_required = true;

		/* Try keeping the preallocated memory for new_pool. */
		goto out_keep_prealloc;
	}

	/* Bail out if we reached the pool limit. */
	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
		WARN_ONCE(1, "Stack depot reached limit capacity");
		return false;
	}

	/* Check if we have preallocated memory and use it. */
	if (*prealloc) {
		depot_init_pool(*prealloc);
		*prealloc = NULL;
		return true;
	}

	return false;

out_keep_prealloc:
	/* Keep the preallocated memory for a new pool if required. */
	if (*prealloc)
		depot_keep_new_pool(prealloc);
	return true;
}

/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
	struct stack_record *stack;

	lockdep_assert_held_write(&pool_rwlock);

	/* Update current and new pools if required and possible. */
	if (!depot_update_pools(prealloc))
		return NULL;

	/* Check if we have a stack record to save the stack trace. */
	if (list_empty(&free_stacks))
		return NULL;

	/* Get and unlink the first entry from the freelist. */
	stack = list_first_entry(&free_stacks, struct stack_record, list);
	list_del(&stack->list);

	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
	if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
		size = CONFIG_STACKDEPOT_MAX_FRAMES;

	/* Save the stack trace. */
	stack->hash = hash;
	stack->size = size;
	/* stack->handle is already filled in by depot_init_pool(). */
	refcount_set(&stack->count, 1);
	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));

	/*
	 * Let KMSAN know the stored stack record is initialized. This shall
	 * prevent false positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);

	return stack;
}

static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };
	void *pool;
	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
	struct stack_record *stack;

	lockdep_assert_held(&pool_rwlock);

	if (parts.pool_index > pools_num) {
		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
		     parts.pool_index, pools_num, handle);
		return NULL;
	}

	pool = stack_pools[parts.pool_index];
	if (!pool)
		return NULL;

	stack = pool + offset;
	return stack;
}

/* Links stack into the freelist. */
static void depot_free_stack(struct stack_record *stack)
{
	lockdep_assert_held_write(&pool_rwlock);

	list_add(&stack->list, &free_stacks);
}

/* Calculates the hash for a stack. */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
	return jhash2((u32 *)entries,
		      array_size(size, sizeof(*entries)) / sizeof(u32),
		      STACK_HASH_SEED);
}

/*
 * Non-instrumented version of memcmp().
 * Does not check the lexicographical order, only the equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	for ( ; n-- ; u1++, u2++) {
		if (*u1 != *u2)
			return 1;
	}
	return 0;
}

/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct list_head *bucket,
					      unsigned long *entries, int size,
					      u32 hash)
{
	struct list_head *pos;
	struct stack_record *found;

	lockdep_assert_held(&pool_rwlock);

	list_for_each(pos, bucket) {
		found = list_entry(pos, struct stack_record, list);
		if (found->hash == hash &&
		    found->size == size &&
		    !stackdepot_memcmp(entries, found->entries, size))
			return found;
	}
	return NULL;
}
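/*
 * Usage sketch for the save/fetch API (hypothetical caller; the entries[]
 * size of 32 is an arbitrary choice):
 *
 *	unsigned long entries[32];
 *	unsigned int nr;
 *	depot_stack_handle_t handle;
 *
 *	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
 *	handle = stack_depot_save(entries, nr, GFP_NOWAIT);
 *	...
 *	stack_depot_print(handle);
 *
 * The 32-bit handle can be stored instead of the full trace; identical traces
 * deduplicate to the same handle.
 */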
depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
					    unsigned int nr_entries,
					    gfp_t alloc_flags,
					    depot_flags_t depot_flags)
{
	struct list_head *bucket;
	struct stack_record *found = NULL;
	depot_stack_handle_t handle = 0;
	struct page *page = NULL;
	void *prealloc = NULL;
	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
	bool need_alloc = false;
	unsigned long flags;
	u32 hash;

	if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
		return 0;

	/*
	 * If this stack trace is from an interrupt, including anything before
	 * interrupt entry usually leads to unbounded stack depot growth.
	 *
	 * Since use of filter_irq_stacks() is a requirement to ensure stack
	 * depot can efficiently deduplicate interrupt stacks, always
	 * filter_irq_stacks() to simplify all callers' use of stack depot.
	 */
	nr_entries = filter_irq_stacks(entries, nr_entries);

	if (unlikely(nr_entries == 0) || stack_depot_disabled)
		return 0;

	hash = hash_stack(entries, nr_entries);
	bucket = &stack_table[hash & stack_hash_mask];

	read_lock_irqsave(&pool_rwlock, flags);

	/* Fast path: look the stack trace up without full locking. */
	found = find_stack(bucket, entries, nr_entries, hash);
	if (found) {
		if (depot_flags & STACK_DEPOT_FLAG_GET)
			refcount_inc(&found->count);
		read_unlock_irqrestore(&pool_rwlock, flags);
		goto exit;
	}

	/* Take note if another stack pool needs to be allocated. */
	if (new_pool_required)
		need_alloc = true;

	read_unlock_irqrestore(&pool_rwlock, flags);

	/*
	 * Allocate memory for a new pool if required now:
	 * we won't be able to do that under the lock.
	 */
	if (unlikely(can_alloc && need_alloc)) {
		/*
		 * Zero out zone modifiers, as we don't have specific zone
		 * requirements. Keep the flags related to allocation in atomic
		 * contexts and I/O.
		 */
		alloc_flags &= ~GFP_ZONEMASK;
		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
		alloc_flags |= __GFP_NOWARN;
		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
		if (page)
			prealloc = page_address(page);
	}

	write_lock_irqsave(&pool_rwlock, flags);

	found = find_stack(bucket, entries, nr_entries, hash);
	if (!found) {
		struct stack_record *new =
			depot_alloc_stack(entries, nr_entries, hash, &prealloc);

		if (new) {
			list_add(&new->list, bucket);
			found = new;
		}
	} else {
		if (depot_flags & STACK_DEPOT_FLAG_GET)
			refcount_inc(&found->count);
		/*
		 * Stack depot already contains this stack trace, but let's
		 * keep the preallocated memory for future.
		 */
		if (prealloc)
			depot_keep_new_pool(&prealloc);
	}

	write_unlock_irqrestore(&pool_rwlock, flags);
exit:
	if (prealloc) {
		/* Stack depot didn't use this memory, free it. */
		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
	}
	if (found)
		handle = found->handle.handle;
	return handle;
}
EXPORT_SYMBOL_GPL(stack_depot_save_flags);

depot_stack_handle_t stack_depot_save(unsigned long *entries,
				      unsigned int nr_entries,
				      gfp_t alloc_flags)
{
	return stack_depot_save_flags(entries, nr_entries, alloc_flags,
				      STACK_DEPOT_FLAG_CAN_ALLOC);
}
EXPORT_SYMBOL_GPL(stack_depot_save);

unsigned int stack_depot_fetch(depot_stack_handle_t handle,
			       unsigned long **entries)
{
	struct stack_record *stack;
	unsigned long flags;

	*entries = NULL;
	/*
	 * Let KMSAN know *entries is initialized. This shall prevent false
	 * positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(entries, sizeof(*entries));

	if (!handle || stack_depot_disabled)
		return 0;

	read_lock_irqsave(&pool_rwlock, flags);

	stack = depot_fetch_stack(handle);

	read_unlock_irqrestore(&pool_rwlock, flags);

	*entries = stack->entries;
	return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);

void stack_depot_put(depot_stack_handle_t handle)
{
	struct stack_record *stack;
	unsigned long flags;

	if (!handle || stack_depot_disabled)
		return;

	write_lock_irqsave(&pool_rwlock, flags);

	stack = depot_fetch_stack(handle);
	if (WARN_ON(!stack))
		goto out;

	if (refcount_dec_and_test(&stack->count)) {
		/* Unlink stack from the hash table. */
		list_del(&stack->list);

		/* Free stack. */
		depot_free_stack(stack);
	}

out:
	write_unlock_irqrestore(&pool_rwlock, flags);
}
EXPORT_SYMBOL_GPL(stack_depot_put);
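/*
 * Usage sketch for the refcounted variant (hypothetical caller): saving with
 * STACK_DEPOT_FLAG_GET takes a reference that must later be dropped with
 * stack_depot_put(), at which point an unreferenced record goes back onto the
 * freelist.
 *
 *	handle = stack_depot_save_flags(entries, nr, GFP_NOWAIT,
 *					STACK_DEPOT_FLAG_CAN_ALLOC |
 *					STACK_DEPOT_FLAG_GET);
 *	...
 *	stack_depot_put(handle);
 */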
void stack_depot_print(depot_stack_handle_t stack)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(stack, &entries);
	if (nr_entries > 0)
		stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);

int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
			int spaces)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(handle, &entries);
	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
						spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);

depot_stack_handle_t __must_check stack_depot_set_extra_bits(
			depot_stack_handle_t handle, unsigned int extra_bits)
{
	union handle_parts parts = { .handle = handle };

	/* Don't set extra bits on empty handles. */
	if (!handle)
		return 0;

	parts.extra = extra_bits;
	return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);

unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };

	return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
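/*
 * Usage sketch for the extra bits (hypothetical caller): a user with a few
 * bits of per-stack state, at most STACK_DEPOT_EXTRA_BITS wide, can fold them
 * into the handle instead of storing them separately:
 *
 *	handle = stack_depot_set_extra_bits(handle, tag);
 *	...
 *	tag = stack_depot_get_extra_bits(handle);
 *
 * The extra bits travel with the handle and do not change which record it
 * refers to: depot_fetch_stack() only looks at the pool index and offset.
 */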