// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack depot - a stack trace storage that avoids duplication.
 *
 * Internally, stack depot maintains a hash table of unique stacktraces. The
 * stack traces themselves are stored contiguously one after another in a set
 * of separate page allocations.
 *
 * Author: Alexander Potapenko <[email protected]>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on the code by Dmitry Chernenkov.
 */

#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/kmsan.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>

#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
#define DEPOT_STACK_ALIGN 4
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
			       STACK_DEPOT_EXTRA_BITS)
#define DEPOT_POOLS_CAP 8192
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)

/* Compact structure that stores a reference to a stack. */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		u32 offset	: DEPOT_OFFSET_BITS;
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};

struct stack_record {
	struct list_head list;		/* Links in hash table or freelist */
	u32 hash;			/* Hash in hash table */
	u32 size;			/* Number of stored frames */
	union handle_parts handle;
	unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
};

#define DEPOT_STACK_RECORD_SIZE \
	ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
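
/*
 * Illustrative sizing sketch (not used by the code below): with the example
 * values PAGE_SHIFT == 12 (4 KB pages) and STACK_DEPOT_EXTRA_BITS == 5, the
 * 32-bit handle splits as follows:
 *
 *	DEPOT_POOL_SIZE       = 1 << (12 + 2)      = 16 KB per pool
 *	DEPOT_OFFSET_BITS     = 2 + 12 - 4         = 10
 *	DEPOT_POOL_INDEX_BITS = 32 - 10 - 5        = 17
 *	DEPOT_MAX_POOLS       = min(1 << 17, 8192) = 8192
 *
 * which caps the depot at roughly 8192 * 16 KB = 128 MB of stack records.
 * The exact numbers depend on the architecture and configuration.
 */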

static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c

/* Hash table of stored stack records. */
static struct list_head *stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack records. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
static int pools_num;
/* Freelist of stack records within stack_pools. */
static LIST_HEAD(free_stacks);
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool. This flag marks whether this extra
 * pool needs to be allocated. It is false when either an extra pool is
 * already saved in new_pool or the limit on the number of pools is reached.
 */
static bool new_pool_required = true;
/* Lock that protects the variables above. */
static DEFINE_RWLOCK(pool_rwlock);

static int __init disable_stack_depot(char *str)
{
	return kstrtobool(str, &stack_depot_disabled);
}
early_param("stack_depot_disable", disable_stack_depot);

void __init stack_depot_request_early_init(void)
{
	/* Too late to request early init now. */
	WARN_ON(__stack_depot_early_init_passed);

	__stack_depot_early_init_requested = true;
}

/* Initialize list_head's within the hash table. */
static void init_stack_table(unsigned long entries)
{
	unsigned long i;

	for (i = 0; i < entries; i++)
		INIT_LIST_HEAD(&stack_table[i]);
}

/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
	unsigned long entries = 0;

	/* This function must be called only once, from mm_init(). */
	if (WARN_ON(__stack_depot_early_init_passed))
		return 0;
	__stack_depot_early_init_passed = true;

	/*
	 * Print the disabled message even if early init has not been
	 * requested: stack_depot_init() will not print one.
	 */
	if (stack_depot_disabled) {
		pr_info("disabled\n");
		return 0;
	}

	/*
	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
	 * in fuzzing scenarios, which leads to a large number of different
	 * stack traces being stored in stack depot.
	 */
	if (kasan_enabled() && !stack_bucket_number_order)
		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

	/*
	 * Check if early init has been requested after setting
	 * stack_bucket_number_order: stack_depot_init() uses its value.
	 */
	if (!__stack_depot_early_init_requested)
		return 0;

	/*
	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
	 * on the automatic calculations performed by alloc_large_system_hash().
	 */
	if (stack_bucket_number_order)
		entries = 1UL << stack_bucket_number_order;
	pr_info("allocating hash table via alloc_large_system_hash\n");
	stack_table = alloc_large_system_hash("stackdepot",
						sizeof(struct list_head),
						entries,
						STACK_HASH_TABLE_SCALE,
						HASH_EARLY,
						NULL,
						&stack_hash_mask,
						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		return -ENOMEM;
	}
	if (!entries) {
		/*
		 * Obtain the number of entries that was calculated by
		 * alloc_large_system_hash().
		 */
		entries = stack_hash_mask + 1;
	}
	init_stack_table(entries);

	return 0;
}

/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
	static DEFINE_MUTEX(stack_depot_init_mutex);
	unsigned long entries;
	int ret = 0;

	mutex_lock(&stack_depot_init_mutex);

	if (stack_depot_disabled || stack_table)
		goto out_unlock;

	/*
	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
	 * if assigned, and rely on automatic scaling otherwise.
	 */
	if (stack_bucket_number_order) {
		entries = 1UL << stack_bucket_number_order;
	} else {
		int scale = STACK_HASH_TABLE_SCALE;

		entries = nr_free_buffer_pages();
		entries = roundup_pow_of_two(entries);

		if (scale > PAGE_SHIFT)
			entries >>= (scale - PAGE_SHIFT);
		else
			entries <<= (PAGE_SHIFT - scale);
	}

	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
	stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		ret = -ENOMEM;
		goto out_unlock;
	}
	stack_hash_mask = entries - 1;
	init_stack_table(entries);

out_unlock:
	mutex_unlock(&stack_depot_init_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);
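
/*
 * Illustrative initialization sketch for a hypothetical user of stack depot
 * (not part of this file): a subsystem that does not rely on
 * CONFIG_STACKDEPOT_ALWAYS_INIT either calls stack_depot_request_early_init()
 * from early boot code (before mm_init() runs stack_depot_early_init()), or
 * calls stack_depot_init() from its own init path, e.g.:
 *
 *	static int __init my_subsys_init(void)
 *	{
 *		int ret = stack_depot_init();
 *
 *		if (ret)
 *			return ret;
 *		...
 *		return 0;
 *	}
 *
 * Both paths are no-ops once the hash table exists or if the depot was
 * disabled via the stack_depot_disable boot parameter.
 */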

/* Initializes a stack depot pool. */
static void depot_init_pool(void *pool)
{
	int offset;

	lockdep_assert_held_write(&pool_rwlock);

	WARN_ON(!list_empty(&free_stacks));

	/* Initialize handles and link stack records into the freelist. */
	for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
	     offset += DEPOT_STACK_RECORD_SIZE) {
		struct stack_record *stack = pool + offset;

		stack->handle.pool_index = pools_num;
		stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
		stack->handle.extra = 0;

		list_add(&stack->list, &free_stacks);
	}

	/* Save a reference to the pool to be used by depot_fetch_stack(). */
	stack_pools[pools_num] = pool;
	pools_num++;
}

/* Keeps the preallocated memory to be used for a new stack depot pool. */
static void depot_keep_new_pool(void **prealloc)
{
	lockdep_assert_held_write(&pool_rwlock);

	/*
	 * If a new pool is already saved or the maximum number of
	 * pools is reached, do not use the preallocated memory.
	 */
	if (!new_pool_required)
		return;

	/*
	 * Use the preallocated memory for the new pool
	 * as long as we do not exceed the maximum number of pools.
	 */
	if (pools_num < DEPOT_MAX_POOLS) {
		new_pool = *prealloc;
		*prealloc = NULL;
	}

	/*
	 * At this point, either a new pool is kept or the maximum
	 * number of pools is reached. In either case, take note that
	 * keeping another pool is not required.
	 */
	new_pool_required = false;
}

/* Updates references to the current and the next stack depot pools. */
static bool depot_update_pools(void **prealloc)
{
	lockdep_assert_held_write(&pool_rwlock);

	/* Check if we still have objects in the freelist. */
	if (!list_empty(&free_stacks))
		goto out_keep_prealloc;

	/* Check if we have a new pool saved and use it. */
	if (new_pool) {
		depot_init_pool(new_pool);
		new_pool = NULL;

		/* Take note that we might need a new new_pool. */
		if (pools_num < DEPOT_MAX_POOLS)
			new_pool_required = true;

		/* Try keeping the preallocated memory for new_pool. */
		goto out_keep_prealloc;
	}

	/* Bail out if we reached the pool limit. */
	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
		WARN_ONCE(1, "Stack depot reached limit capacity");
		return false;
	}

	/* Check if we have preallocated memory and use it. */
	if (*prealloc) {
		depot_init_pool(*prealloc);
		*prealloc = NULL;
		return true;
	}

	return false;

out_keep_prealloc:
	/* Keep the preallocated memory for a new pool if required. */
	if (*prealloc)
		depot_keep_new_pool(prealloc);
	return true;
}

/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
	struct stack_record *stack;

	lockdep_assert_held_write(&pool_rwlock);

	/* Update current and new pools if required and possible. */
	if (!depot_update_pools(prealloc))
		return NULL;

	/* Check if we have a stack record to save the stack trace. */
	if (list_empty(&free_stacks))
		return NULL;

	/* Get and unlink the first entry from the freelist. */
	stack = list_first_entry(&free_stacks, struct stack_record, list);
	list_del(&stack->list);

	/* Limit the number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
	if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
		size = CONFIG_STACKDEPOT_MAX_FRAMES;

	/* Save the stack trace. */
	stack->hash = hash;
	stack->size = size;
	/* stack->handle is already filled in by depot_init_pool(). */
	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));

	/*
	 * Let KMSAN know the stored stack record is initialized. This shall
	 * prevent false positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);

	return stack;
}

static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };
	void *pool;
	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
	struct stack_record *stack;

	lockdep_assert_held_read(&pool_rwlock);

	if (parts.pool_index > pools_num) {
		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
		     parts.pool_index, pools_num, handle);
		return NULL;
	}

	pool = stack_pools[parts.pool_index];
	if (!pool)
		return NULL;

	stack = pool + offset;
	return stack;
}

/* Calculates the hash for a stack. */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
	return jhash2((u32 *)entries,
		      array_size(size, sizeof(*entries)) / sizeof(u32),
		      STACK_HASH_SEED);
}

/*
 * Non-instrumented version of memcmp().
 * Does not check the lexicographical order, only equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	for ( ; n-- ; u1++, u2++) {
		if (*u1 != *u2)
			return 1;
	}
	return 0;
}

/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct list_head *bucket,
					      unsigned long *entries, int size,
					      u32 hash)
{
	struct list_head *pos;
	struct stack_record *found;

	lockdep_assert_held(&pool_rwlock);

	list_for_each(pos, bucket) {
		found = list_entry(pos, struct stack_record, list);
		if (found->hash == hash &&
		    found->size == size &&
		    !stackdepot_memcmp(entries, found->entries, size))
			return found;
	}
	return NULL;
}

depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
					    unsigned int nr_entries,
					    gfp_t alloc_flags,
					    depot_flags_t depot_flags)
{
	struct list_head *bucket;
	struct stack_record *found = NULL;
	depot_stack_handle_t handle = 0;
	struct page *page = NULL;
	void *prealloc = NULL;
	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
	bool need_alloc = false;
	unsigned long flags;
	u32 hash;

	if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
		return 0;

	/*
	 * If this stack trace is from an interrupt, including anything before
	 * interrupt entry usually leads to unbounded stack depot growth.
	 *
	 * Since use of filter_irq_stacks() is a requirement to ensure stack
	 * depot can efficiently deduplicate interrupt stacks, always call
	 * filter_irq_stacks() here to simplify all callers' use of stack depot.
	 */
	nr_entries = filter_irq_stacks(entries, nr_entries);

	if (unlikely(nr_entries == 0) || stack_depot_disabled)
		return 0;

	hash = hash_stack(entries, nr_entries);
	bucket = &stack_table[hash & stack_hash_mask];

	read_lock_irqsave(&pool_rwlock, flags);

	/* Fast path: look the stack trace up without full locking. */
	found = find_stack(bucket, entries, nr_entries, hash);
	if (found) {
		read_unlock_irqrestore(&pool_rwlock, flags);
		goto exit;
	}

	/* Take note if another stack pool needs to be allocated. */
	if (new_pool_required)
		need_alloc = true;

	read_unlock_irqrestore(&pool_rwlock, flags);

	/*
	 * Allocate memory for a new pool if required now:
	 * we won't be able to do that under the lock.
	 */
	if (unlikely(can_alloc && need_alloc)) {
		/*
		 * Zero out zone modifiers, as we don't have specific zone
		 * requirements. Keep the flags related to allocation in atomic
		 * contexts and I/O.
		 */
		alloc_flags &= ~GFP_ZONEMASK;
		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
		alloc_flags |= __GFP_NOWARN;
		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
		if (page)
			prealloc = page_address(page);
	}

	write_lock_irqsave(&pool_rwlock, flags);

	found = find_stack(bucket, entries, nr_entries, hash);
	if (!found) {
		struct stack_record *new =
			depot_alloc_stack(entries, nr_entries, hash, &prealloc);

		if (new) {
			list_add(&new->list, bucket);
			found = new;
		}
	} else if (prealloc) {
		/*
		 * Stack depot already contains this stack trace, but let's
		 * keep the preallocated memory for future use.
		 */
		depot_keep_new_pool(&prealloc);
	}

	write_unlock_irqrestore(&pool_rwlock, flags);
exit:
	if (prealloc) {
		/* Stack depot didn't use this memory, free it. */
		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
	}
	if (found)
		handle = found->handle.handle;
	return handle;
}
EXPORT_SYMBOL_GPL(stack_depot_save_flags);

depot_stack_handle_t stack_depot_save(unsigned long *entries,
				      unsigned int nr_entries,
				      gfp_t alloc_flags)
{
	return stack_depot_save_flags(entries, nr_entries, alloc_flags,
				      STACK_DEPOT_FLAG_CAN_ALLOC);
}
EXPORT_SYMBOL_GPL(stack_depot_save);
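
/*
 * Illustrative caller-side sketch (hypothetical, not part of this file):
 * capture the current stack with stack_trace_save(), deduplicate it via
 * stack_depot_save(), and keep only the 32-bit handle:
 *
 *	unsigned long entries[16];
 *	unsigned int nr_entries;
 *	depot_stack_handle_t handle;
 *
 *	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
 *	handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
 *	...
 *	if (handle)
 *		stack_depot_print(handle);
 *
 * A zero handle means the trace could not be stored (e.g. the depot is
 * disabled or pool allocation failed) and must be treated as "no trace".
 */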

unsigned int stack_depot_fetch(depot_stack_handle_t handle,
			       unsigned long **entries)
{
	struct stack_record *stack;
	unsigned long flags;

	*entries = NULL;
	/*
	 * Let KMSAN know *entries is initialized. This shall prevent false
	 * positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(entries, sizeof(*entries));

	if (!handle || stack_depot_disabled)
		return 0;

	read_lock_irqsave(&pool_rwlock, flags);

	stack = depot_fetch_stack(handle);

	read_unlock_irqrestore(&pool_rwlock, flags);

	/* depot_fetch_stack() returns NULL for corrupted handles. */
	if (!stack)
		return 0;

	*entries = stack->entries;
	return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);

void stack_depot_print(depot_stack_handle_t stack)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(stack, &entries);
	if (nr_entries > 0)
		stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);

int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
			int spaces)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(handle, &entries);
	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
						spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);

depot_stack_handle_t __must_check stack_depot_set_extra_bits(
			depot_stack_handle_t handle, unsigned int extra_bits)
{
	union handle_parts parts = { .handle = handle };

	/* Don't set extra bits on empty handles. */
	if (!handle)
		return 0;

	parts.extra = extra_bits;
	return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);

unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };

	return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
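
/*
 * Illustrative round-trip for the extra bits (hypothetical caller, not part
 * of this file): a caller can stash a small tag in the otherwise unused
 * handle bits and recover it later:
 *
 *	handle = stack_depot_set_extra_bits(handle, tag);
 *	...
 *	tag = stack_depot_get_extra_bits(handle);
 *
 * The tag must fit in STACK_DEPOT_EXTRA_BITS bits; wider values are silently
 * truncated by the bitfield assignment.
 */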