1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Debugging module statistics. 4 * 5 * Copyright (C) 2023 Luis Chamberlain <[email protected]> 6 */ 7 8 #include <linux/module.h> 9 #include <linux/string.h> 10 #include <linux/printk.h> 11 #include <linux/slab.h> 12 #include <linux/list.h> 13 #include <linux/debugfs.h> 14 #include <linux/rculist.h> 15 #include <linux/math.h> 16 17 #include "internal.h" 18 19 /** 20 * DOC: module debugging statistics overview 21 * 22 * Enabling CONFIG_MODULE_STATS enables module debugging statistics which 23 * are useful to monitor and root cause memory pressure issues with module 24 * loading. These statistics are useful to allow us to improve production 25 * workloads. 26 * 27 * The current module debugging statistics supported help keep track of module 28 * loading failures to enable improvements either for kernel module auto-loading 29 * usage (request_module()) or interactions with userspace. Statistics are 30 * provided to track all possible failures in the finit_module() path and memory 31 * wasted in this process space. Each of the failure counters are associated 32 * to a type of module loading failure which is known to incur a certain amount 33 * of memory allocation loss. In the worst case loading a module will fail after 34 * a 3 step memory allocation process: 35 * 36 * a) memory allocated with kernel_read_file_from_fd() 37 * b) module decompression processes the file read from 38 * kernel_read_file_from_fd(), and vmap() is used to map 39 * the decompressed module to a new local buffer which represents 40 * a copy of the decompressed module passed from userspace. The buffer 41 * from kernel_read_file_from_fd() is freed right away. 42 * c) layout_and_allocate() allocates space for the final resting 43 * place where we would keep the module if it were to be processed 44 * successfully. 45 * 46 * If a failure occurs after these three different allocations only one 47 * counter will be incremented with the summation of the allocated bytes freed 48 * incurred during this failure. Likewise, if module loading failed only after 49 * step b) a separate counter is used and incremented for the bytes freed and 50 * not used during both of those allocations. 51 * 52 * Virtual memory space can be limited, for example on x86 virtual memory size 53 * defaults to 128 MiB. We should strive to limit and avoid wasting virtual 54 * memory allocations when possible. These module debugging statistics help 55 * to evaluate how much memory is being wasted on bootup due to module loading 56 * failures. 57 * 58 * All counters are designed to be incremental. Atomic counters are used so to 59 * remain simple and avoid delays and deadlocks. 60 */ 61 62 /** 63 * DOC: dup_failed_modules - tracks duplicate failed modules 64 * 65 * Linked list of modules which failed to be loaded because an already existing 66 * module with the same name was already being processed or already loaded. 67 * The finit_module() system call incurs heavy virtual memory allocations. In 68 * the worst case an finit_module() system call can end up allocating virtual 69 * memory 3 times: 70 * 71 * 1) kernel_read_file_from_fd() call uses vmalloc() 72 * 2) optional module decompression uses vmap() 73 * 3) layout_and allocate() can use vzalloc() or an arch specific variation of 74 * vmalloc to deal with ELF sections requiring special permissions 75 * 76 * In practice on a typical boot today most finit_module() calls fail due to 77 * the module with the same name already being loaded or about to be processed. 78 * All virtual memory allocated to these failed modules will be freed with 79 * no functional use. 80 * 81 * To help with this the dup_failed_modules allows us to track modules which 82 * failed to load due to the fact that a module was already loaded or being 83 * processed. There are only two points at which we can fail such calls, 84 * we list them below along with the number of virtual memory allocation 85 * calls: 86 * 87 * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before 88 * layout_and_allocate(). 89 * - with module decompression: 2 virtual memory allocation calls 90 * - without module decompression: 1 virtual memory allocation calls 91 * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module() 92 * - with module decompression 3 virtual memory allocation calls 93 * - without module decompression 2 virtual memory allocation calls 94 * 95 * We should strive to get this list to be as small as possible. If this list 96 * is not empty it is a reflection of possible work or optimizations possible 97 * either in-kernel or in userspace. 98 */ 99 static LIST_HEAD(dup_failed_modules); 100 101 /** 102 * DOC: module statistics debugfs counters 103 * 104 * The total amount of wasted virtual memory allocation space during module 105 * loading can be computed by adding the total from the summation: 106 * 107 * * @invalid_kread_bytes + 108 * @invalid_decompress_bytes + 109 * @invalid_becoming_bytes + 110 * @invalid_mod_bytes 111 * 112 * The following debugfs counters are available to inspect module loading 113 * failures: 114 * 115 * * total_mod_size: total bytes ever used by all modules we've dealt with on 116 * this system 117 * * total_text_size: total bytes of the .text and .init.text ELF section 118 * sizes we've dealt with on this system 119 * * invalid_kread_bytes: bytes allocated and then freed on failures which 120 * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd() 121 * uses vmalloc(). These should typically not happen unless your system is 122 * under memory pressure. 123 * * invalid_decompress_bytes: number of bytes allocated and freed due to 124 * memory allocations in the module decompression path that use vmap(). 125 * These typically should not happen unless your system is under memory 126 * pressure. 127 * * invalid_becoming_bytes: total number of bytes allocated and freed used 128 * used to read the kernel module userspace wants us to read before we 129 * promote it to be processed to be added to our @modules linked list. These 130 * failures can happen if we had a check in between a successful kernel_read_file_from_fd() 131 * call and right before we allocate the our private memory for the module 132 * which would be kept if the module is successfully loaded. The most common 133 * reason for this failure is when userspace is racing to load a module 134 * which it does not yet see loaded. The first module to succeed in 135 * add_unformed_module() will add a module to our &modules list and 136 * subsequent loads of modules with the same name will error out at the 137 * end of early_mod_check(). The check for module_patient_check_exists() 138 * at the end of early_mod_check() prevents duplicate allocations 139 * on layout_and_allocate() for modules already being processed. These 140 * duplicate failed modules are non-fatal, however they typically are 141 * indicative of userspace not seeing a module in userspace loaded yet and 142 * unnecessarily trying to load a module before the kernel even has a chance 143 * to begin to process prior requests. Although duplicate failures can be 144 * non-fatal, we should try to reduce vmalloc() pressure proactively, so 145 * ideally after boot this will be close to as 0 as possible. If module 146 * decompression was used we also add to this counter the cost of the 147 * initial kernel_read_file_from_fd() of the compressed module. If module 148 * decompression was not used the value represents the total allocated and 149 * freed bytes in kernel_read_file_from_fd() calls for these type of 150 * failures. These failures can occur because: 151 * 152 * * module_sig_check() - module signature checks 153 * * elf_validity_cache_copy() - some ELF validation issue 154 * * early_mod_check(): 155 * 156 * * blacklisting 157 * * failed to rewrite section headers 158 * * version magic 159 * * live patch requirements didn't check out 160 * * the module was detected as being already present 161 * 162 * * invalid_mod_bytes: these are the total number of bytes allocated and 163 * freed due to failures after we did all the sanity checks of the module 164 * which userspace passed to us and after our first check that the module 165 * is unique. A module can still fail to load if we detect the module is 166 * loaded after we allocate space for it with layout_and_allocate(), we do 167 * this check right before processing the module as live and run its 168 * initialization routines. Note that you have a failure of this type it 169 * also means the respective kernel_read_file_from_fd() memory space was 170 * also freed and not used, and so we increment this counter with twice 171 * the size of the module. Additionally if you used module decompression 172 * the size of the compressed module is also added to this counter. 173 * 174 * * modcount: how many modules we've loaded in our kernel life time 175 * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd() 176 * * failed_decompress: how many failed module decompression attempts we've had. 177 * These really should not happen unless your compression / decompression 178 * might be broken. 179 * * failed_becoming: how many modules failed after we kernel_read_file_from_fd() 180 * it and before we allocate memory for it with layout_and_allocate(). This 181 * counter is never incremented if you manage to validate the module and 182 * call layout_and_allocate() for it. 183 * * failed_load_modules: how many modules failed once we've allocated our 184 * private space for our module using layout_and_allocate(). These failures 185 * should hopefully mostly be dealt with already. Races in theory could 186 * still exist here, but it would just mean the kernel had started processing 187 * two threads concurrently up to early_mod_check() and one thread won. 188 * These failures are good signs the kernel or userspace is doing something 189 * seriously stupid or that could be improved. We should strive to fix these, 190 * but it is perhaps not easy to fix them. A recent example are the modules 191 * requests incurred for frequency modules, a separate module request was 192 * being issued for each CPU on a system. 193 */ 194 195 atomic_long_t total_mod_size; 196 atomic_long_t total_text_size; 197 atomic_long_t invalid_kread_bytes; 198 atomic_long_t invalid_decompress_bytes; 199 static atomic_long_t invalid_becoming_bytes; 200 static atomic_long_t invalid_mod_bytes; 201 atomic_t modcount; 202 atomic_t failed_kreads; 203 atomic_t failed_decompress; 204 static atomic_t failed_becoming; 205 static atomic_t failed_load_modules; 206 207 static const char *mod_fail_to_str(struct mod_fail_load *mod_fail) 208 { 209 if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) && 210 test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) 211 return "Becoming & Load"; 212 if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask)) 213 return "Becoming"; 214 if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) 215 return "Load"; 216 return "Bug-on-stats"; 217 } 218 219 void mod_stat_bump_invalid(struct load_info *info, int flags) 220 { 221 atomic_long_add(info->len * 2, &invalid_mod_bytes); 222 atomic_inc(&failed_load_modules); 223 #if defined(CONFIG_MODULE_DECOMPRESS) 224 if (flags & MODULE_INIT_COMPRESSED_FILE) 225 atomic_long_add(info->compressed_len, &invalid_mod_byte); 226 #endif 227 } 228 229 void mod_stat_bump_becoming(struct load_info *info, int flags) 230 { 231 atomic_inc(&failed_becoming); 232 atomic_long_add(info->len, &invalid_becoming_bytes); 233 #if defined(CONFIG_MODULE_DECOMPRESS) 234 if (flags & MODULE_INIT_COMPRESSED_FILE) 235 atomic_long_add(info->compressed_len, &invalid_becoming_bytes); 236 #endif 237 } 238 239 int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason) 240 { 241 struct mod_fail_load *mod_fail; 242 243 list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list, 244 lockdep_is_held(&module_mutex)) { 245 if (!strcmp(mod_fail->name, name)) { 246 atomic_long_inc(&mod_fail->count); 247 __set_bit(reason, &mod_fail->dup_fail_mask); 248 goto out; 249 } 250 } 251 252 mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL); 253 if (!mod_fail) 254 return -ENOMEM; 255 memcpy(mod_fail->name, name, strlen(name)); 256 __set_bit(reason, &mod_fail->dup_fail_mask); 257 atomic_long_inc(&mod_fail->count); 258 list_add_rcu(&mod_fail->list, &dup_failed_modules); 259 out: 260 return 0; 261 } 262 263 /* 264 * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the 265 * 112 module prints within 8k. 266 * 267 * 1024 + (64*112) = 8k 268 */ 269 #define MAX_PREAMBLE 1024 270 #define MAX_FAILED_MOD_PRINT 112 271 #define MAX_BYTES_PER_MOD 64 272 static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, 273 size_t count, loff_t *ppos) 274 { 275 struct mod_fail_load *mod_fail; 276 unsigned int len, size, count_failed = 0; 277 char *buf; 278 u32 live_mod_count, fkreads, fdecompress, fbecoming, floads; 279 u64 total_size, text_size, ikread_bytes, ibecoming_bytes, idecompress_bytes, imod_bytes, 280 total_virtual_lost; 281 282 live_mod_count = atomic_read(&modcount); 283 fkreads = atomic_read(&failed_kreads); 284 fdecompress = atomic_read(&failed_decompress); 285 fbecoming = atomic_read(&failed_becoming); 286 floads = atomic_read(&failed_load_modules); 287 288 total_size = atomic64_read(&total_mod_size); 289 text_size = atomic64_read(&total_text_size); 290 ikread_bytes = atomic64_read(&invalid_kread_bytes); 291 idecompress_bytes = atomic64_read(&invalid_decompress_bytes); 292 ibecoming_bytes = atomic64_read(&invalid_becoming_bytes); 293 imod_bytes = atomic64_read(&invalid_mod_bytes); 294 295 total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes; 296 297 size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming), 298 (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD; 299 buf = kzalloc(size, GFP_KERNEL); 300 if (buf == NULL) 301 return -ENOMEM; 302 303 /* The beginning of our debug preamble */ 304 len = scnprintf(buf + 0, size - len, "%25s\t%u\n", "Mods ever loaded", live_mod_count); 305 306 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads); 307 308 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress", 309 fdecompress); 310 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming); 311 312 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads); 313 314 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total module size", total_size); 315 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total mod text size", text_size); 316 317 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kread bytes", ikread_bytes); 318 319 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed decompress bytes", 320 idecompress_bytes); 321 322 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed becoming bytes", ibecoming_bytes); 323 324 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kmod bytes", imod_bytes); 325 326 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Virtual mem wasted bytes", total_virtual_lost); 327 328 if (live_mod_count && total_size) { 329 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod size", 330 DIV_ROUND_UP(total_size, live_mod_count)); 331 } 332 333 if (live_mod_count && text_size) { 334 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod text size", 335 DIV_ROUND_UP(text_size, live_mod_count)); 336 } 337 338 /* 339 * We use WARN_ON_ONCE() for the counters to ensure we always have parity 340 * for keeping tabs on a type of failure with one type of byte counter. 341 * The counters for imod_bytes does not increase for fkreads failures 342 * for example, and so on. 343 */ 344 345 WARN_ON_ONCE(ikread_bytes && !fkreads); 346 if (fkreads && ikread_bytes) { 347 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail kread bytes", 348 DIV_ROUND_UP(ikread_bytes, fkreads)); 349 } 350 351 WARN_ON_ONCE(ibecoming_bytes && !fbecoming); 352 if (fbecoming && ibecoming_bytes) { 353 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail becoming bytes", 354 DIV_ROUND_UP(ibecoming_bytes, fbecoming)); 355 } 356 357 WARN_ON_ONCE(idecompress_bytes && !fdecompress); 358 if (fdecompress && idecompress_bytes) { 359 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail decomp bytes", 360 DIV_ROUND_UP(idecompress_bytes, fdecompress)); 361 } 362 363 WARN_ON_ONCE(imod_bytes && !floads); 364 if (floads && imod_bytes) { 365 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average fail load bytes", 366 DIV_ROUND_UP(imod_bytes, floads)); 367 } 368 369 /* End of our debug preamble header. */ 370 371 /* Catch when we've gone beyond our expected preamble */ 372 WARN_ON_ONCE(len >= MAX_PREAMBLE); 373 374 if (list_empty(&dup_failed_modules)) 375 goto out; 376 377 len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n"); 378 len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n", 379 "Module-name", "How-many-times", "Reason"); 380 mutex_lock(&module_mutex); 381 382 383 list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) { 384 if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT)) 385 goto out_unlock; 386 len += scnprintf(buf + len, size - len, "%25s\t%15llu\t%25s\n", mod_fail->name, 387 atomic64_read(&mod_fail->count), mod_fail_to_str(mod_fail)); 388 } 389 out_unlock: 390 mutex_unlock(&module_mutex); 391 out: 392 kfree(buf); 393 return simple_read_from_buffer(user_buf, count, ppos, buf, len); 394 } 395 #undef MAX_PREAMBLE 396 #undef MAX_FAILED_MOD_PRINT 397 #undef MAX_BYTES_PER_MOD 398 399 static const struct file_operations fops_mod_stats = { 400 .read = read_file_mod_stats, 401 .open = simple_open, 402 .owner = THIS_MODULE, 403 .llseek = default_llseek, 404 }; 405 406 #define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter) 407 #define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name) 408 static int __init module_stats_init(void) 409 { 410 mod_debug_add_ulong(total_mod_size); 411 mod_debug_add_ulong(total_text_size); 412 mod_debug_add_ulong(invalid_kread_bytes); 413 mod_debug_add_ulong(invalid_decompress_bytes); 414 mod_debug_add_ulong(invalid_becoming_bytes); 415 mod_debug_add_ulong(invalid_mod_bytes); 416 417 mod_debug_add_atomic(modcount); 418 mod_debug_add_atomic(failed_kreads); 419 mod_debug_add_atomic(failed_decompress); 420 mod_debug_add_atomic(failed_becoming); 421 mod_debug_add_atomic(failed_load_modules); 422 423 debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats); 424 425 return 0; 426 } 427 #undef mod_debug_add_ulong 428 #undef mod_debug_add_atomic 429 module_init(module_stats_init); 430