xref: /linux-6.15/kernel/module/stats.c (revision df3e764d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Debugging module statistics.
4  *
5  * Copyright (C) 2023 Luis Chamberlain <[email protected]>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/string.h>
10 #include <linux/printk.h>
11 #include <linux/slab.h>
12 #include <linux/list.h>
13 #include <linux/debugfs.h>
14 #include <linux/rculist.h>
15 #include <linux/math.h>
16 
17 #include "internal.h"
18 
19 /**
20  * DOC: module debugging statistics overview
21  *
22  * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
23  * are useful to monitor and root cause memory pressure issues with module
24  * loading. These statistics are useful to allow us to improve production
25  * workloads.
26  *
27  * The current module debugging statistics supported help keep track of module
28  * loading failures to enable improvements either for kernel module auto-loading
29  * usage (request_module()) or interactions with userspace. Statistics are
30  * provided to track all possible failures in the finit_module() path and memory
31  * wasted in this process space.  Each of the failure counters are associated
32  * to a type of module loading failure which is known to incur a certain amount
33  * of memory allocation loss. In the worst case loading a module will fail after
34  * a 3 step memory allocation process:
35  *
36  *   a) memory allocated with kernel_read_file_from_fd()
37  *   b) module decompression processes the file read from
38  *      kernel_read_file_from_fd(), and vmap() is used to map
39  *      the decompressed module to a new local buffer which represents
40  *      a copy of the decompressed module passed from userspace. The buffer
41  *      from kernel_read_file_from_fd() is freed right away.
42  *   c) layout_and_allocate() allocates space for the final resting
43  *      place where we would keep the module if it were to be processed
44  *      successfully.
45  *
46  * If a failure occurs after these three different allocations only one
47  * counter will be incremented with the summation of the allocated bytes freed
48  * incurred during this failure. Likewise, if module loading failed only after
49  * step b) a separate counter is used and incremented for the bytes freed and
50  * not used during both of those allocations.
51  *
52  * Virtual memory space can be limited, for example on x86 virtual memory size
53  * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
54  * memory allocations when possible. These module debugging statistics help
55  * to evaluate how much memory is being wasted on bootup due to module loading
56  * failures.
57  *
58  * All counters are designed to be incremental. Atomic counters are used so to
59  * remain simple and avoid delays and deadlocks.
60  */
61 
62 /**
63  * DOC: dup_failed_modules - tracks duplicate failed modules
64  *
65  * Linked list of modules which failed to be loaded because an already existing
66  * module with the same name was already being processed or already loaded.
67  * The finit_module() system call incurs heavy virtual memory allocations. In
68  * the worst case an finit_module() system call can end up allocating virtual
69  * memory 3 times:
70  *
71  *   1) kernel_read_file_from_fd() call uses vmalloc()
72  *   2) optional module decompression uses vmap()
73  *   3) layout_and_allocate() can use vzalloc() or an arch specific variation of
74  *      vmalloc to deal with ELF sections requiring special permissions
75  *
76  * In practice on a typical boot today most finit_module() calls fail due to
77  * the module with the same name already being loaded or about to be processed.
78  * All virtual memory allocated to these failed modules will be freed with
79  * no functional use.
80  *
81  * To help with this the dup_failed_modules allows us to track modules which
82  * failed to load due to the fact that a module was already loaded or being
83  * processed.  There are only two points at which we can fail such calls,
84  * we list them below along with the number of virtual memory allocation
85  * calls:
86  *
87  *   a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
88  *	layout_and_allocate(). This does not yet happen.
89  *	- with module decompression: 2 virtual memory allocation calls
90  *	- without module decompression: 1 virtual memory allocation calls
91  *   b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
92  *   	- with module decompression 3 virtual memory allocation calls
93  *   	- without module decompression 2 virtual memory allocation calls
94  *
95  * We should strive to get this list to be as small as possible. If this list
96  * is not empty it is a reflection of possible work or optimizations possible
97  * either in-kernel or in userspace.
98  */
/*
 * Writers add entries with list_add_rcu() while holding module_mutex (see
 * the lockdep_is_held() check in try_add_failed_module()).
 */
static LIST_HEAD(dup_failed_modules);
100 
101 /**
102  * DOC: module statistics debugfs counters
103  *
104  * The total amount of wasted virtual memory allocation space during module
105  * loading can be computed by adding the total from the summation:
106  *
107  *   * @invalid_kread_bytes +
108  *     @invalid_decompress_bytes +
109  *     @invalid_becoming_bytes +
110  *     @invalid_mod_bytes
111  *
112  * The following debugfs counters are available to inspect module loading
113  * failures:
114  *
115  *   * total_mod_size: total bytes ever used by all modules we've dealt with on
116  *     this system
117  *   * total_text_size: total bytes of the .text and .init.text ELF section
118  *     sizes we've dealt with on this system
119  *   * invalid_kread_bytes: bytes allocated and then freed on failures which
120  *     happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
121  *     uses vmalloc(). These should typically not happen unless your system is
122  *     under memory pressure.
123  *   * invalid_decompress_bytes: number of bytes allocated and freed due to
124  *     memory allocations in the module decompression path that use vmap().
125  *     These typically should not happen unless your system is under memory
126  *     pressure.
127  *   * invalid_becoming_bytes: total number of bytes allocated and freed,
128  *     used to read the kernel module userspace wants us to read before we
129  *     promote it to be processed to be added to our @modules linked list.
130  *     These failures could in theory happen if we had a check in
131  *     between a successful kernel_read_file_from_fd()
132  *     call and right before we allocate our private memory for the module
133  *     which would be kept if the module is successfully loaded. The most common
134  *     reason for this failure is when userspace is racing to load a module
135  *     which it does not yet see loaded. The first module to succeed in
136  *     add_unformed_module() will add a module to our &modules list and
137  *     subsequent loads of modules with the same name will error out at the
138  *     end of early_mod_check(). A check for module_patient_check_exists()
139  *     at the end of early_mod_check() could be added to prevent duplicate allocations
140  *     on layout_and_allocate() for modules already being processed. These
141  *     duplicate failed modules are non-fatal, however they typically are
142  *     indicative of userspace not seeing a module in userspace loaded yet and
143  *     unnecessarily trying to load a module before the kernel even has a chance
144  *     to begin to process prior requests. Although duplicate failures can be
145  *     non-fatal, we should try to reduce vmalloc() pressure proactively, so
146  *     ideally after boot this will be as close to 0 as possible.  If module
147  *     decompression was used we also add to this counter the cost of the
148  *     initial kernel_read_file_from_fd() of the compressed module. If module
149  *     decompression was not used the value represents the total allocated and
150  *     freed bytes in kernel_read_file_from_fd() calls for these type of
151  *     failures. These failures can occur because:
152  *
153  *    * module_sig_check() - module signature checks
154  *    * elf_validity_cache_copy() - some ELF validation issue
155  *    * early_mod_check():
156  *
157  *      * blacklisting
158  *      * failed to rewrite section headers
159  *      * version magic
160  *      * live patch requirements didn't check out
161  *      * the module was detected as being already present
162  *
163  *   * invalid_mod_bytes: these are the total number of bytes allocated and
164  *     freed due to failures after we did all the sanity checks of the module
165  *     which userspace passed to us and after our first check that the module
166  *     is unique.  A module can still fail to load if we detect the module is
167  *     loaded after we allocate space for it with layout_and_allocate(), we do
168  *     this check right before processing the module as live and run its
169  *     initialization routines. Note that if you have a failure of this type it
170  *     also means the respective kernel_read_file_from_fd() memory space was
171  *     also freed and not used, and so we increment this counter with twice
172  *     the size of the module. Additionally if you used module decompression
173  *     the size of the compressed module is also added to this counter.
174  *
175  *  * modcount: how many modules we've loaded in our kernel life time
176  *  * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
177  *  * failed_decompress: how many failed module decompression attempts we've had.
178  *    These really should not happen unless your compression / decompression
179  *    might be broken.
180  *  * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
181  *    it and before we allocate memory for it with layout_and_allocate(). This
182  *    counter is never incremented if you manage to validate the module and
183  *    call layout_and_allocate() for it.
184  *  * failed_load_modules: how many modules failed once we've allocated our
185  *    private space for our module using layout_and_allocate(). These failures
186  *    should hopefully mostly be dealt with already. Races in theory could
187  *    still exist here, but it would just mean the kernel had started processing
188  *    two threads concurrently up to early_mod_check() and one thread won.
189  *    These failures are good signs the kernel or userspace is doing something
190  *    seriously stupid or that could be improved. We should strive to fix these,
191  *    but it is perhaps not easy to fix them. A recent example are the modules
192  *    requests incurred for frequency modules, a separate module request was
193  *    being issued for each CPU on a system.
194  */
195 
/*
 * Counters exposed read-only via debugfs in module_stats_init(). The
 * non-static ones are incremented from other files in this directory
 * (presumably declared in internal.h — confirm there); the static ones
 * are only touched through the helpers below in this file.
 */
atomic_long_t total_mod_size;
atomic_long_t total_text_size;
atomic_long_t invalid_kread_bytes;
atomic_long_t invalid_decompress_bytes;
/* Bumped only via mod_stat_bump_becoming() / mod_stat_bump_invalid() */
static atomic_long_t invalid_becoming_bytes;
static atomic_long_t invalid_mod_bytes;
atomic_t modcount;
atomic_t failed_kreads;
atomic_t failed_decompress;
static atomic_t failed_becoming;
static atomic_t failed_load_modules;
207 
208 static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
209 {
210 	if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
211 	    test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
212 		return "Becoming & Load";
213 	if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
214 		return "Becoming";
215 	if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
216 		return "Load";
217 	return "Bug-on-stats";
218 }
219 
220 void mod_stat_bump_invalid(struct load_info *info, int flags)
221 {
222 	atomic_long_add(info->len * 2, &invalid_mod_bytes);
223 	atomic_inc(&failed_load_modules);
224 #if defined(CONFIG_MODULE_DECOMPRESS)
225 	if (flags & MODULE_INIT_COMPRESSED_FILE)
226 		atomic_long_add(info->compressed_len, &invalid_mod_byte);
227 #endif
228 }
229 
230 void mod_stat_bump_becoming(struct load_info *info, int flags)
231 {
232 	atomic_inc(&failed_becoming);
233 	atomic_long_add(info->len, &invalid_becoming_bytes);
234 #if defined(CONFIG_MODULE_DECOMPRESS)
235 	if (flags & MODULE_INIT_COMPRESSED_FILE)
236 		atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
237 #endif
238 }
239 
240 int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
241 {
242 	struct mod_fail_load *mod_fail;
243 
244 	list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
245 				lockdep_is_held(&module_mutex)) {
246 		if (!strcmp(mod_fail->name, name)) {
247 			atomic_long_inc(&mod_fail->count);
248 			__set_bit(reason, &mod_fail->dup_fail_mask);
249 			goto out;
250 		}
251 	}
252 
253 	mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
254 	if (!mod_fail)
255 		return -ENOMEM;
256 	memcpy(mod_fail->name, name, strlen(name));
257 	__set_bit(reason, &mod_fail->dup_fail_mask);
258 	atomic_long_inc(&mod_fail->count);
259 	list_add_rcu(&mod_fail->list, &dup_failed_modules);
260 out:
261 	return 0;
262 }
263 
264 /*
265  * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
266  * 112 module prints within 8k.
267  *
268  * 1024 + (64*112) = 8k
269  */
270 #define MAX_PREAMBLE 1024
271 #define MAX_FAILED_MOD_PRINT 112
272 #define MAX_BYTES_PER_MOD 64
273 static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
274 				   size_t count, loff_t *ppos)
275 {
276 	struct mod_fail_load *mod_fail;
277 	unsigned int len, size, count_failed = 0;
278 	char *buf;
279 	u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
280 	u64 total_size, text_size, ikread_bytes, ibecoming_bytes, idecompress_bytes, imod_bytes,
281 	    total_virtual_lost;
282 
283 	live_mod_count = atomic_read(&modcount);
284 	fkreads = atomic_read(&failed_kreads);
285 	fdecompress = atomic_read(&failed_decompress);
286 	fbecoming = atomic_read(&failed_becoming);
287 	floads = atomic_read(&failed_load_modules);
288 
289 	total_size = atomic64_read(&total_mod_size);
290 	text_size = atomic64_read(&total_text_size);
291 	ikread_bytes = atomic64_read(&invalid_kread_bytes);
292 	idecompress_bytes = atomic64_read(&invalid_decompress_bytes);
293 	ibecoming_bytes = atomic64_read(&invalid_becoming_bytes);
294 	imod_bytes = atomic64_read(&invalid_mod_bytes);
295 
296 	total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
297 
298 	size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
299 				  (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
300 	buf = kzalloc(size, GFP_KERNEL);
301 	if (buf == NULL)
302 		return -ENOMEM;
303 
304 	/* The beginning of our debug preamble */
305 	len = scnprintf(buf + 0, size - len, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
306 
307 	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
308 
309 	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
310 			 fdecompress);
311 	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
312 
313 	len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
314 
315 	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total module size", total_size);
316 	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total mod text size", text_size);
317 
318 	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kread bytes", ikread_bytes);
319 
320 	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed decompress bytes",
321 			 idecompress_bytes);
322 
323 	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed becoming bytes", ibecoming_bytes);
324 
325 	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kmod bytes", imod_bytes);
326 
327 	len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Virtual mem wasted bytes", total_virtual_lost);
328 
329 	if (live_mod_count && total_size) {
330 		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod size",
331 				 DIV_ROUND_UP(total_size, live_mod_count));
332 	}
333 
334 	if (live_mod_count && text_size) {
335 		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod text size",
336 				 DIV_ROUND_UP(text_size, live_mod_count));
337 	}
338 
339 	/*
340 	 * We use WARN_ON_ONCE() for the counters to ensure we always have parity
341 	 * for keeping tabs on a type of failure with one type of byte counter.
342 	 * The counters for imod_bytes does not increase for fkreads failures
343 	 * for example, and so on.
344 	 */
345 
346 	WARN_ON_ONCE(ikread_bytes && !fkreads);
347 	if (fkreads && ikread_bytes) {
348 		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail kread bytes",
349 				 DIV_ROUND_UP(ikread_bytes, fkreads));
350 	}
351 
352 	WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
353 	if (fbecoming && ibecoming_bytes) {
354 		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail becoming bytes",
355 				 DIV_ROUND_UP(ibecoming_bytes, fbecoming));
356 	}
357 
358 	WARN_ON_ONCE(idecompress_bytes && !fdecompress);
359 	if (fdecompress && idecompress_bytes) {
360 		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail decomp bytes",
361 				 DIV_ROUND_UP(idecompress_bytes, fdecompress));
362 	}
363 
364 	WARN_ON_ONCE(imod_bytes && !floads);
365 	if (floads && imod_bytes) {
366 		len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average fail load bytes",
367 				 DIV_ROUND_UP(imod_bytes, floads));
368 	}
369 
370 	/* End of our debug preamble header. */
371 
372 	/* Catch when we've gone beyond our expected preamble */
373 	WARN_ON_ONCE(len >= MAX_PREAMBLE);
374 
375 	if (list_empty(&dup_failed_modules))
376 		goto out;
377 
378 	len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
379 	len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
380 			 "Module-name", "How-many-times", "Reason");
381 	mutex_lock(&module_mutex);
382 
383 
384 	list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
385 		if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
386 			goto out_unlock;
387 		len += scnprintf(buf + len, size - len, "%25s\t%15llu\t%25s\n", mod_fail->name,
388 				 atomic64_read(&mod_fail->count), mod_fail_to_str(mod_fail));
389 	}
390 out_unlock:
391 	mutex_unlock(&module_mutex);
392 out:
393 	kfree(buf);
394         return simple_read_from_buffer(user_buf, count, ppos, buf, len);
395 }
396 #undef MAX_PREAMBLE
397 #undef MAX_FAILED_MOD_PRINT
398 #undef MAX_BYTES_PER_MOD
399 
/* Read-only debugfs operations for the "stats" file created below. */
static const struct file_operations fops_mod_stats = {
	.read = read_file_mod_stats,
	.open = simple_open,
	.owner = THIS_MODULE,
	.llseek = default_llseek,
};
406 
/*
 * Expose a counter read-only (0400) under mod_debugfs_root. The ulong
 * variant reaches directly into the atomic_long_t's .counter word; the
 * atomic variant uses the dedicated debugfs atomic_t helper.
 */
#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
/* Create all module-stats debugfs entries; debugfs calls cannot fail hard. */
static int __init module_stats_init(void)
{
	mod_debug_add_ulong(total_mod_size);
	mod_debug_add_ulong(total_text_size);
	mod_debug_add_ulong(invalid_kread_bytes);
	mod_debug_add_ulong(invalid_decompress_bytes);
	mod_debug_add_ulong(invalid_becoming_bytes);
	mod_debug_add_ulong(invalid_mod_bytes);

	mod_debug_add_atomic(modcount);
	mod_debug_add_atomic(failed_kreads);
	mod_debug_add_atomic(failed_decompress);
	mod_debug_add_atomic(failed_becoming);
	mod_debug_add_atomic(failed_load_modules);

	/* The data cookie (4th arg) is unused by read_file_mod_stats(). */
	debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);

	return 0;
}
#undef mod_debug_add_ulong
#undef mod_debug_add_atomic
module_init(module_stats_init);
431