xref: /linux-6.15/kernel/kexec.c (revision 79365026)
140b0b3f8SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2dc009d92SEric W. Biederman /*
32965faa5SDave Young  * kexec.c - kexec_load system call
4dc009d92SEric W. Biederman  * Copyright (C) 2002-2004 Eric Biederman  <[email protected]>
5dc009d92SEric W. Biederman  */
6dc009d92SEric W. Biederman 
7de90a6bcSMinfei Huang #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8de90a6bcSMinfei Huang 
9c59ede7bSRandy.Dunlap #include <linux/capability.h>
10dc009d92SEric W. Biederman #include <linux/mm.h>
11dc009d92SEric W. Biederman #include <linux/file.h>
12a210fd32SMimi Zohar #include <linux/security.h>
13dc009d92SEric W. Biederman #include <linux/kexec.h>
148c5a1cf0SAndrew Morton #include <linux/mutex.h>
15dc009d92SEric W. Biederman #include <linux/list.h>
16dc009d92SEric W. Biederman #include <linux/syscalls.h>
17a43cac0dSDave Young #include <linux/vmalloc.h>
182965faa5SDave Young #include <linux/slab.h>
196e274d14SAlexander Nyberg 
20a43cac0dSDave Young #include "kexec_internal.h"
21a43cac0dSDave Young 
/*
 * kimage_alloc_init - allocate and partially initialize a kimage for
 * the kexec_load path.
 * @rimage:      out parameter; receives the new image on success.
 * @entry:       physical entry point of the new kernel.
 * @nr_segments: number of entries in @segments.
 * @segments:    kernel-space copy of the user-supplied segment list.
 * @flags:       KEXEC_* flags from the syscall.
 *
 * Returns 0 on success and stores the image in *@rimage; returns a
 * negative errno on failure.  On success the caller owns the image and
 * must release it with kimage_free().
 */
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
			     unsigned long nr_segments,
			     struct kexec_segment *segments,
			     unsigned long flags)
{
	int ret;
	struct kimage *image;
	bool kexec_on_panic = flags & KEXEC_ON_CRASH;

#ifdef CONFIG_CRASH_DUMP
	if (kexec_on_panic) {
		/* Verify the entry point lies inside the reserved crash region. */
		if ((entry < phys_to_boot_phys(crashk_res.start)) ||
		    (entry > phys_to_boot_phys(crashk_res.end)))
			return -EADDRNOTAVAIL;
	}
#endif

	/* Allocate and initialize a controlling structure */
	image = do_kimage_alloc_init();
	if (!image)
		return -ENOMEM;

	image->start = entry;
	image->nr_segments = nr_segments;
	memcpy(image->segment, segments, nr_segments * sizeof(*segments));

#ifdef CONFIG_CRASH_DUMP
	if (kexec_on_panic) {
		/* Enable special crash kernel control page alloc policy. */
		image->control_page = crashk_res.start;
		image->type = KEXEC_TYPE_CRASH;
	}
#endif

	/* Reject overlapping/out-of-range segments before allocating pages. */
	ret = sanity_check_segment_list(image);
	if (ret)
		goto out_free_image;

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	ret = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_PAGE_SIZE));
	if (!image->control_code_page) {
		pr_err("Could not allocate control_code_buffer\n");
		goto out_free_image;
	}

	/* The swap page is only allocated for the non-crash (reboot) path. */
	if (!kexec_on_panic) {
		image->swap_page = kimage_alloc_control_pages(image, 0);
		if (!image->swap_page) {
			pr_err("Could not allocate swap buffer\n");
			goto out_free_control_pages;
		}
	}

	*rimage = image;
	return 0;
out_free_control_pages:
	kimage_free_page_list(&image->control_pages);
out_free_image:
	kfree(image);
	return ret;
}
90dc009d92SEric W. Biederman 
/*
 * do_kexec_load - common implementation for the native and compat
 * kexec_load system calls.
 * @entry:       physical entry point of the new kernel.
 * @nr_segments: number of entries in @segments; 0 means "unload".
 * @segments:    kernel-space copy of the segment list.
 * @flags:       KEXEC_* flags (already validated by kexec_load_check()).
 *
 * Returns 0 on success, negative errno on failure.
 */
static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
		struct kexec_segment *segments, unsigned long flags)
{
	struct kimage **dest_image, *image;
	unsigned long i;
	int ret;

	/*
	 * Because we write directly to the reserved memory region when loading
	 * crash kernels we need a serialization here to prevent multiple crash
	 * kernels from attempting to load simultaneously.
	 */
	if (!kexec_trylock())
		return -EBUSY;

#ifdef CONFIG_CRASH_DUMP
	if (flags & KEXEC_ON_CRASH) {
		dest_image = &kexec_crash_image;
		/* Crash memory may be write-protected; open it for loading. */
		if (kexec_crash_image)
			arch_kexec_unprotect_crashkres();
	} else
#endif
		dest_image = &kexec_image;

	if (nr_segments == 0) {
		/* Uninstall image */
		kimage_free(xchg(dest_image, NULL));
		ret = 0;
		goto out_unlock;
	}
	if (flags & KEXEC_ON_CRASH) {
		/*
		 * Loading another kernel to switch to if this one
		 * crashes.  Free any current crash dump kernel before
		 * we corrupt it.
		 */
		kimage_free(xchg(&kexec_crash_image, NULL));
	}

	ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
	if (ret)
		goto out_unlock;

	if (flags & KEXEC_PRESERVE_CONTEXT)
		image->preserve_context = 1;

#ifdef CONFIG_CRASH_HOTPLUG
	if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags))
		image->hotplug_support = 1;
#endif

	ret = machine_kexec_prepare(image);
	if (ret)
		goto out;

	/*
	 * Some architecture(like S390) may touch the crash memory before
	 * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
	 */
	ret = kimage_crash_copy_vmcoreinfo(image);
	if (ret)
		goto out;

	/* Copy each user-supplied segment into its destination pages. */
	for (i = 0; i < nr_segments; i++) {
		ret = kimage_load_segment(image, &image->segment[i]);
		if (ret)
			goto out;
	}

	kimage_terminate(image);

	ret = machine_kexec_post_load(image);
	if (ret)
		goto out;

	/* Install the new kernel and uninstall the old */
	image = xchg(dest_image, image);

out:
#ifdef CONFIG_CRASH_DUMP
	/* Re-protect the crash region regardless of success or failure. */
	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
		arch_kexec_protect_crashkres();
#endif

	/*
	 * On success this frees the image we just replaced (possibly NULL);
	 * on failure it frees the partially constructed image.
	 */
	kimage_free(image);
out_unlock:
	kexec_unlock();
	return ret;
}
1800eea0867SMinfei Huang 
181dc009d92SEric W. Biederman /*
182dc009d92SEric W. Biederman  * Exec Kernel system call: for obvious reasons only root may call it.
183dc009d92SEric W. Biederman  *
184dc009d92SEric W. Biederman  * This call breaks up into three pieces.
185dc009d92SEric W. Biederman  * - A generic part which loads the new kernel from the current
186dc009d92SEric W. Biederman  *   address space, and very carefully places the data in the
187dc009d92SEric W. Biederman  *   allocated pages.
188dc009d92SEric W. Biederman  *
189dc009d92SEric W. Biederman  * - A generic part that interacts with the kernel and tells all of
190dc009d92SEric W. Biederman  *   the devices to shut down.  Preventing on-going dmas, and placing
191dc009d92SEric W. Biederman  *   the devices in a consistent state so a later kernel can
192dc009d92SEric W. Biederman  *   reinitialize them.
193dc009d92SEric W. Biederman  *
194dc009d92SEric W. Biederman  * - A machine specific part that includes the syscall number
195002ace78SGeert Uytterhoeven  *   and then copies the image to its final destination.  And
196dc009d92SEric W. Biederman  *   jumps into the image at entry.
197dc009d92SEric W. Biederman  *
198dc009d92SEric W. Biederman  * kexec does not sync, or unmount filesystems so if you need
199dc009d92SEric W. Biederman  * that to happen you need to do that yourself.
200dc009d92SEric W. Biederman  */
2018c5a1cf0SAndrew Morton 
/*
 * kexec_load_check - common permission and argument validation for the
 * native and compat kexec_load system calls.
 * @nr_segments: user-supplied segment count.
 * @flags:       user-supplied KEXEC_* flags.
 *
 * Returns 0 if the load may proceed, negative errno otherwise.  The
 * order of the checks matters: privilege first, then LSM/IMA hooks,
 * then flag/argument validation.
 */
static inline int kexec_load_check(unsigned long nr_segments,
				   unsigned long flags)
{
	int image_type = (flags & KEXEC_ON_CRASH) ?
			 KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!kexec_load_permitted(image_type))
		return -EPERM;

	/* Permit LSMs and IMA to fail the kexec */
	result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false);
	if (result < 0)
		return result;

	/*
	 * kexec can be used to circumvent module loading restrictions, so
	 * prevent loading in that case
	 */
	result = security_locked_down(LOCKDOWN_KEXEC);
	if (result)
		return result;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	return 0;
}
2416b27aef0SDominik Brodowski 
/*
 * kexec_load system call entry point.  Validates permissions/flags,
 * copies the user segment array into kernel memory, and delegates the
 * actual load to do_kexec_load().
 */
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
		struct kexec_segment __user *, segments, unsigned long, flags)
{
	struct kexec_segment *ksegments;
	unsigned long result;

	result = kexec_load_check(nr_segments, flags);
	if (result)
		return result;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Duplicate the whole user segment array in one checked copy. */
	ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0]));
	if (IS_ERR(ksegments))
		return PTR_ERR(ksegments);

	result = do_kexec_load(entry, nr_segments, ksegments, flags);
	kfree(ksegments);

	return result;
}
266dc009d92SEric W. Biederman 
267dc009d92SEric W. Biederman #ifdef CONFIG_COMPAT
/*
 * Compat kexec_load entry point: converts the 32-bit segment array
 * supplied by a compat task into native struct kexec_segment entries
 * and hands off to do_kexec_load().
 *
 * Fix: on a copy_from_user() fault the original returned the positive
 * "bytes not copied" count as the syscall result; return -EFAULT like
 * the rest of the syscall's error paths so userspace sees an errno.
 */
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
		       compat_ulong_t, nr_segments,
		       struct compat_kexec_segment __user *, segments,
		       compat_ulong_t, flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment *ksegments;
	unsigned long i, result;

	result = kexec_load_check(nr_segments, flags);
	if (result)
		return result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	/* nr_segments is already capped by kexec_load_check(). */
	ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
			GFP_KERNEL);
	if (!ksegments)
		return -ENOMEM;

	for (i = 0; i < nr_segments; i++) {
		if (copy_from_user(&in, &segments[i], sizeof(in))) {
			result = -EFAULT;
			goto fail;
		}

		/* Widen the compat fields into the native segment layout. */
		ksegments[i].buf   = compat_ptr(in.buf);
		ksegments[i].bufsz = in.bufsz;
		ksegments[i].mem   = in.mem;
		ksegments[i].memsz = in.memsz;
	}

	result = do_kexec_load(entry, nr_segments, ksegments, flags);

fail:
	kfree(ksegments);
	return result;
}
309dc009d92SEric W. Biederman #endif
310