/* * Copyright (c) 2000-2021 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ /* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ /* */ /* * File: vm/vm_map.c * Author: Avadis Tevanian, Jr., Michael Wayne Young * Date: 1985 * * Virtual memory mapping module. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if DEVELOPMENT || DEBUG #include #endif /* DEVELOPMENT || DEBUG */ #include #include #include #include #include #include #include #include #include #include #if DEVELOPMENT || DEBUG extern int proc_selfcsflags(void); int vm_log_xnu_user_debug = 0; int panic_on_unsigned_execute = 0; int panic_on_mlock_failure = 0; #endif /* DEVELOPMENT || DEBUG */ #if DEVELOPMENT || DEBUG int debug4k_filter = 0; char debug4k_proc_name[1024] = ""; int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT); int debug4k_panic_on_misaligned_sharing = 0; const char *debug4k_category_name[] = { "error", /* 0 */ "life", /* 1 */ "load", /* 2 */ "fault", /* 3 */ "copy", /* 4 */ "share", /* 5 */ "adjust", /* 6 */ "pmap", /* 7 */ "mementry", /* 8 */ "iokit", /* 9 */ "upl", /* 10 */ "exc", /* 11 */ "vfs" /* 12 */ }; #endif /* DEVELOPMENT || DEBUG */ int debug4k_no_cow_copyin = 0; #if __arm64__ extern const int fourk_binary_compatibility_unsafe; #endif /* __arm64__ */ extern int proc_selfpid(void); extern char *proc_name_address(void *p); extern const char *proc_best_name(struct proc *p); #if VM_MAP_DEBUG_APPLE_PROTECT int vm_map_debug_apple_protect = 0; #endif /* VM_MAP_DEBUG_APPLE_PROTECT */ #if VM_MAP_DEBUG_FOURK int vm_map_debug_fourk = 0; #endif /* VM_MAP_DEBUG_FOURK */ #if DEBUG || DEVELOPMENT static TUNABLE(bool, vm_map_executable_immutable, "vm_map_executable_immutable", true); #else #define vm_map_executable_immutable true #endif os_refgrp_decl(static, map_refgrp, "vm_map", NULL); extern u_int32_t random(void); /* from */ /* Internal prototypes */ typedef struct vm_map_zap { vm_map_entry_t vmz_head; vm_map_entry_t *vmz_tail; } *vm_map_zap_t; #define VM_MAP_ZAP_DECLARE(zap) \ struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head } extern kern_return_t vm_map_wire_external( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut prot_u, boolean_t user_wire) __exported; #if XNU_PLATFORM_MacOSX extern /* exported via Private..MacOSX.exports on macOS */ #else static #endif kern_return_t vm_map_copyin_common( vm_map_t src_map, vm_map_address_ut src_addr, vm_map_size_ut len, boolean_t src_destroy, boolean_t src_volatile, vm_map_copy_t *copy_result, /* OUT */ boolean_t use_maxprot); static vm_map_entry_t vm_map_entry_insert( vm_map_t map, vm_map_entry_t insp_entry, vm_map_offset_t start, vm_map_offset_t end, vm_object_t object, vm_object_offset_t offset, vm_map_kernel_flags_t vmk_flags, boolean_t needs_copy, vm_prot_t cur_protection, vm_prot_t max_protection, vm_inherit_t inheritance, boolean_t clear_map_aligned); static void vm_map_simplify_range( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end); /* forward */ static boolean_t vm_map_range_check( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_map_entry_t *entry); static void vm_map_submap_pmap_clean( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_map_t sub_map, vm_map_offset_t offset); static void vm_map_pmap_enter( vm_map_t map, vm_map_offset_t addr, vm_map_offset_t end_addr, vm_object_t object, vm_object_offset_t offset, vm_prot_t protection); static void _vm_map_clip_end( struct vm_map_header 
*map_header, vm_map_entry_t entry, vm_map_offset_t end); static void _vm_map_clip_start( struct vm_map_header *map_header, vm_map_entry_t entry, vm_map_offset_t start); static kmem_return_t vm_map_delete( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vmr_flags_t flags, kmem_guard_t guard, vm_map_zap_t zap); static void vm_map_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy); static kern_return_t vm_map_copy_overwrite_unaligned( vm_map_t dst_map, vm_map_entry_t entry, vm_map_copy_t copy, vm_map_address_t start, boolean_t discard_on_success); static kern_return_t vm_map_copy_overwrite_aligned( vm_map_t dst_map, vm_map_entry_t tmp_entry, vm_map_copy_t copy, vm_map_offset_t start, pmap_t pmap); static kern_return_t vm_map_copyin_kernel_buffer( vm_map_t src_map, vm_map_address_t src_addr, vm_map_size_t len, boolean_t src_destroy, vm_map_copy_t *copy_result); /* OUT */ static kern_return_t vm_map_copyout_kernel_buffer( vm_map_t map, vm_map_address_t *addr, /* IN/OUT */ vm_map_copy_t copy, vm_map_size_t copy_size, boolean_t overwrite, boolean_t consume_on_success); static void vm_map_fork_share( vm_map_t old_map, vm_map_entry_t old_entry, vm_map_t new_map); static boolean_t vm_map_fork_copy( vm_map_t old_map, vm_map_entry_t *old_entry_p, vm_map_t new_map, int vm_map_copyin_flags); static kern_return_t vm_map_wire_nested( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_prot_t caller_prot, vm_tag_t tag, boolean_t user_wire, pmap_t map_pmap, vm_map_offset_t pmap_addr, ppnum_t *physpage_p); static kern_return_t vm_map_unwire_nested( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, boolean_t user_wire, pmap_t map_pmap, vm_map_offset_t pmap_addr); static kern_return_t vm_map_overwrite_submap_recurse( vm_map_t dst_map, vm_map_offset_t dst_addr, vm_map_size_t dst_size); static kern_return_t vm_map_copy_overwrite_nested( vm_map_t dst_map, vm_map_offset_t dst_addr, vm_map_copy_t copy, boolean_t interruptible, pmap_t pmap, boolean_t discard_on_success); static kern_return_t vm_map_remap_extract( vm_map_t map, vm_map_offset_t addr, vm_map_size_t size, boolean_t copy, vm_map_copy_t map_copy, vm_prot_t *cur_protection, vm_prot_t *max_protection, vm_inherit_t inheritance, vm_map_kernel_flags_t vmk_flags); static void vm_map_region_look_for_page( vm_map_t map, vm_map_offset_t va, vm_object_t object, vm_object_offset_t offset, int max_refcnt, unsigned short depth, vm_region_extended_info_t extended, mach_msg_type_number_t count); static boolean_t vm_map_region_has_obj_ref( vm_map_entry_t entry, vm_object_t object); static kern_return_t vm_map_willneed( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end); static kern_return_t vm_map_reuse_pages( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end); static kern_return_t vm_map_reusable_pages( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end); static kern_return_t vm_map_can_reuse( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end); static kern_return_t vm_map_zero( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end); static kern_return_t vm_map_random_address_for_size( vm_map_t map, vm_map_offset_t *address, vm_map_size_t size, vm_map_kernel_flags_t vmk_flags); #if CONFIG_MAP_RANGES static vm_map_range_id_t vm_map_user_range_resolve( vm_map_t map, mach_vm_address_t addr, mach_vm_address_t size, mach_vm_range_t range); #endif /* CONFIG_MAP_RANGES */ #if MACH_ASSERT static kern_return_t vm_map_pageout( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end); #endif /* 
MACH_ASSERT */ kern_return_t vm_map_corpse_footprint_collect( vm_map_t old_map, vm_map_entry_t old_entry, vm_map_t new_map); void vm_map_corpse_footprint_collect_done( vm_map_t new_map); void vm_map_corpse_footprint_destroy( vm_map_t map); kern_return_t vm_map_corpse_footprint_query_page_info( vm_map_t map, vm_map_offset_t va, int *disposition_p); void vm_map_footprint_query_page_info( vm_map_t map, vm_map_entry_t map_entry, vm_map_offset_t curr_s_offset, int *disposition_p); #if CONFIG_MAP_RANGES static void vm_map_range_map_init(void); #endif /* CONFIG_MAP_RANGES */ pid_t find_largest_process_vm_map_entries(void); __attribute__((always_inline)) int vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags) { int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK; /* in vmk flags the meaning of fixed/anywhere is inverted */ return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE); } __attribute__((always_inline, overloadable)) void vm_map_kernel_flags_set_vmflags( vm_map_kernel_flags_t *vmk_flags, int vm_flags, vm_tag_t vm_tag) { vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE); vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK; vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK); vmk_flags->vm_tag = vm_tag; } __attribute__((always_inline, overloadable)) void vm_map_kernel_flags_set_vmflags( vm_map_kernel_flags_t *vmk_flags, int vm_flags_and_tag) { vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE); vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK; vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK); VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag); } __attribute__((always_inline)) void vm_map_kernel_flags_and_vmflags( vm_map_kernel_flags_t *vmk_flags, int vm_flags_mask) { /* this function doesn't handle the inverted FIXED/ANYWHERE */ assert(vm_flags_mask & VM_FLAGS_ANYWHERE); vmk_flags->__vm_flags &= vm_flags_mask; } __attribute__((always_inline)) bool vm_map_kernel_flags_check_vm_and_kflags( vm_map_kernel_flags_t vmk_flags, int vm_flags_mask) { return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0; } bool vm_map_kernel_flags_check_vmflags( vm_map_kernel_flags_t vmk_flags, int vm_flags_mask) { int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK; /* Note: up to 16 still has good calling conventions */ static_assert(sizeof(vm_map_kernel_flags_t) == 8); #if DEBUG || DEVELOPMENT /* * All of this compiles to nothing if all checks pass. 
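 *
 * For example, check(vmf_purgeable, VM_FLAGS_PURGABLE) below builds a
 * flags word containing only VM_FLAGS_PURGABLE, clears the
 * "vmf_purgeable" bitfield, and asserts that the word is now zero:
 * this proves the bitfield occupies exactly the bit that the legacy
 * VM_FLAGS_* encoding assigns to it.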
 */
#define check(field, value)  ({ \
	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
	fl.__vm_flags = (value); \
	fl.field = 0; \
	assert(fl.__vm_flags == 0); \
})

	/* bits 0-7 */
	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie, this is inverted
	check(vmf_purgeable, VM_FLAGS_PURGABLE);
	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
	check(vmf_permanent, VM_FLAGS_PERMANENT);
	/* bits 8-15 */
	check(vmf_tpro, VM_FLAGS_TPRO);
	check(vmf_overwrite, VM_FLAGS_OVERWRITE);
	/* bits 16-23 */
	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);

	{
		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;

		/* check user tags will never clip */
		fl.vm_tag = VM_MEMORY_COUNT - 1;
		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);

		/* check kernel tags will never clip */
		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
	}
#undef check
#endif /* DEBUG || DEVELOPMENT */

	return (vmflags & ~vm_flags_mask) == 0;
}

/*
 * Macros to copy a vm_map_entry. We must be careful to correctly
 * manage the wired page count. vm_map_entry_copy() creates a new
 * map entry to the same memory - the wired count in the new entry
 * must be set to zero. vm_map_entry_copy_full() creates a new
 * entry that is identical to the old entry. This preserves the
 * wire count; it's used for map splitting and zone changing in
 * vm_map_copyout.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when the code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when the code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
#if XNU_TARGET_OS_OSX
	/*
	 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
	 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
	 * triggering CSM assertions when the child accesses its mapping.
	 */
#else /* XNU_TARGET_OS_OSX */
	new->vme_xnu_user_debug = FALSE;
#endif /* XNU_TARGET_OS_OSX */
}

/*
 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
 * But for security reasons on some platforms, we don't want the
 * new mapping to be "used for jit", so we reset the flag here.
 */
static inline void
vm_map_entry_copy_code_signing(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old __unused)
{
	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
		assert(new->used_for_jit == old->used_for_jit);
	} else {
		if (old->used_for_jit) {
			DTRACE_VM3(cs_wx,
			    uint64_t, new->vme_start,
			    uint64_t, new->vme_end,
			    vm_prot_t, new->protection);
			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
			    proc_selfpid(), (get_bsdtask_info(current_task()) ?
proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, "removing execute access"); new->protection &= ~VM_PROT_EXECUTE; new->max_protection &= ~VM_PROT_EXECUTE; } new->used_for_jit = FALSE; } } static inline void vm_map_entry_copy_full( vm_map_entry_t new, vm_map_entry_t old) { #if MAP_ENTRY_CREATION_DEBUG btref_put(new->vme_creation_bt); btref_retain(old->vme_creation_bt); #endif #if MAP_ENTRY_INSERTION_DEBUG btref_put(new->vme_insertion_bt); btref_retain(old->vme_insertion_bt); #endif #if VM_BTLOG_TAGS /* Discard the btref that might be in the new entry */ if (new->vme_kernel_object) { btref_put(new->vme_tag_btref); } /* Retain the btref in the old entry to account for its copy */ if (old->vme_kernel_object) { btref_retain(old->vme_tag_btref); } #endif /* VM_BTLOG_TAGS */ *new = *old; } static inline void vm_map_entry_copy( vm_map_t map, vm_map_entry_t new, vm_map_entry_t old) { vm_map_entry_copy_full(new, old); new->is_shared = FALSE; new->needs_wakeup = FALSE; new->in_transition = FALSE; new->wired_count = 0; new->user_wired_count = 0; new->vme_permanent = FALSE; vm_map_entry_copy_code_signing(map, new, old); vm_map_entry_copy_csm_assoc(map, new, old); if (new->iokit_acct) { assertf(!new->use_pmap, "old %p new %p\n", old, new); new->iokit_acct = FALSE; new->use_pmap = TRUE; } new->vme_resilient_codesign = FALSE; new->vme_resilient_media = FALSE; new->vme_atomic = FALSE; new->vme_no_copy_on_read = FALSE; } /* * Normal lock_read_to_write() returns FALSE/0 on failure. * These functions evaluate to zero on success and non-zero value on failure. */ __attribute__((always_inline)) int vm_map_lock_read_to_write(vm_map_t map) { if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) { DTRACE_VM(vm_map_lock_upgrade); return 0; } return 1; } __attribute__((always_inline)) boolean_t vm_map_try_lock(vm_map_t map) { if (lck_rw_try_lock_exclusive(&(map)->lock)) { DTRACE_VM(vm_map_lock_w); return TRUE; } return FALSE; } __attribute__((always_inline)) boolean_t vm_map_try_lock_read(vm_map_t map) { if (lck_rw_try_lock_shared(&(map)->lock)) { DTRACE_VM(vm_map_lock_r); return TRUE; } return FALSE; } /*! * @function kdp_vm_map_is_acquired_exclusive * * @abstract * Checks if vm map is acquired exclusive. * * @discussion * NOT SAFE: To be used only by kernel debugger. * * @param map map to check * * @returns TRUE if the map is acquired exclusively. */ boolean_t kdp_vm_map_is_acquired_exclusive(vm_map_t map) { return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock); } /* * Routines to get the page size the caller should * use while inspecting the target address space. * Use the "_safely" variant if the caller is dealing with a user-provided * array whose size depends on the page size, to avoid any overflow or * underflow of a user-allocated buffer. */ int vm_self_region_page_shift_safely( vm_map_t target_map) { int effective_page_shift = 0; if (PAGE_SIZE == (4096)) { /* x86_64 and 4k watches: always use 4k */ return PAGE_SHIFT; } /* did caller provide an explicit page size for this thread to use? */ effective_page_shift = thread_self_region_page_shift(); if (effective_page_shift) { /* use the explicitly-provided page size */ return effective_page_shift; } /* no explicit page size: use the caller's page size... 
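 * (for example, a process with a 4K-page map inspecting a 16K-page
 * target map fails the check below, and this "_safely" variant
 * returns -1 rather than guessing)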
 */
	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
		/* page size match: safe to use */
		return effective_page_shift;
	}
	/* page size mismatch */
	return -1;
}

int
vm_self_region_page_shift(
	vm_map_t target_map)
{
	int effective_page_shift;

	effective_page_shift = vm_self_region_page_shift_safely(target_map);
	if (effective_page_shift == -1) {
		/* no safe value but OK to guess for caller */
		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
		    VM_MAP_PAGE_SHIFT(target_map));
	}
	return effective_page_shift;
}

/*
 * Decide if we want to allow processes to execute from their data or stack areas.
 * override_nx() returns true if we do. Data/stack execution can be enabled independently
 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
 * or allow_stack_exec to enable data execution for that type of data area for that particular
 * ABI (or both by or'ing the flags together). These are initialized in the architecture
 * specific pmap files since the default behavior varies according to architecture. The
 * main reason it varies is because of the need to provide binary compatibility with old
 * applications that were written before these restrictions came into being. In the old
 * days, an app could execute anything it could read, but this has slowly been tightened
 * up over time. The default behavior is:
 *
 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
 * 64-bit PPC/Intel apps may not execute from either data or stack
 *
 * An application on any architecture may override these defaults by explicitly
 * adding PROT_EXEC permission to the page in question with the mprotect(2)
 * system call. This code here just determines what happens when an app tries to
 * execute from a page that lacks execute permission.
 *
 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
 * execution from data areas for a particular binary even if the arch normally permits it. As
 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
 * to support some complicated use cases, notably browsers with out-of-process plugins that
 * are not all NX-safe.
 */
extern int allow_data_exec, allow_stack_exec;

int
override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
{
	int current_abi;

	if (map->pmap == kernel_pmap) {
		return FALSE;
	}

	/*
	 * Determine if the app is running in 32 or 64 bit mode.
	 */
	if (vm_map_is_64bit(map)) {
		current_abi = VM_ABI_64;
	} else {
		current_abi = VM_ABI_32;
	}

	/*
	 * Determine if we should allow the execution based on whether it's a
	 * stack or data area and the current architecture.
	 */
	if (user_tag == VM_MEMORY_STACK) {
		return allow_stack_exec & current_abi;
	}

	return (allow_data_exec & current_abi) &&
	       (map->map_disallow_data_exec == FALSE);
}

/*
 * Virtual memory maps provide for the mapping, protection,
 * and sharing of virtual memory objects. In addition,
 * this module provides for an efficient virtual copy of
 * memory from one map to another.
 *
 * Synchronization is required prior to most operations.
 *
 * Maps consist of an ordered doubly-linked list of simple
 * entries; a single hint is used to speed up lookups.
 *
 * Sharing maps have been deleted from this version of Mach.
 * All shared objects are now mapped directly into the respective
 * maps. This requires a change in the copy on write strategy;
 * the asymmetric (delayed) strategy is used for shared temporary
 * objects instead of the symmetric (shadow) strategy. All maps
 * are now "top level" maps (either task map, kernel map or submap
 * of the kernel map).
 *
 * Since portions of maps are specified by start/end addresses,
 * which may not align with existing map entries, all
 * routines merely "clip" entries to these start/end values.
 * [That is, an entry is split into two, bordering at a
 * start or end value.] Note that these clippings may not
 * always be necessary (as the two resulting entries are then
 * not changed); however, the clipping is done for convenience.
 * No attempt is currently made to "glue back together" two
 * abutting entries.
 *
 * The symmetric (shadow) copy strategy implements virtual copy
 * by copying VM object references from one map to
 * another, and then marking both regions as copy-on-write.
 * It is important to note that only one writeable reference
 * to a VM object region exists in any map when this strategy
 * is used -- this means that shadow object creation can be
 * delayed until a write operation occurs. The asymmetric (delayed)
 * strategy allows multiple maps to have writeable references to
 * the same region of a vm object, and hence cannot delay creating
 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
 * Copying of permanent objects is completely different; see
 * vm_object_copy_strategically() in vm_object.c.
 */

ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);

#define VM_MAP_ZONE_NAME "maps"
#define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
#define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_HOLES_ZONE_NAME "VM map holes"
#define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)

/*
 * Asserts that a vm_map_copy object is coming from the
 * vm_map_copy_zone to ensure that it isn't a fake constructed
 * anywhere else.
 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}

/*
 * vm_map_require:
 *
 * Ensures that the argument is memory allocated from the genuine
 * vm map zone. (See zone_id_require_allow_foreign).
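 *
 * A minimal usage sketch (hypothetical caller, for illustration):
 *
 *	void
 *	some_vm_routine(vm_map_t map)
 *	{
 *		vm_map_require(map); // panics if "map" is not from the map zone
 *		...
 *	}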
*/ void vm_map_require(vm_map_t map) { zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map); } #define VM_MAP_EARLY_COUNT_MAX 16 static __startup_data vm_offset_t map_data; static __startup_data vm_size_t map_data_size; static __startup_data vm_offset_t kentry_data; static __startup_data vm_size_t kentry_data_size; static __startup_data vm_offset_t map_holes_data; static __startup_data vm_size_t map_holes_data_size; static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX]; static __startup_data uint32_t early_map_count; #if XNU_TARGET_OS_OSX #define NO_COALESCE_LIMIT ((1024 * 128) - 1) #else /* XNU_TARGET_OS_OSX */ #define NO_COALESCE_LIMIT 0 #endif /* XNU_TARGET_OS_OSX */ /* Skip acquiring locks if we're in the midst of a kernel core dump */ unsigned int not_in_kdp = 1; unsigned int vm_map_set_cache_attr_count = 0; kern_return_t vm_map_set_cache_attr( vm_map_t map, vm_map_offset_t va) { vm_map_entry_t map_entry; vm_object_t object; kern_return_t kr = KERN_SUCCESS; vm_map_lock_read(map); if (!vm_map_lookup_entry(map, va, &map_entry) || map_entry->is_sub_map) { /* * that memory is not properly mapped */ kr = KERN_INVALID_ARGUMENT; goto done; } object = VME_OBJECT(map_entry); if (object == VM_OBJECT_NULL) { /* * there should be a VM object here at this point */ kr = KERN_INVALID_ARGUMENT; goto done; } vm_object_lock(object); object->set_cache_attr = TRUE; vm_object_unlock(object); vm_map_set_cache_attr_count++; done: vm_map_unlock_read(map); return kr; } #if CONFIG_CODE_DECRYPTION /* * vm_map_apple_protected: * This remaps the requested part of the object with an object backed by * the decrypting pager. * crypt_info contains entry points and session data for the crypt module. * The crypt_info block will be copied by vm_map_apple_protected. The data structures * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called. */ kern_return_t vm_map_apple_protected( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_object_offset_t crypto_backing_offset, struct pager_crypt_info *crypt_info, uint32_t cryptid) { boolean_t map_locked; kern_return_t kr; vm_map_entry_t map_entry; struct vm_map_entry tmp_entry; memory_object_t unprotected_mem_obj; vm_object_t protected_object; vm_map_offset_t map_addr; vm_map_offset_t start_aligned, end_aligned; vm_object_offset_t crypto_start, crypto_end; boolean_t cache_pager; map_locked = FALSE; unprotected_mem_obj = MEMORY_OBJECT_NULL; if (__improbable(vm_map_range_overflows(map, start, end - start))) { return KERN_INVALID_ADDRESS; } start_aligned = vm_map_trunc_page(start, PAGE_MASK_64); end_aligned = vm_map_round_page(end, PAGE_MASK_64); start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map)); end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map)); #if __arm64__ /* * "start" and "end" might be 4K-aligned but not 16K-aligned, * so we might have to loop and establish up to 3 mappings: * * + the first 16K-page, which might overlap with the previous * 4K-aligned mapping, * + the center, * + the last 16K-page, which might overlap with the next * 4K-aligned mapping. * Each of these mapping might be backed by a vnode pager (if * properly page-aligned) or a "fourk_pager", itself backed by a * vnode pager (if 4K-aligned but not page-aligned). 
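 *
 * For example (illustrative numbers only): with 16K map pages,
 * start = 0x6000 and end = 0x32000 yield start_aligned = 0x4000 and
 * end_aligned = 0x34000; the first 16K page [0x4000, 0x8000) may
 * overlap a neighboring 4K-aligned mapping below start, and the last
 * 16K page [0x30000, 0x34000) may overlap one above end.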
 */
#endif /* __arm64__ */

	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/*
		 * Ensure mapped memory is mapped as executable, except
		 * for the model decryption flow.
		 */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key. It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
*/ unprotected_mem_obj = apple_protect_pager_setup( protected_object, VME_OFFSET(&tmp_entry), crypto_backing_offset, crypt_info, crypto_start, crypto_end, cache_pager); /* release extra ref on protected object */ vm_object_deallocate(protected_object); if (unprotected_mem_obj == NULL) { kr = KERN_FAILURE; goto done; } /* can overwrite an immutable mapping */ vm_map_kernel_flags_t vmk_flags = { .vmf_fixed = true, .vmf_overwrite = true, .vmkf_overwrite_immutable = true, }; /* make the new mapping as "permanent" as the one it replaces */ vmk_flags.vmf_permanent = tmp_entry.vme_permanent; /* map this memory object in place of the current one */ map_addr = tmp_entry.vme_start; kr = mach_vm_map_kernel(map, vm_sanitize_wrap_addr_ref(&map_addr), (tmp_entry.vme_end - tmp_entry.vme_start), (mach_vm_offset_t) 0, vmk_flags, (ipc_port_t)(uintptr_t) unprotected_mem_obj, 0, TRUE, tmp_entry.protection, tmp_entry.max_protection, tmp_entry.inheritance); assertf(kr == KERN_SUCCESS, "kr = 0x%x\n", kr); assertf(map_addr == tmp_entry.vme_start, "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n", (uint64_t)map_addr, (uint64_t) tmp_entry.vme_start, &tmp_entry); #if VM_MAP_DEBUG_APPLE_PROTECT if (vm_map_debug_apple_protect) { printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:" " backing:[object:%p,offset:0x%llx," "crypto_backing_offset:0x%llx," "crypto_start:0x%llx,crypto_end:0x%llx]\n", map, (uint64_t) map_addr, (uint64_t) (map_addr + (tmp_entry.vme_end - tmp_entry.vme_start)), unprotected_mem_obj, protected_object, VME_OFFSET(&tmp_entry), crypto_backing_offset, crypto_start, crypto_end); } #endif /* VM_MAP_DEBUG_APPLE_PROTECT */ /* * Release the reference obtained by * apple_protect_pager_setup(). * The mapping (if it succeeded) is now holding a reference on * the memory object. */ memory_object_deallocate(unprotected_mem_obj); unprotected_mem_obj = MEMORY_OBJECT_NULL; /* continue with next map entry */ crypto_backing_offset += (tmp_entry.vme_end - tmp_entry.vme_start); crypto_backing_offset -= crypto_start; } kr = KERN_SUCCESS; done: if (map_locked) { vm_map_unlock(map); } return kr; } #endif /* CONFIG_CODE_DECRYPTION */ LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map"); LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0); LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG); #if XNU_TARGET_OS_OSX #define MALLOC_NO_COW_DEFAULT 1 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1 #else /* XNU_TARGET_OS_OSX */ #define MALLOC_NO_COW_DEFAULT 1 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0 #endif /* XNU_TARGET_OS_OSX */ TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT); TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT); uint64_t vm_memory_malloc_no_cow_mask = 0ULL; #if DEBUG int vm_check_map_sanity = 0; #endif /* * vm_map_init: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from zones -- we must * initialize those zones. * * There are three zones of interest: * * vm_map_zone: used to allocate maps. * vm_map_entry_zone: used to allocate map entries. * * LP32: * vm_map_entry_reserved_zone: fallback zone for kernel map entries * * The kernel allocates map entries from a special zone that is initially * "crammed" with memory. It would be difficult (perhaps impossible) for * the kernel to allocate more memory to a entry zone when it became * empty since the very act of allocating memory implies the creation * of a new entry. 
 */
__startup_func
void
vm_map_init(void)
{
#if MACH_ASSERT
	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
	    sizeof(debug4k_filter));
#endif /* MACH_ASSERT */

	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);

	/*
	 * Don't quarantine because we always need elements available
	 * Disallow GC on this zone... to aid the GC.
	 */
	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(32 * (ml_early_cpu_max_number() + 1));
	});

	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
	});

	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);

	/*
	 * Add the stolen memory to zones, adjust zone size and stolen counts.
	 */
	zone_cram_early(vm_map_zone, map_data, map_data_size);
	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
	printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));

	/*
	 * Since these are covered by zones, remove them from stolen page accounting.
	 */
	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) +
	    atop_64(kentry_data_size) +
	    atop_64(map_holes_data_size));

#if VM_MAP_DEBUG_APPLE_PROTECT
	PE_parse_boot_argn("vm_map_debug_apple_protect",
	    &vm_map_debug_apple_protect,
	    sizeof(vm_map_debug_apple_protect));
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

#if VM_MAP_DEBUG_FOURK
	PE_parse_boot_argn("vm_map_debug_fourk",
	    &vm_map_debug_fourk,
	    sizeof(vm_map_debug_fourk));
#endif /* VM_MAP_DEBUG_FOURK */

	if (malloc_no_cow) {
		vm_memory_malloc_no_cow_mask = 0ULL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
#if XNU_TARGET_OS_OSX
		/*
		 * On macOS, keep copy-on-write for MALLOC_LARGE because
		 * realloc() may use vm_copy() to transfer the old contents
		 * to the new location.
		 */
#else /* XNU_TARGET_OS_OSX */
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
#endif /* XNU_TARGET_OS_OSX */
		// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
		// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
		// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
		    &vm_memory_malloc_no_cow_mask,
		    sizeof(vm_memory_malloc_no_cow_mask));
	}

#if CONFIG_MAP_RANGES
	vm_map_range_map_init();
#endif /* CONFIG_MAP_RANGES */

#if DEBUG
	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity,
	    sizeof(vm_check_map_sanity));
	if (vm_check_map_sanity) {
		kprintf("VM sanity checking enabled\n");
	} else {
		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
	}
#endif /* DEBUG */

#if DEVELOPMENT || DEBUG
	PE_parse_boot_argn("panic_on_unsigned_execute",
	    &panic_on_unsigned_execute,
	    sizeof(panic_on_unsigned_execute));
	PE_parse_boot_argn("panic_on_mlock_failure",
	    &panic_on_mlock_failure,
	    sizeof(panic_on_mlock_failure));
#endif /* DEVELOPMENT || DEBUG */
}

__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);

__startup_func
static void
vm_kernel_bootstrapped(void)
{
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);

	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_bootstrapped);

void
vm_map_disable_hole_optimization(vm_map_t map)
{
	vm_map_entry_t head_entry, hole_entry, next_hole_entry;

	if (map->holelistenabled) {
		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);

		while (hole_entry != NULL) {
			next_hole_entry = hole_entry->vme_next;

			hole_entry->vme_next = NULL;
			hole_entry->vme_prev = NULL;
			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);

			if (next_hole_entry == head_entry) {
				hole_entry = NULL;
			} else {
				hole_entry = next_hole_entry;
			}
		}

		map->holes_list = NULL;
		map->holelistenabled = FALSE;

		map->first_free = vm_map_first_entry(map);
		SAVE_HINT_HOLE_WRITE(map, NULL);
	}
}

boolean_t
vm_kernel_map_is_kernel(vm_map_t map)
{
	return map->pmap == kernel_pmap;
}

/*
 * vm_map_create:
 *
 * Creates and returns a new empty VM map with
 * the given physical map structure, and having
 * the given lower and upper address bounds.
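 *
 * A minimal usage sketch (hypothetical arguments, for illustration):
 *
 *	vm_map_t map;
 *
 *	map = vm_map_create_options(pmap, 0, MACH_VM_MAX_ADDRESS,
 *	    VM_MAP_CREATE_PAGEABLE);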
*/ extern vm_map_t vm_map_create_external( pmap_t pmap, vm_map_offset_t min_off, vm_map_offset_t max_off, boolean_t pageable); vm_map_t vm_map_create_external( pmap_t pmap, vm_map_offset_t min, vm_map_offset_t max, boolean_t pageable) { vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT; if (pageable) { options |= VM_MAP_CREATE_PAGEABLE; } return vm_map_create_options(pmap, min, max, options); } __startup_func void vm_map_will_allocate_early_map(vm_map_t *owner) { if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) { panic("VM_MAP_EARLY_COUNT_MAX is too low"); } early_map_owners[early_map_count++] = owner; } __startup_func void vm_map_relocate_early_maps(vm_offset_t delta) { for (uint32_t i = 0; i < early_map_count; i++) { vm_address_t addr = (vm_address_t)*early_map_owners[i]; *early_map_owners[i] = (vm_map_t)(addr + delta); } early_map_count = ~0u; } /* * Routine: vm_map_relocate_early_elem * * Purpose: * Early zone elements are allocated in a temporary part * of the address space. * * Once the zones live in their final place, the early * VM maps, map entries and map holes need to be relocated. * * It involves rewriting any vm_map_t, vm_map_entry_t or * pointers to vm_map_links. Other pointers to other types * are fine. * * Fortunately, pointers to those types are self-contained * in those zones, _except_ for pointers to VM maps, * which are tracked during early boot and fixed with * vm_map_relocate_early_maps(). */ __startup_func void vm_map_relocate_early_elem( uint32_t zone_id, vm_offset_t new_addr, vm_offset_t delta) { #define relocate(type_t, field) ({ \ typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \ if (*__field) { \ *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \ } \ }) switch (zone_id) { case ZONE_ID_VM_MAP: case ZONE_ID_VM_MAP_ENTRY: case ZONE_ID_VM_MAP_HOLES: break; default: panic("Unexpected zone ID %d", zone_id); } if (zone_id == ZONE_ID_VM_MAP) { relocate(vm_map_t, hdr.links.prev); relocate(vm_map_t, hdr.links.next); ((vm_map_t)new_addr)->pmap = kernel_pmap; #ifdef VM_MAP_STORE_USE_RB relocate(vm_map_t, hdr.rb_head_store.rbh_root); #endif /* VM_MAP_STORE_USE_RB */ relocate(vm_map_t, hint); relocate(vm_map_t, hole_hint); relocate(vm_map_t, first_free); return; } relocate(struct vm_map_links *, prev); relocate(struct vm_map_links *, next); if (zone_id == ZONE_ID_VM_MAP_ENTRY) { #ifdef VM_MAP_STORE_USE_RB relocate(vm_map_entry_t, store.entry.rbe_left); relocate(vm_map_entry_t, store.entry.rbe_right); relocate(vm_map_entry_t, store.entry.rbe_parent); #endif /* VM_MAP_STORE_USE_RB */ if (((vm_map_entry_t)new_addr)->is_sub_map) { /* no object to relocate because we haven't made any */ ((vm_map_entry_t)new_addr)->vme_submap += delta >> VME_SUBMAP_SHIFT; } #if MAP_ENTRY_CREATION_DEBUG relocate(vm_map_entry_t, vme_creation_maphdr); #endif /* MAP_ENTRY_CREATION_DEBUG */ } #undef relocate } vm_map_t vm_map_create_options( pmap_t pmap, vm_map_offset_t min, vm_map_offset_t max, vm_map_create_options_t options) { vm_map_t result; #if DEBUG || DEVELOPMENT if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) { if (early_map_count != ~0u && early_map_count != zone_count_allocated(vm_map_zone) + 1) { panic("allocating %dth early map, owner not known", zone_count_allocated(vm_map_zone) + 1); } if (early_map_count != ~0u && pmap && pmap != kernel_pmap) { panic("allocating %dth early map for non kernel pmap", early_map_count); } } #endif /* DEBUG || DEVELOPMENT */ result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO); 
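	/*
	 * Z_NOFAIL guarantees zalloc_id() cannot return NULL here, and
	 * Z_ZERO means "result" starts out fully zeroed: only the
	 * non-zero fields need to be initialized below.
	 */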
vm_map_store_init(&result->hdr); result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE); vm_map_set_page_shift(result, PAGE_SHIFT); result->size_limit = RLIM_INFINITY; /* default unlimited */ result->data_limit = RLIM_INFINITY; /* default unlimited */ result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */ os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1); result->pmap = pmap; result->min_offset = min; result->max_offset = max; result->first_free = vm_map_to_entry(result); result->hint = vm_map_to_entry(result); if (options & VM_MAP_CREATE_NEVER_FAULTS) { assert(pmap == kernel_pmap); result->never_faults = true; } /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */ if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) { result->has_corpse_footprint = true; } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) { struct vm_map_links *hole_entry; hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL); hole_entry->start = min; /* * Holes can be used to track ranges all the way up to * MACH_VM_MAX_ADDRESS or more (e.g. kernel map). */ hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS); result->holes_list = result->hole_hint = hole_entry; hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry); result->holelistenabled = true; } vm_map_lock_init(result); return result; } /* * Adjusts a submap that was made by kmem_suballoc() * before it knew where it would be mapped, * so that it has the right min/max offsets. * * We do not need to hold any locks: * only the caller knows about this map, * and it is not published on any entry yet. */ static void vm_map_adjust_offsets( vm_map_t map, vm_map_offset_t min_off, vm_map_offset_t max_off) { assert(map->min_offset == 0); assert(map->max_offset == max_off - min_off); assert(map->hdr.nentries == 0); assert(os_ref_get_count_raw(&map->map_refcnt) == 2); map->min_offset = min_off; map->max_offset = max_off; if (map->holelistenabled) { struct vm_map_links *hole = map->holes_list; hole->start = min_off; #if defined(__arm64__) hole->end = max_off; #else hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS); #endif } } vm_map_size_t vm_map_adjusted_size(vm_map_t map) { const struct vm_reserved_region *regions = NULL; size_t num_regions = 0; mach_vm_size_t reserved_size = 0, map_size = 0; if (map == NULL || (map->size == 0)) { return 0; } map_size = map->size; if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) { /* * No special reserved regions or not an exotic map or the task * is terminating and these special regions might have already * been deallocated. */ return map_size; } num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions); assert((num_regions == 0) || (num_regions > 0 && regions != NULL)); while (num_regions) { reserved_size += regions[--num_regions].vmrr_size; } /* * There are a few places where the map is being switched out due to * 'termination' without that bit being set (e.g. exec and corpse purging). * In those cases, we could have the map's regions being deallocated on * a core while some accounting process is trying to get the map's size. * So this assert can't be enabled till all those places are uniform in * their use of the 'map->terminated' bit. * * assert(map_size >= reserved_size); */ return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size; } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion in the * given map (or map copy). 
No fields are filled. * * The VM entry will be zero initialized, except for: * - behavior set to VM_BEHAVIOR_DEFAULT * - inheritance set to VM_INHERIT_DEFAULT */ #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr) #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr) static vm_map_entry_t _vm_map_entry_create( struct vm_map_header *map_header __unused) { vm_map_entry_t entry = NULL; entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO); /* * Help the compiler with what we know to be true, * so that the further bitfields inits have good codegen. * * See rdar://87041299 */ __builtin_assume(entry->vme_object_value == 0); __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0); __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0); static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK, "VME_ALIAS_MASK covers tags"); static_assert(VM_BEHAVIOR_DEFAULT == 0, "can skip zeroing of the behavior field"); entry->inheritance = VM_INHERIT_DEFAULT; #if MAP_ENTRY_CREATION_DEBUG entry->vme_creation_maphdr = map_header; entry->vme_creation_bt = btref_get(__builtin_frame_address(0), BTREF_GET_NOWAIT); #endif return entry; } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. * * write map lock held so no need to * do anything special to insure correctness * of the stores */ static void vm_map_entry_dispose( vm_map_entry_t entry) { #if VM_BTLOG_TAGS if (entry->vme_kernel_object) { btref_put(entry->vme_tag_btref); } #endif /* VM_BTLOG_TAGS */ #if MAP_ENTRY_CREATION_DEBUG btref_put(entry->vme_creation_bt); #endif #if MAP_ENTRY_INSERTION_DEBUG btref_put(entry->vme_insertion_bt); #endif zfree(vm_map_entry_zone, entry); } #define vm_map_copy_entry_dispose(copy_entry) \ vm_map_entry_dispose(copy_entry) static vm_map_entry_t vm_map_zap_first_entry( vm_map_zap_t list) { return list->vmz_head; } static vm_map_entry_t vm_map_zap_last_entry( vm_map_zap_t list) { assert(vm_map_zap_first_entry(list)); return __container_of(list->vmz_tail, struct vm_map_entry, vme_next); } static void vm_map_zap_append( vm_map_zap_t list, vm_map_entry_t entry) { entry->vme_next = VM_MAP_ENTRY_NULL; *list->vmz_tail = entry; list->vmz_tail = &entry->vme_next; } static vm_map_entry_t vm_map_zap_pop( vm_map_zap_t list) { vm_map_entry_t head = list->vmz_head; if (head != VM_MAP_ENTRY_NULL && (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) { list->vmz_tail = &list->vmz_head; } return head; } static void vm_map_zap_dispose( vm_map_zap_t list) { vm_map_entry_t entry; while ((entry = vm_map_zap_pop(list))) { if (entry->is_sub_map) { vm_map_deallocate(VME_SUBMAP(entry)); } else { vm_object_deallocate(VME_OBJECT(entry)); } vm_map_entry_dispose(entry); } } #if MACH_ASSERT static boolean_t first_free_check = FALSE; boolean_t first_free_is_valid( vm_map_t map) { if (!first_free_check) { return TRUE; } return first_free_is_valid_store( map ); } #endif /* MACH_ASSERT */ #define vm_map_copy_entry_link(copy, after_where, entry) \ _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry)) #define vm_map_copy_entry_unlink(copy, entry) \ _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false) /* * vm_map_destroy: * * Actually destroy a map. 
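 *
 * This is normally reached once the last reference to the map has
 * been dropped (see vm_map_deallocate()); at that point nobody else
 * can be looking at the map, so this final cleanup is not allowed
 * to fail.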
*/ void vm_map_destroy( vm_map_t map) { /* final cleanup: this is not allowed to fail */ vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS; VM_MAP_ZAP_DECLARE(zap); vm_map_lock(map); map->terminated = true; /* clean up regular map entries */ (void)vm_map_delete(map, map->min_offset, map->max_offset, flags, KMEM_GUARD_NONE, &zap); /* clean up leftover special mappings (commpage, GPU carveout, etc...) */ (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags, KMEM_GUARD_NONE, &zap); vm_map_disable_hole_optimization(map); vm_map_corpse_footprint_destroy(map); vm_map_unlock(map); vm_map_zap_dispose(&zap); assert(map->hdr.nentries == 0); if (map->pmap) { pmap_destroy(map->pmap); } lck_rw_destroy(&map->lock, &vm_map_lck_grp); #if CONFIG_MAP_RANGES kfree_data(map->extra_ranges, map->extra_ranges_count * sizeof(struct vm_map_user_range)); #endif zfree_id(ZONE_ID_VM_MAP, map); } /* * Returns pid of the task with the largest number of VM map entries. * Used in the zone-map-exhaustion jetsam path. */ pid_t find_largest_process_vm_map_entries(void) { pid_t victim_pid = -1; int max_vm_map_entries = 0; task_t task = TASK_NULL; queue_head_t *task_list = &tasks; lck_mtx_lock(&tasks_threads_lock); queue_iterate(task_list, task, task_t, tasks) { if (task == kernel_task || !task->active) { continue; } vm_map_t task_map = task->map; if (task_map != VM_MAP_NULL) { int task_vm_map_entries = task_map->hdr.nentries; if (task_vm_map_entries > max_vm_map_entries) { max_vm_map_entries = task_vm_map_entries; victim_pid = pid_from_task(task); } } } lck_mtx_unlock(&tasks_threads_lock); printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries); return victim_pid; } /* * vm_map_lookup_entry: [ internal use only ] * * Calls into the vm map store layer to find the map * entry containing (or immediately preceding) the * specified address in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_map_offset_t address, vm_map_entry_t *entry) /* OUT */ { bool result = false; #if CONFIG_KERNEL_TAGGING if (VM_KERNEL_ADDRESS(address)) { address = vm_memtag_canonicalize_address(address); } #endif /* CONFIG_KERNEL_TAGGING */ #if CONFIG_PROB_GZALLOC if (map->pmap == kernel_pmap) { assertf(!pgz_owned(address), "it is the responsibility of callers to unguard PGZ addresses"); } #endif /* CONFIG_PROB_GZALLOC */ result = vm_map_store_lookup_entry( map, address, entry ); return result; } boolean_t vm_map_lookup_entry_or_next( vm_map_t map, vm_map_offset_t address, vm_map_entry_t *entry) /* OUT */ { if (vm_map_lookup_entry(map, address, entry)) { return true; } *entry = (*entry)->vme_next; return false; } #if CONFIG_PROB_GZALLOC boolean_t vm_map_lookup_entry_allow_pgz( vm_map_t map, vm_map_offset_t address, vm_map_entry_t *entry) /* OUT */ { #if CONFIG_KERNEL_TAGGING if (VM_KERNEL_ADDRESS(address)) { address = vm_memtag_canonicalize_address(address); } #endif /* CONFIG_KERNEL_TAGGING */ return vm_map_store_lookup_entry( map, address, entry ); } #endif /* CONFIG_PROB_GZALLOC */ /* * Routine: vm_map_range_invalid_panic * Purpose: * Panic on detection of an invalid range id. */ __abortlike static void vm_map_range_invalid_panic( vm_map_t map, vm_map_range_id_t range_id) { panic("invalid range ID (%u) for map %p", range_id, map); } /* * Routine: vm_map_get_range * Purpose: * Adjust bounds based on security policy. 
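 *
 * For the kernel_map this selects one of the kmem_ranges[] "fronts":
 * pointer ranges are later carved by kmem_locate_space(), and data
 * allocations of KMEM_SMALLMAP_THRESHOLD or more are restricted to
 * kmem_large_ranges[]. For user maps with ranges enabled, the range
 * matching the caller's vmkf_range_id is returned instead.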
*/ static struct mach_vm_range vm_map_get_range( vm_map_t map, vm_map_address_t *address, vm_map_kernel_flags_t *vmk_flags, vm_map_size_t size, bool *is_ptr) { struct mach_vm_range effective_range = {}; vm_map_range_id_t range_id = vmk_flags->vmkf_range_id; if (map == kernel_map) { effective_range = kmem_ranges[range_id]; if (startup_phase >= STARTUP_SUB_KMEM) { /* * Hint provided by caller is zeroed as the range is restricted to a * subset of the entire kernel_map VA, which could put the hint outside * the range, causing vm_map_store_find_space to fail. */ *address = 0ull; /* * Ensure that range_id passed in by the caller is within meaningful * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space * to fail as the corresponding range is invalid. Range id larger than * KMEM_RANGE_ID_MAX will lead to an OOB access. */ if ((range_id == KMEM_RANGE_ID_NONE) || (range_id > KMEM_RANGE_ID_MAX)) { vm_map_range_invalid_panic(map, range_id); } /* * Pointer ranges use kmem_locate_space to do allocations. * * Non pointer fronts look like [ Small | Large | Permanent ] * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD. * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to * use the entire range. */ if (range_id < KMEM_RANGE_ID_SPRAYQTN) { *is_ptr = true; } else if (size >= KMEM_SMALLMAP_THRESHOLD) { effective_range = kmem_large_ranges[range_id]; } } #if CONFIG_MAP_RANGES } else if (map->uses_user_ranges) { switch (range_id) { case UMEM_RANGE_ID_DEFAULT: effective_range = map->default_range; break; case UMEM_RANGE_ID_HEAP: effective_range = map->data_range; break; case UMEM_RANGE_ID_LARGE_FILE: if (map->large_file_range.min_address != map->large_file_range.max_address) { /* large file range is configured and should be used */ effective_range = map->large_file_range; } else { /* * the user asking for this user range might not have the * permissions to use the large file range (i.e., it doesn't * hold the correct entitlement), so we give it the data range * instead */ effective_range = map->data_range; } break; case UMEM_RANGE_ID_FIXED: /* * anywhere allocations with an address in "FIXED" * makes no sense, leave the range empty */ break; default: vm_map_range_invalid_panic(map, range_id); } #endif /* CONFIG_MAP_RANGES */ } else { /* * If minimum is 0, bump it up by PAGE_SIZE. We want to limit * allocations of PAGEZERO to explicit requests since its * normal use is to catch dereferences of NULL and many * applications also treat pointers with a value of 0 as * special and suddenly having address 0 contain useable * memory would tend to confuse those applications. */ effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map)); effective_range.max_address = map->max_offset; } return effective_range; } kern_return_t vm_map_locate_space_anywhere( vm_map_t map, vm_map_size_t size, vm_map_offset_t mask, vm_map_kernel_flags_t vmk_flags, vm_map_offset_t *start_inout, vm_map_entry_t *entry_out) { struct mach_vm_range effective_range = {}; vm_map_size_t guard_offset; vm_map_offset_t hint, limit; vm_map_entry_t entry; bool is_kmem_ptr_range = false; /* * Only supported by vm_map_enter() with a fixed address. */ assert(!vmk_flags.vmf_fixed); assert(!vmk_flags.vmkf_beyond_max); if (__improbable(map->wait_for_space)) { /* * support for "wait_for_space" is minimal, * its only consumer is the ipc_kernel_copy_map. 
         */
        assert(!map->holelistenabled &&
            !vmk_flags.vmkf_last_free &&
            !vmk_flags.vmkf_keep_map_locked &&
            !vmk_flags.vmkf_map_jit &&
            !vmk_flags.vmf_random_addr &&
            *start_inout <= map->min_offset);
    } else if (vmk_flags.vmkf_last_free) {
        assert(!vmk_flags.vmkf_map_jit &&
            !vmk_flags.vmf_random_addr);
    }

    if (vmk_flags.vmkf_guard_before) {
        guard_offset = VM_MAP_PAGE_SIZE(map);
        assert(size > guard_offset);
        size -= guard_offset;
    } else {
        assert(size != 0);
        guard_offset = 0;
    }

    /*
     * Validate range_id from flags and get associated range
     */
    effective_range = vm_map_get_range(map, start_inout, &vmk_flags,
        size, &is_kmem_ptr_range);

    if (is_kmem_ptr_range) {
        return kmem_locate_space(size + guard_offset,
                   vmk_flags.vmkf_range_id, vmk_flags.vmkf_last_free,
                   start_inout, entry_out);
    }

#if XNU_TARGET_OS_OSX
    if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
        assert(map != kernel_map);
        effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
    }
#endif /* XNU_TARGET_OS_OSX */

again:
    if (vmk_flags.vmkf_last_free) {
        hint = *start_inout;

        if (hint == 0 || hint > effective_range.max_address) {
            hint = effective_range.max_address;
        }
        if (hint <= effective_range.min_address) {
            return KERN_NO_SPACE;
        }
        limit = effective_range.min_address;
    } else {
        hint = *start_inout;

        if (vmk_flags.vmkf_map_jit) {
            if (map->jit_entry_exists &&
                !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
                return KERN_INVALID_ARGUMENT;
            }
            if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
                vmk_flags.vmf_random_addr = true;
            }
        }

        if (vmk_flags.vmf_random_addr) {
            kern_return_t kr;

            kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
            if (kr != KERN_SUCCESS) {
                return kr;
            }
        }
#if __x86_64__
        else if ((hint == 0 || hint == vm_map_min(map)) &&
            !map->disable_vmentry_reuse &&
            map->vmmap_high_start != 0) {
            hint = map->vmmap_high_start;
        }
#endif /* __x86_64__ */

        if (hint < effective_range.min_address) {
            hint = effective_range.min_address;
        }
        if (effective_range.max_address <= hint) {
            return KERN_NO_SPACE;
        }
        limit = effective_range.max_address;
    }

    entry = vm_map_store_find_space(map,
        hint, limit, vmk_flags.vmkf_last_free,
        guard_offset, size, mask,
        start_inout);

    if (__improbable(entry == NULL)) {
        if (map->wait_for_space &&
            guard_offset + size <=
            effective_range.max_address - effective_range.min_address) {
            assert_wait((event_t)map, THREAD_ABORTSAFE);
            vm_map_unlock(map);
            thread_block(THREAD_CONTINUE_NULL);
            vm_map_lock(map);
            goto again;
        }
        return KERN_NO_SPACE;
    }

    if (entry_out) {
        *entry_out = entry;
    }
    return KERN_SUCCESS;
}

/*!
 * @function vm_map_locate_space_fixed()
 *
 * @brief
 * Locate (no reservation) a range in the specified VM map at a fixed address.
 *
 * @param map           the map to scan for memory, must be locked.
 * @param start         the fixed address being reserved.
 * @param size          the size of the allocation to make.
 * @param mask          an alignment mask the allocation must respect.
 * @param vmk_flags     the vm map kernel flags to influence this call.
 *                      vmk_flags.vmf_anywhere must not be set.
 * @param entry_out     the entry right before the hole.
 * @param zap_list      a zap list of entries to clean up after the call.
 *
 * @returns
 * - KERN_SUCCESS in case of success and no conflicting entry is found,
 *   in which case entry_out is set to the entry before the hole.
 *
 * - KERN_MEMORY_PRESENT if a conflicting entry is found,
 *   in which case entry_out is set to the conflicting entry;
 *   the caller MUST handle this error explicitly.
 *
 * - KERN_INVALID_ADDRESS if the specified @c start or @c size
 *   would result in a mapping outside of the map.
 *
 * - KERN_NO_SPACE for various cases of unrecoverable failures.
 */
static kern_return_t
vm_map_locate_space_fixed(
    vm_map_t                map,
    vm_map_offset_t         start,
    vm_map_size_t           size,
    vm_map_offset_t         mask,
    vm_map_kernel_flags_t   vmk_flags,
    vm_map_entry_t         *entry_out,
    vm_map_zap_t            zap_list)
{
    vm_map_offset_t effective_min_offset, effective_max_offset;
    vm_map_entry_t  entry;
    vm_map_offset_t end;

    assert(vmk_flags.vmf_fixed);

    effective_min_offset = map->min_offset;
    effective_max_offset = map->max_offset;

    if (vmk_flags.vmkf_beyond_max) {
        /*
         * Allow an insertion beyond the map's max offset.
         */
        effective_max_offset = 0x00000000FFFFF000ULL;
        if (vm_map_is_64bit(map)) {
            effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
        }
#if XNU_TARGET_OS_OSX
    } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
        effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
#endif /* XNU_TARGET_OS_OSX */
    }

    if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
        !vmk_flags.vmf_overwrite &&
        map->pmap == kernel_pmap &&
        vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
        /*
         * Force realloc() to switch to a new allocation,
         * to prevent 4k-fragmented virtual ranges.
         */
        // DEBUG4K_ERROR("no realloc in place");
        return KERN_NO_SPACE;
    }

    /*
     * Verify that:
     * the address doesn't itself violate
     * the mask requirement.
     */
    if ((start & mask) != 0) {
        return KERN_NO_SPACE;
    }

#if CONFIG_MAP_RANGES
    if (map->uses_user_ranges) {
        struct mach_vm_range r;

        vm_map_user_range_resolve(map, start, 1, &r);
        if (r.max_address == 0) {
            return KERN_INVALID_ADDRESS;
        }
        effective_min_offset = r.min_address;
        effective_max_offset = r.max_address;
    }
#endif /* CONFIG_MAP_RANGES */

    if ((startup_phase >= STARTUP_SUB_KMEM) &&
        !vmk_flags.vmkf_submap &&
        (map == kernel_map)) {
        mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
        effective_min_offset = r->min_address;
        effective_max_offset = r->max_address;
    }

    /*
     * ... the address is within bounds
     */
    end = start + size;
    if ((start < effective_min_offset) ||
        (end > effective_max_offset) ||
        (start >= end)) {
        return KERN_INVALID_ADDRESS;
    }

    if (vmk_flags.vmf_overwrite) {
        vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
        kern_return_t remove_kr;

        /*
         * Fixed mapping and "overwrite" flag: attempt to
         * remove all existing mappings in the specified
         * address range, saving them in our "zap_list".
         *
         * This avoids releasing the VM map lock in
         * vm_map_entry_delete() and allows atomicity
         * when we want to replace some mappings with a new one.
         * It also allows us to restore the old VM mappings if the
         * new mapping fails.
         */
        remove_flags |= VM_MAP_REMOVE_NO_YIELD;

        if (vmk_flags.vmkf_overwrite_immutable) {
            /* we can overwrite immutable mappings */
            remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
        }
        if (vmk_flags.vmkf_remap_prot_copy) {
            remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
        }
        remove_kr = vm_map_delete(map, start, end, remove_flags,
            KMEM_GUARD_NONE, zap_list).kmr_return;
        if (remove_kr) {
            /* XXX FBDP restore zap_list? */
            return remove_kr;
        }
    }

    /*
     * ... the starting address isn't allocated
     */
    if (vm_map_lookup_entry(map, start, &entry)) {
        *entry_out = entry;
        return KERN_MEMORY_PRESENT;
    }

    /*
     * ... the next region doesn't overlap the
     *     end point.
     */
    if ((entry->vme_next != vm_map_to_entry(map)) &&
        (entry->vme_next->vme_start < end)) {
        return KERN_NO_SPACE;
    }

    *entry_out = entry;
    return KERN_SUCCESS;
}

/*
 *	Routine:	vm_map_find_space
 *	Purpose:
 *		Allocate a range in the specified virtual address map,
 *		returning the entry allocated for that range.
 *		Used by kmem_alloc, etc.
 *
 *		The map must NOT be locked.
 *		It will be returned locked
 *		on KERN_SUCCESS, unlocked on failure.
 *
 *		If an entry is allocated, the object/offset fields
 *		are initialized to zero.
 */
kern_return_t
vm_map_find_space(
    vm_map_t                map,
    vm_map_offset_t         hint_address,
    vm_map_size_t           size,
    vm_map_offset_t         mask,
    vm_map_kernel_flags_t   vmk_flags,
    vm_map_entry_t         *o_entry)    /* OUT */
{
    vm_map_entry_t  new_entry, entry;
    kern_return_t   kr;

    if (size == 0) {
        return KERN_INVALID_ARGUMENT;
    }

    new_entry = vm_map_entry_create(map);
    new_entry->use_pmap = true;
    new_entry->protection = VM_PROT_DEFAULT;
    new_entry->max_protection = VM_PROT_ALL;

    if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
        new_entry->map_aligned = true;
    }
    if (vmk_flags.vmf_permanent) {
        new_entry->vme_permanent = true;
    }

    vm_map_lock(map);

    kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
        &hint_address, &entry);
    if (kr != KERN_SUCCESS) {
        vm_map_unlock(map);
        vm_map_entry_dispose(new_entry);
        return kr;
    }

    new_entry->vme_start = hint_address;
    new_entry->vme_end = hint_address + size;

    /*
     * At this point,
     *
     * - new_entry's "vme_start" and "vme_end" should define
     *   the endpoints of the available new range,
     *
     * - and "entry" should refer to the region before
     *   the new range,
     *
     * - and the map should still be locked.
     */
    assert(page_aligned(new_entry->vme_start));
    assert(page_aligned(new_entry->vme_end));
    assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
    assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

    /*
     * Insert the new entry into the list
     */
    vm_map_store_entry_link(map, entry, new_entry,
        VM_MAP_KERNEL_FLAGS_NONE);
    map->size += size;

    /*
     * Update the lookup hint
     */
    SAVE_HINT_MAP_WRITE(map, new_entry);

    *o_entry = new_entry;
    return KERN_SUCCESS;
}

int vm_map_pmap_enter_print = FALSE;
int vm_map_pmap_enter_enable = FALSE;

/*
 *	Routine:	vm_map_pmap_enter [internal only]
 *
 *	Description:
 *		Force pages from the specified object to be entered into
 *		the pmap at the specified address if they are present.
 *		As soon as a page is not found in the object, the scan ends.
 *
 *	Returns:
 *		Nothing.
 *
 *	In/out conditions:
 *		The source map should not be locked on entry.
 */
__unused static void
vm_map_pmap_enter(
    vm_map_t                map,
    vm_map_offset_t         addr,
    vm_map_offset_t         end_addr,
    vm_object_t             object,
    vm_object_offset_t      offset,
    vm_prot_t               protection)
{
    int                     type_of_fault;
    kern_return_t           kr;
    uint8_t                 object_lock_type = 0;
    struct vm_object_fault_info fault_info = {};

    if (map->pmap == 0) {
        return;
    }

    assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

    while (addr < end_addr) {
        vm_page_t m;

        /*
         * TODO:
         * From vm_map_enter(), we come into this function without the map
         * lock held or the object lock held.
         * We haven't taken a reference on the object either.
         * We should do a proper lookup on the map to make sure
         * that things are sane before we go locking objects that
         * could have been deallocated from under us.
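         *
         * A safer (hypothetical) shape would re-validate through the
         * map before locking the object, along these lines:
         *
         *     vm_map_lock_read(map);
         *     if (vm_map_lookup_entry(map, addr, &entry) &&
         *         !entry->is_sub_map && VME_OBJECT(entry) == object) {
         *         vm_object_lock(object);    // object known live here
         *     }
         *     vm_map_unlock_read(map);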
*/ object_lock_type = OBJECT_LOCK_EXCLUSIVE; vm_object_lock(object); m = vm_page_lookup(object, offset); if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious || (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) { vm_object_unlock(object); return; } if (vm_map_pmap_enter_print) { printf("vm_map_pmap_enter:"); printf("map: %p, addr: %llx, object: %p, offset: %llx\n", map, (unsigned long long)addr, object, (unsigned long long)offset); } type_of_fault = DBG_CACHE_HIT_FAULT; kr = vm_fault_enter(m, map->pmap, addr, PAGE_SIZE, 0, protection, protection, VM_PAGE_WIRED(m), FALSE, /* change_wiring */ VM_KERN_MEMORY_NONE, /* tag - not wiring */ &fault_info, NULL, /* need_retry */ &type_of_fault, &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/ vm_object_unlock(object); offset += PAGE_SIZE_64; addr += PAGE_SIZE; } } #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000 static kern_return_t vm_map_random_address_for_size( vm_map_t map, vm_map_offset_t *address, vm_map_size_t size, vm_map_kernel_flags_t vmk_flags) { kern_return_t kr = KERN_SUCCESS; int tries = 0; vm_map_offset_t random_addr = 0; vm_map_offset_t hole_end; vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL; vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL; vm_map_size_t vm_hole_size = 0; vm_map_size_t addr_space_size; bool is_kmem_ptr; struct mach_vm_range effective_range; effective_range = vm_map_get_range(map, address, &vmk_flags, size, &is_kmem_ptr); addr_space_size = effective_range.max_address - effective_range.min_address; if (size >= addr_space_size) { return KERN_NO_SPACE; } addr_space_size -= size; assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))); while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) { if (startup_phase < STARTUP_SUB_ZALLOC) { random_addr = (vm_map_offset_t)early_random(); } else { random_addr = (vm_map_offset_t)random(); } random_addr <<= VM_MAP_PAGE_SHIFT(map); random_addr = vm_map_trunc_page( effective_range.min_address + (random_addr % addr_space_size), VM_MAP_PAGE_MASK(map)); #if CONFIG_PROB_GZALLOC if (map->pmap == kernel_pmap && pgz_owned(random_addr)) { continue; } #endif /* CONFIG_PROB_GZALLOC */ if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) { if (prev_entry == vm_map_to_entry(map)) { next_entry = vm_map_first_entry(map); } else { next_entry = prev_entry->vme_next; } if (next_entry == vm_map_to_entry(map)) { hole_end = vm_map_max(map); } else { hole_end = next_entry->vme_start; } vm_hole_size = hole_end - random_addr; if (vm_hole_size >= size) { *address = random_addr; break; } } tries++; } if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) { kr = KERN_NO_SPACE; } return kr; } static boolean_t vm_memory_malloc_no_cow( int alias) { uint64_t alias_mask; if (!malloc_no_cow) { return FALSE; } if (alias > 63) { return FALSE; } alias_mask = 1ULL << alias; if (alias_mask & vm_memory_malloc_no_cow_mask) { return TRUE; } return FALSE; } uint64_t vm_map_enter_RLIMIT_AS_count = 0; uint64_t vm_map_enter_RLIMIT_DATA_count = 0; /* * Routine: vm_map_enter * * Description: * Allocate a range in the specified virtual address map. * The resulting range will refer to memory defined by * the given memory object and offset into that object. * * Arguments are as defined in the vm_map call. 
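 *
 *		For illustration only, a hedged sketch of an anonymous
 *		"anywhere" allocation (argument choices depend on the
 *		caller; "map", "size" and "kr" are assumed to exist):
 *
 *			vm_map_offset_t addr = 0;
 *			vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
 *
 *			// vmf_fixed left false, i.e. "anywhere"
 *			kr = vm_map_enter(map, &addr, size, 0, vmk_flags,
 *			    VM_OBJECT_NULL, 0, FALSE,
 *			    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);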
*/ static unsigned int vm_map_enter_restore_successes = 0; static unsigned int vm_map_enter_restore_failures = 0; kern_return_t vm_map_enter( vm_map_t map, vm_map_offset_t *address, /* IN/OUT */ vm_map_size_t size, vm_map_offset_t mask, vm_map_kernel_flags_t vmk_flags, vm_object_t object, vm_object_offset_t offset, boolean_t needs_copy, vm_prot_t cur_protection, vm_prot_t max_protection, vm_inherit_t inheritance) { vm_map_entry_t entry, new_entry; vm_map_offset_t start, tmp_start, tmp_offset; vm_map_offset_t end, tmp_end; vm_map_offset_t tmp2_start, tmp2_end; vm_map_offset_t step; kern_return_t result = KERN_SUCCESS; bool map_locked = FALSE; bool pmap_empty = TRUE; bool new_mapping_established = FALSE; const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked; const bool anywhere = !vmk_flags.vmf_fixed; const bool purgable = vmk_flags.vmf_purgeable; const bool no_cache = vmk_flags.vmf_no_cache; const bool is_submap = vmk_flags.vmkf_submap; const bool permanent = vmk_flags.vmf_permanent; const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read; const bool entry_for_jit = vmk_flags.vmkf_map_jit; const bool iokit_acct = vmk_flags.vmkf_iokit_acct; const bool resilient_codesign = vmk_flags.vmf_resilient_codesign; const bool resilient_media = vmk_flags.vmf_resilient_media; const bool entry_for_tpro = vmk_flags.vmf_tpro; const unsigned int superpage_size = vmk_flags.vmf_superpage_size; const vm_tag_t alias = vmk_flags.vm_tag; vm_tag_t user_alias; kern_return_t kr; bool clear_map_aligned = FALSE; vm_map_size_t chunk_size = 0; vm_object_t caller_object; VM_MAP_ZAP_DECLARE(zap_old_list); VM_MAP_ZAP_DECLARE(zap_new_list); caller_object = object; assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused); if (vmk_flags.vmf_4gb_chunk) { #if defined(__LP64__) chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */ #else /* __LP64__ */ chunk_size = ANON_CHUNK_SIZE; #endif /* __LP64__ */ } else { chunk_size = ANON_CHUNK_SIZE; } if (superpage_size) { if (object != VM_OBJECT_NULL) { /* caller can't provide their own VM object */ return KERN_INVALID_ARGUMENT; } switch (superpage_size) { /* * Note that the current implementation only supports * a single size for superpages, SUPERPAGE_SIZE, per * architecture. As soon as more sizes are supposed * to be supported, SUPERPAGE_SIZE has to be replaced * with a lookup of the size depending on superpage_size. */ #ifdef __x86_64__ case SUPERPAGE_SIZE_ANY: /* handle it like 2 MB and round up to page size */ size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1); OS_FALLTHROUGH; case SUPERPAGE_SIZE_2MB: break; #endif default: return KERN_INVALID_ARGUMENT; } mask = SUPERPAGE_SIZE - 1; if (size & (SUPERPAGE_SIZE - 1)) { return KERN_INVALID_ARGUMENT; } inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */ } if ((cur_protection & VM_PROT_WRITE) && (cur_protection & VM_PROT_EXECUTE) && #if XNU_TARGET_OS_OSX map->pmap != kernel_pmap && (cs_process_global_enforcement() || (vmk_flags.vmkf_cs_enforcement_override ? 
vmk_flags.vmkf_cs_enforcement : (vm_map_cs_enforcement(map) #if __arm64__ || !VM_MAP_IS_EXOTIC(map) #endif /* __arm64__ */ ))) && #endif /* XNU_TARGET_OS_OSX */ #if CODE_SIGNING_MONITOR (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) && #endif (VM_MAP_POLICY_WX_FAIL(map) || VM_MAP_POLICY_WX_STRIP_X(map)) && !entry_for_jit) { boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map); DTRACE_VM3(cs_wx, uint64_t, 0, uint64_t, 0, vm_prot_t, cur_protection); printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, (vm_protect_wx_fail ? "failing" : "turning off execute")); cur_protection &= ~VM_PROT_EXECUTE; if (vm_protect_wx_fail) { return KERN_PROTECTION_FAILURE; } } if (entry_for_jit && cur_protection != VM_PROT_ALL) { /* * Native macOS processes and all non-macOS processes are * expected to create JIT regions via mmap(MAP_JIT, RWX) but * the RWX requirement was not enforced, and thus, we must live * with our sins. We are now dealing with a JIT mapping without * RWX. * * We deal with these by letting the MAP_JIT stick in order * to avoid CS violations when these pages are mapped executable * down the line. In order to appease the page table monitor (you * know what I'm talking about), these pages will end up being * marked as XNU_USER_DEBUG, which will be allowed because we * don't enforce the code signing monitor on macOS systems. If * the user-space application ever changes permissions to RWX, * which they are allowed to since the mapping was originally * created with MAP_JIT, then they'll switch over to using the * XNU_USER_JIT type, and won't be allowed to downgrade any * more after that. * * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is * strictly disallowed. */ #if XNU_TARGET_OS_OSX /* * Continue to allow non-RWX JIT */ #else /* non-macOS: reject JIT regions without RWX */ DTRACE_VM3(cs_wx, uint64_t, 0, uint64_t, 0, vm_prot_t, cur_protection); printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, cur_protection); return KERN_PROTECTION_FAILURE; #endif } /* * If the task has requested executable lockdown, * deny any new executable mapping. */ if (map->map_disallow_new_exec == TRUE) { if (cur_protection & VM_PROT_EXECUTE) { return KERN_PROTECTION_FAILURE; } } if (resilient_codesign) { assert(!is_submap); int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC)); if ((cur_protection | max_protection) & reject_prot) { return KERN_PROTECTION_FAILURE; } } if (resilient_media) { assert(!is_submap); // assert(!needs_copy); if (object != VM_OBJECT_NULL && !object->internal) { /* * This mapping is directly backed by an external * memory manager (e.g. a vnode pager for a file): * we would not have any safe place to inject * a zero-filled page if an actual page is not * available, without possibly impacting the actual * contents of the mapped object (e.g. the file), * so we can't provide any media resiliency here. */ return KERN_INVALID_ARGUMENT; } } if (entry_for_tpro) { /* * TPRO overrides the effective permissions of the region * and explicitly maps as RW. Ensure we have been passed * the expected permissions. We accept `cur_protections` * RO as that will be handled on fault. 
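	 *
	 * For illustration: a request of cur=RW/max=RW on a TPRO mapping
	 * is accepted and entered as cur=R/max=RW, while cur=R/max=R is
	 * rejected because max_protection lacks VM_PROT_WRITE.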
         */
        if (!(max_protection & VM_PROT_READ) ||
            !(max_protection & VM_PROT_WRITE) ||
            !(cur_protection & VM_PROT_READ)) {
            return KERN_PROTECTION_FAILURE;
        }

        /*
         * We can now downgrade the cur_protection to RO. This is a mild lie
         * to the VM layer. But TPRO will be responsible for toggling the
         * protections between RO/RW
         */
        cur_protection = VM_PROT_READ;
    }

    if (is_submap) {
        vm_map_t submap;

        if (purgable) {
            /* submaps can not be purgeable */
            return KERN_INVALID_ARGUMENT;
        }
        if (object == VM_OBJECT_NULL) {
            /* submaps can not be created lazily */
            return KERN_INVALID_ARGUMENT;
        }
        submap = (vm_map_t) object;
        if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
            /* page size mismatch */
            return KERN_INVALID_ARGUMENT;
        }
    }

    if (vmk_flags.vmkf_already) {
        /*
         * VM_FLAGS_ALREADY says that it's OK if the same mapping
         * is already present. For it to be meaningful, the requested
         * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
         * we shouldn't try and remove what was mapped there first
         * (!VM_FLAGS_OVERWRITE).
         */
        if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
            return KERN_INVALID_ARGUMENT;
        }
    }

    if (size == 0 ||
        (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
        *address = 0;
        return KERN_INVALID_ARGUMENT;
    }

    if (map->pmap == kernel_pmap) {
        user_alias = VM_KERN_MEMORY_NONE;
    } else {
        user_alias = alias;
    }

    if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
        chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
    }

#define RETURN(value)   { result = value; goto BailOut; }

    assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
    assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
    if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
        assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
        assertf(page_aligned(size), "0x%llx", (uint64_t)size);
    }

    if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
        !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
        /*
         * In most cases, the caller rounds the size up to the
         * map's page size.
         * If we get a size that is explicitly not map-aligned here,
         * we'll have to respect the caller's wish and mark the
         * mapping as "not map-aligned" to avoid tripping the
         * map alignment checks later.
         */
        clear_map_aligned = TRUE;
    }
    if (!anywhere &&
        VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
        !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
        /*
         * We've been asked to map at a fixed address and that
         * address is not aligned to the map's specific alignment.
         * The caller should know what it's doing (i.e. most likely
         * mapping some fragmented copy map, transferring memory from
         * a VM map with a different alignment), so clear map_aligned
         * for this new VM map entry and proceed.
         */
        clear_map_aligned = TRUE;
    }

    /*
     * Only zero-fill objects are allowed to be purgable.
     * LP64todo - limit purgable objects to 32-bits for now
     */
    if (purgable &&
        (offset != 0 ||
        (object != VM_OBJECT_NULL &&
        (object->vo_size != size ||
        object->purgable == VM_PURGABLE_DENY))
#if __LP64__
        || size > ANON_MAX_SIZE
#endif
        )) {
        return KERN_INVALID_ARGUMENT;
    }

    vm_map_lock(map);
    map_locked = TRUE;

    if (anywhere) {
        result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
            address, &entry);
        start = *address;
    } else {
        start = *address;
        result = vm_map_locate_space_fixed(map, start, size, mask,
            vmk_flags, &entry, &zap_old_list);
    }
    end = start + size;

    assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));

    /*
     * Check if what's already there is what we want.
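     *
     * (KERN_MEMORY_PRESENT can only come back from
     * vm_map_locate_space_fixed(). A hypothetical caller that passed
     * VM_FLAGS_ALREADY treats it as "already mapped as requested":
     *
     *     kr = vm_map_enter(map, &addr, size, 0, vmk_flags,
     *         object, offset, FALSE, cur, max, inherit);
     *     if (kr == KERN_MEMORY_PRESENT) {
     *         // identical mapping already in place: not an error
     *     }
     *
     * since the matching loop below ends in RETURN(KERN_MEMORY_PRESENT).)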
*/ if (result == KERN_MEMORY_PRESENT) { assert(!anywhere); if (!(vmk_flags.vmkf_already)) { RETURN(KERN_NO_SPACE); } tmp_start = start; tmp_offset = offset; if (entry->vme_start < start) { tmp_start -= start - entry->vme_start; tmp_offset -= start - entry->vme_start; } for (; entry->vme_start < end; entry = entry->vme_next) { /* * Check if the mapping's attributes * match the existing map entry. */ if (entry == vm_map_to_entry(map) || entry->vme_start != tmp_start || entry->is_sub_map != is_submap || VME_OFFSET(entry) != tmp_offset || entry->needs_copy != needs_copy || entry->protection != cur_protection || entry->max_protection != max_protection || entry->inheritance != inheritance || entry->iokit_acct != iokit_acct || VME_ALIAS(entry) != alias) { /* not the same mapping ! */ RETURN(KERN_NO_SPACE); } /* * Check if the same object is being mapped. */ if (is_submap) { if (VME_SUBMAP(entry) != (vm_map_t) object) { /* not the same submap */ RETURN(KERN_NO_SPACE); } } else { if (VME_OBJECT(entry) != object) { /* not the same VM object... */ vm_object_t obj2; obj2 = VME_OBJECT(entry); if ((obj2 == VM_OBJECT_NULL || obj2->internal) && (object == VM_OBJECT_NULL || object->internal)) { /* * ... but both are * anonymous memory, * so equivalent. */ } else { RETURN(KERN_NO_SPACE); } } } tmp_offset += entry->vme_end - entry->vme_start; tmp_start += entry->vme_end - entry->vme_start; if (entry->vme_end >= end) { /* reached the end of our mapping */ break; } } /* it all matches: let's use what's already there ! */ RETURN(KERN_MEMORY_PRESENT); } if (result != KERN_SUCCESS) { goto BailOut; } /* * At this point, * "start" and "end" should define the endpoints of the * available new range, and * "entry" should refer to the region before the new * range, and * * the map should be locked. */ /* * See whether we can avoid creating a new entry (and object) by * extending one of our neighbors. [So far, we only attempt to * extend from below.] Note that we can never extend/join * purgable objects because they need to remain distinct * entities in order to implement their "volatile object" * semantics. */ if (purgable || entry_for_jit || entry_for_tpro || vm_memory_malloc_no_cow(user_alias)) { if (superpage_size) { /* * For "super page" allocations, we will allocate * special physically-contiguous VM objects later on, * so we should not have flags instructing us to create * a differently special VM object here. */ RETURN(KERN_INVALID_ARGUMENT); } if (object == VM_OBJECT_NULL) { assert(!superpage_size); object = vm_object_allocate(size); vm_object_lock(object); object->copy_strategy = MEMORY_OBJECT_COPY_NONE; VM_OBJECT_SET_TRUE_SHARE(object, FALSE); if (malloc_no_cow_except_fork && !purgable && !entry_for_jit && !entry_for_tpro && vm_memory_malloc_no_cow(user_alias)) { object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK; VM_OBJECT_SET_TRUE_SHARE(object, TRUE); } if (entry_for_jit) { object->vo_inherit_copy_none = true; } if (purgable) { task_t owner; VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE); if (map->pmap == kernel_pmap) { /* * Purgeable mappings made in a kernel * map are "owned" by the kernel itself * rather than the current user task * because they're likely to be used by * more than this user task (see * execargs_purgeable_allocate(), for * example). 
*/ owner = kernel_task; } else { owner = current_task(); } assert(object->vo_owner == NULL); assert(object->resident_page_count == 0); assert(object->wired_page_count == 0); vm_purgeable_nonvolatile_enqueue(object, owner); } vm_object_unlock(object); offset = (vm_object_offset_t)0; } } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { /* no coalescing if address space uses sub-pages */ } else if ((is_submap == FALSE) && (object == VM_OBJECT_NULL) && (entry != vm_map_to_entry(map)) && (entry->vme_end == start) && (!entry->is_shared) && (!entry->is_sub_map) && (!entry->in_transition) && (!entry->needs_wakeup) && (entry->behavior == VM_BEHAVIOR_DEFAULT) && (entry->protection == cur_protection) && (entry->max_protection == max_protection) && (entry->inheritance == inheritance) && ((user_alias == VM_MEMORY_REALLOC) || (VME_ALIAS(entry) == alias)) && (entry->no_cache == no_cache) && (entry->vme_permanent == permanent) && /* no coalescing for immutable executable mappings */ !((entry->protection & VM_PROT_EXECUTE) && entry->vme_permanent) && (!entry->superpage_size && !superpage_size) && /* * No coalescing if not map-aligned, to avoid propagating * that condition any further than needed: */ (!entry->map_aligned || !clear_map_aligned) && (!entry->zero_wired_pages) && (!entry->used_for_jit && !entry_for_jit) && #if __arm64e__ (!entry->used_for_tpro && !entry_for_tpro) && #endif (!entry->csm_associated) && (entry->iokit_acct == iokit_acct) && (!entry->vme_resilient_codesign) && (!entry->vme_resilient_media) && (!entry->vme_atomic) && (entry->vme_no_copy_on_read == no_copy_on_read) && ((entry->vme_end - entry->vme_start) + size <= (user_alias == VM_MEMORY_REALLOC ? ANON_CHUNK_SIZE : NO_COALESCE_LIMIT)) && (entry->wired_count == 0)) { /* implies user_wired_count == 0 */ if (vm_object_coalesce(VME_OBJECT(entry), VM_OBJECT_NULL, VME_OFFSET(entry), (vm_object_offset_t) 0, (vm_map_size_t)(entry->vme_end - entry->vme_start), (vm_map_size_t)(end - entry->vme_end))) { /* * Coalesced the two objects - can extend * the previous map entry to include the * new range. */ map->size += (end - entry->vme_end); assert(entry->vme_start < end); assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); if (__improbable(vm_debug_events)) { DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end); } entry->vme_end = end; if (map->holelistenabled) { vm_map_store_update_first_free(map, entry, TRUE); } else { vm_map_store_update_first_free(map, map->first_free, TRUE); } new_mapping_established = TRUE; RETURN(KERN_SUCCESS); } } step = superpage_size ? SUPERPAGE_SIZE : (end - start); new_entry = NULL; if (vmk_flags.vmkf_submap_adjust) { vm_map_adjust_offsets((vm_map_t)caller_object, start, end); offset = start; } for (tmp2_start = start; tmp2_start < end; tmp2_start += step) { tmp2_end = tmp2_start + step; /* * Create a new entry * * XXX FBDP * The reserved "page zero" in each process's address space can * be arbitrarily large. Splitting it into separate objects and * therefore different VM map entries serves no purpose and just * slows down operations on the VM map, so let's not split the * allocation into chunks if the max protection is NONE. That * memory should never be accessible, so it will never get to the * default pager. 
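         *
         * For illustration: with a chunk size of C, an anonymous
         * mapping of 2.5*C is entered as three entries backed by three
         * objects (C + C + 0.5*C), while the same mapping with
         * max_protection VM_PROT_NONE stays a single entry.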
*/ tmp_start = tmp2_start; if (!is_submap && object == VM_OBJECT_NULL && size > chunk_size && max_protection != VM_PROT_NONE && superpage_size == 0) { tmp_end = tmp_start + chunk_size; } else { tmp_end = tmp2_end; } do { if (!is_submap && object != VM_OBJECT_NULL && object->internal && offset + (tmp_end - tmp_start) > object->vo_size) { // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start)); DTRACE_VM5(vm_map_enter_overmap, vm_map_t, map, vm_map_address_t, tmp_start, vm_map_address_t, tmp_end, vm_object_offset_t, offset, vm_object_size_t, object->vo_size); } new_entry = vm_map_entry_insert(map, entry, tmp_start, tmp_end, object, offset, vmk_flags, needs_copy, cur_protection, max_protection, (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ? VM_INHERIT_NONE : inheritance), clear_map_aligned); assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias)); if (resilient_codesign) { int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC)); if (!((cur_protection | max_protection) & reject_prot)) { new_entry->vme_resilient_codesign = TRUE; } } if (resilient_media && (object == VM_OBJECT_NULL || object->internal)) { new_entry->vme_resilient_media = TRUE; } assert(!new_entry->iokit_acct); if (!is_submap && object != VM_OBJECT_NULL && object->internal && (object->purgable != VM_PURGABLE_DENY || object->vo_ledger_tag)) { assert(new_entry->use_pmap); assert(!new_entry->iokit_acct); /* * Turn off pmap accounting since * purgeable (or tagged) objects have their * own ledgers. */ new_entry->use_pmap = FALSE; } else if (!is_submap && iokit_acct && object != VM_OBJECT_NULL && object->internal) { /* alternate accounting */ assert(!new_entry->iokit_acct); assert(new_entry->use_pmap); new_entry->iokit_acct = TRUE; new_entry->use_pmap = FALSE; DTRACE_VM4( vm_map_iokit_mapped_region, vm_map_t, map, vm_map_offset_t, new_entry->vme_start, vm_map_offset_t, new_entry->vme_end, int, VME_ALIAS(new_entry)); vm_map_iokit_mapped_region( map, (new_entry->vme_end - new_entry->vme_start)); } else if (!is_submap) { assert(!new_entry->iokit_acct); assert(new_entry->use_pmap); } if (is_submap) { vm_map_t submap; boolean_t submap_is_64bit; boolean_t use_pmap; assert(new_entry->is_sub_map); assert(!new_entry->use_pmap); assert(!new_entry->iokit_acct); submap = (vm_map_t) object; submap_is_64bit = vm_map_is_64bit(submap); use_pmap = vmk_flags.vmkf_nested_pmap; #ifndef NO_NESTED_PMAP if (use_pmap && submap->pmap == NULL) { ledger_t ledger = map->pmap->ledger; /* we need a sub pmap to nest... */ submap->pmap = pmap_create_options(ledger, 0, submap_is_64bit ? PMAP_CREATE_64BIT : 0); if (submap->pmap == NULL) { /* let's proceed without nesting... */ } #if defined(__arm64__) else { pmap_set_nested(submap->pmap); } #endif } if (use_pmap && submap->pmap != NULL) { if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) { DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap)); kr = KERN_FAILURE; } else { kr = pmap_nest(map->pmap, submap->pmap, tmp_start, tmp_end - tmp_start); } if (kr != KERN_SUCCESS) { printf("vm_map_enter: " "pmap_nest(0x%llx,0x%llx) " "error 0x%x\n", (long long)tmp_start, (long long)tmp_end, kr); } else { /* we're now nested ! 
*/ new_entry->use_pmap = TRUE; pmap_empty = FALSE; } } #endif /* NO_NESTED_PMAP */ } entry = new_entry; if (superpage_size) { vm_page_t pages, m; vm_object_t sp_object; vm_object_offset_t sp_offset; assert(object == VM_OBJECT_NULL); VME_OFFSET_SET(entry, 0); /* allocate one superpage */ kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0); if (kr != KERN_SUCCESS) { /* deallocate whole range... */ new_mapping_established = TRUE; /* ... but only up to "tmp_end" */ size -= end - tmp_end; RETURN(kr); } /* create one vm_object per superpage */ sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start)); vm_object_lock(sp_object); sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE); sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE; VME_OBJECT_SET(entry, sp_object, false, 0); assert(entry->use_pmap); /* enter the base pages into the object */ for (sp_offset = 0; sp_offset < SUPERPAGE_SIZE; sp_offset += PAGE_SIZE) { m = pages; pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m)); pages = NEXT_PAGE(m); *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL; vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK); } vm_object_unlock(sp_object); } } while (tmp_end != tmp2_end && (tmp_start = tmp_end) && (tmp_end = (tmp2_end - tmp_end > chunk_size) ? tmp_end + chunk_size : tmp2_end)); } new_mapping_established = TRUE; BailOut: assert(map_locked == TRUE); /* * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA): * If we have identified and possibly established the new mapping(s), * make sure we did not go beyond the address space limit. */ if (result == KERN_SUCCESS) { if (map->size_limit != RLIM_INFINITY && map->size > map->size_limit) { /* * Establishing the requested mappings would exceed * the process's RLIMIT_AS limit: fail with * KERN_NO_SPACE. */ result = KERN_NO_SPACE; printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, (uint64_t) map->size, (uint64_t) map->size_limit); DTRACE_VM2(vm_map_enter_RLIMIT_AS, vm_map_size_t, map->size, uint64_t, map->size_limit); vm_map_enter_RLIMIT_AS_count++; } else if (map->data_limit != RLIM_INFINITY && map->size > map->data_limit) { /* * Establishing the requested mappings would exceed * the process's RLIMIT_DATA limit: fail with * KERN_NO_SPACE. */ result = KERN_NO_SPACE; printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, (uint64_t) map->size, (uint64_t) map->data_limit); DTRACE_VM2(vm_map_enter_RLIMIT_DATA, vm_map_size_t, map->size, uint64_t, map->data_limit); vm_map_enter_RLIMIT_DATA_count++; } } if (result == KERN_SUCCESS) { vm_prot_t pager_prot; memory_object_t pager; #if DEBUG if (pmap_empty && !(vmk_flags.vmkf_no_pmap_check)) { assert(pmap_is_empty(map->pmap, *address, *address + size)); } #endif /* DEBUG */ /* * For "named" VM objects, let the pager know that the * memory object is being mapped. Some pagers need to keep * track of this, to know when they can reclaim the memory * object, for example. * VM calls memory_object_map() for each mapping (specifying * the protection of each mapping) and calls * memory_object_last_unmap() when all the mappings are gone. 
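         *
         * Sketch of the resulting pairing for a named object (purely
         * illustrative; the calls come from several places in the VM):
         *
         *     memory_object_map(pager, prot_a);     first mapping
         *     memory_object_map(pager, prot_b);     second mapping
         *     ...
         *     memory_object_last_unmap(pager);      last mapping gone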
         */
        pager_prot = max_protection;
        if (needs_copy) {
            /*
             * Copy-On-Write mapping: won't modify
             * the memory object.
             */
            pager_prot &= ~VM_PROT_WRITE;
        }
        if (!is_submap &&
            object != VM_OBJECT_NULL &&
            object->named &&
            object->pager != MEMORY_OBJECT_NULL) {
            vm_object_lock(object);
            pager = object->pager;
            if (object->named &&
                pager != MEMORY_OBJECT_NULL) {
                assert(object->pager_ready);
                vm_object_mapping_wait(object, THREAD_UNINT);
                /* object might have lost its pager while waiting */
                pager = object->pager;
                if (object->named &&
                    pager != MEMORY_OBJECT_NULL) {
                    vm_object_mapping_begin(object);
                    vm_object_unlock(object);

                    kr = memory_object_map(pager, pager_prot);
                    assert(kr == KERN_SUCCESS);

                    vm_object_lock(object);
                    vm_object_mapping_end(object);
                }
            }
            vm_object_unlock(object);
        }
    }

    assert(map_locked == TRUE);

    if (new_mapping_established) {
        /*
         * If we release the map lock for any reason below,
         * another thread could deallocate our new mapping,
         * releasing the caller's reference on "caller_object",
         * which was transferred to the mapping.
         * If this was the only reference, the object could be
         * destroyed.
         *
         * We need to take an extra reference on "caller_object"
         * to keep it alive if we need to return the caller's
         * reference to the caller in case of failure.
         */
        if (is_submap) {
            vm_map_reference((vm_map_t)caller_object);
        } else {
            vm_object_reference(caller_object);
        }
    }

    if (!keep_map_locked) {
        vm_map_unlock(map);
        map_locked = FALSE;
        entry = VM_MAP_ENTRY_NULL;
        new_entry = VM_MAP_ENTRY_NULL;
    }

    /*
     * We can't hold the map lock if we enter this block.
     */
    if (result == KERN_SUCCESS) {
        /*
         * Wire down the new entry if the user
         * requested all new map entries be wired.
         */
        if ((map->wiring_required) || (superpage_size)) {
            assert(!keep_map_locked);
            pmap_empty = FALSE; /* pmap won't be empty */
            kr = vm_map_wire_nested(map, start, end,
                cur_protection, VM_KERN_MEMORY_MLOCK,
                TRUE, PMAP_NULL, 0, NULL);
            result = kr;
        }
    }

    if (result != KERN_SUCCESS) {
        if (new_mapping_established) {
            vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;

            /*
             * We have to get rid of the new mappings since we
             * won't make them available to the user.
             * Try and do that atomically, to minimize the risk
             * that someone else creates new mappings in that range.
             */
            if (!map_locked) {
                vm_map_lock(map);
                map_locked = TRUE;
            }
            remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
            remove_flags |= VM_MAP_REMOVE_NO_YIELD;
            if (permanent) {
                remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
            }
            (void) vm_map_delete(map,
                *address, *address + size,
                remove_flags,
                KMEM_GUARD_NONE, &zap_new_list);
        }

        if (vm_map_zap_first_entry(&zap_old_list)) {
            vm_map_entry_t entry1, entry2;

            /*
             * The new mapping failed. Attempt to restore
             * the old mappings, saved in the "zap_old_map".
             */
            if (!map_locked) {
                vm_map_lock(map);
                map_locked = TRUE;
            }

            /* first check if the coast is still clear */
            start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
            end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
            if (vm_map_lookup_entry(map, start, &entry1) ||
                vm_map_lookup_entry(map, end, &entry2) ||
                entry1 != entry2) {
                /*
                 * Part of that range has already been
                 * re-mapped: we can't restore the old
                 * mappings...
                 */
                vm_map_enter_restore_failures++;
            } else {
                /*
                 * Transfer the saved map entries from
                 * "zap_old_map" to the original "map",
                 * inserting them all after "entry1".
                 */
                while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
                    vm_map_size_t entry_size;

                    entry_size = (entry2->vme_end -
                        entry2->vme_start);
                    vm_map_store_entry_link(map, entry1, entry2,
                        VM_MAP_KERNEL_FLAGS_NONE);
                    map->size += entry_size;
                    entry1 = entry2;
                }
                if (map->wiring_required) {
                    /*
                     * XXX TODO: we should rewire the
                     * old pages here...
                     */
                }
                vm_map_enter_restore_successes++;
            }
        }
    }

    /*
     * The caller is responsible for releasing the lock if it requested to
     * keep the map locked.
     */
    if (map_locked && !keep_map_locked) {
        vm_map_unlock(map);
    }

    vm_map_zap_dispose(&zap_old_list);
    vm_map_zap_dispose(&zap_new_list);

    if (new_mapping_established) {
        /*
         * The caller had a reference on "caller_object" and we
         * transferred that reference to the mapping.
         * We also took an extra reference on "caller_object" to keep
         * it alive while the map was unlocked.
         */
        if (result == KERN_SUCCESS) {
            /*
             * On success, the caller's reference on the object gets
             * transferred to the mapping.
             * Release our extra reference.
             */
            if (is_submap) {
                vm_map_deallocate((vm_map_t)caller_object);
            } else {
                vm_object_deallocate(caller_object);
            }
        } else {
            /*
             * On error, the caller expects to still have a
             * reference on the object it gave us.
             * Let's use our extra reference for that.
             */
        }
    }

    return result;

#undef RETURN
}

/*
 * Counters for the prefault optimization.
 */
int64_t vm_prefault_nb_pages = 0;
int64_t vm_prefault_nb_bailout = 0;

static kern_return_t
vm_map_enter_adjust_offset(
    vm_object_offset_t     *obj_offs,
    vm_object_offset_t     *obj_end,
    vm_object_offset_t      quantity)
{
    if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
        os_add_overflow(*obj_end, quantity, obj_end) ||
        vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
        return KERN_INVALID_ARGUMENT;
    }
    return KERN_SUCCESS;
}

static __attribute__((always_inline, warn_unused_result))
kern_return_t
vm_map_enter_mem_object_sanitize(
    vm_map_t                target_map,
    vm_map_offset_ut        address_u,
    vm_map_size_ut          initial_size_u,
    vm_map_offset_ut        mask_u,
    vm_object_offset_ut     offset_u,
    vm_prot_ut              cur_protection_u,
    vm_prot_ut              max_protection_u,
    vm_inherit_ut           inheritance_u,
    vm_map_kernel_flags_t   vmk_flags,
    ipc_port_t              port,
    vm_map_address_t       *map_addr,
    vm_map_size_t          *map_size,
    vm_map_offset_t        *mask,
    vm_object_offset_t     *obj_offs,
    vm_object_offset_t     *obj_end,
    vm_object_size_t       *obj_size,
    vm_prot_t              *cur_protection,
    vm_prot_t              *max_protection,
    vm_inherit_t           *inheritance)
{
    kern_return_t result;

    result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
        VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map, VM_PROT_IS_MASK,
        cur_protection, max_protection);
    if (__improbable(result != KERN_SUCCESS)) {
        return result;
    }
    result = vm_sanitize_inherit(inheritance_u,
        VM_SANITIZE_CALLER_ENTER_MEM_OBJ, inheritance);
    if (__improbable(result != KERN_SUCCESS)) {
        return result;
    }
    result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
        mask);
    if (__improbable(result != KERN_SUCCESS)) {
        return result;
    }

    if (vmk_flags.vmf_fixed) {
        vm_map_address_t map_end;

        result = vm_sanitize_addr_size(address_u, initial_size_u,
            VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
            VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
            VM_SANITIZE_FLAGS_REALIGN_START,
            map_addr, &map_end, map_size);
        if (__improbable(result != KERN_SUCCESS)) {
            return result;
        }
    } else {
        *map_addr = vm_sanitize_addr(target_map, address_u);
        result = vm_sanitize_size(0, initial_size_u,
            VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
            VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
        if (__improbable(result != KERN_SUCCESS)) {
            return result;
        }
    }

    *obj_size = vm_object_round_page(*map_size);
    if
(__improbable(*obj_size == 0)) { return KERN_INVALID_ARGUMENT; } if (IP_VALID(port)) { result = vm_sanitize_addr_size(offset_u, *obj_size, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, PAGE_MASK, VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, obj_offs, obj_end, obj_size); if (__improbable(result != KERN_SUCCESS)) { return result; } } else { *obj_offs = 0; *obj_end = *obj_size; } return KERN_SUCCESS; } kern_return_t vm_map_enter_mem_object( vm_map_t target_map, vm_map_offset_ut *address_u, vm_map_size_ut initial_size_u, vm_map_offset_ut mask_u, vm_map_kernel_flags_t vmk_flags, ipc_port_t port, vm_object_offset_ut offset_u, boolean_t copy, vm_prot_ut cur_protection_u, vm_prot_ut max_protection_u, vm_inherit_ut inheritance_u, upl_page_list_ptr_t page_list, unsigned int page_list_count) { vm_map_offset_t mask; vm_prot_t cur_protection; vm_prot_t max_protection; vm_inherit_t inheritance; vm_map_address_t map_addr, map_mask; vm_map_size_t map_size; vm_object_t object = VM_OBJECT_NULL; vm_object_offset_t obj_offs, obj_end; vm_object_size_t obj_size; kern_return_t result; boolean_t mask_cur_protection, mask_max_protection; boolean_t kernel_prefault, try_prefault = (page_list_count != 0); vm_map_offset_t offset_in_mapping = 0; if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) { /* XXX TODO4K prefaulting depends on page size... */ try_prefault = FALSE; } /* * Check arguments for validity */ if ((target_map == VM_MAP_NULL) || (try_prefault && (copy || !page_list))) { return KERN_INVALID_ARGUMENT; } map_mask = vm_map_page_mask(target_map); /* * Sanitize any input parameters that are addr/size/prot/inherit */ result = vm_map_enter_mem_object_sanitize( target_map, *address_u, initial_size_u, mask_u, offset_u, cur_protection_u, max_protection_u, inheritance_u, vmk_flags, port, &map_addr, &map_size, &mask, &obj_offs, &obj_end, &obj_size, &cur_protection, &max_protection, &inheritance); if (__improbable(result != KERN_SUCCESS)) { return vm_sanitize_get_kr(result); } assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused); vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size); mask_cur_protection = cur_protection & VM_PROT_IS_MASK; mask_max_protection = max_protection & VM_PROT_IS_MASK; cur_protection &= ~VM_PROT_IS_MASK; max_protection &= ~VM_PROT_IS_MASK; #if __arm64__ if (cur_protection & VM_PROT_EXECUTE) { cur_protection |= VM_PROT_READ; } #endif /* __arm64__ */ /* * Find the vm object (if any) corresponding to this port. 
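     *
     * Three cases are handled below:
     * - !IP_VALID(port):      anonymous memory, no backing object;
     * - IKOT_NAMED_ENTRY:     a mach_memory_entry backed by a submap,
     *                         a vm_map_copy, or a VM object;
     * - IKOT_MEMORY_OBJECT:   a raw memory object (see the JMM note
     *                         further below).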
*/ if (!IP_VALID(port)) { object = VM_OBJECT_NULL; copy = FALSE; } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) { vm_named_entry_t named_entry; vm_object_size_t initial_size; named_entry = mach_memory_entry_from_port(port); if (vmk_flags.vmf_return_data_addr || vmk_flags.vmf_return_4k_data_addr) { result = vm_map_enter_adjust_offset(&obj_offs, &obj_end, named_entry->data_offset); if (__improbable(result)) { return result; } } /* a few checks to make sure user is obeying rules */ if (mask_max_protection) { max_protection &= named_entry->protection; } if (mask_cur_protection) { cur_protection &= named_entry->protection; } if ((named_entry->protection & max_protection) != max_protection) { return KERN_INVALID_RIGHT; } if ((named_entry->protection & cur_protection) != cur_protection) { return KERN_INVALID_RIGHT; } /* * unwrap is safe because we know obj_size is larger and doesn't * overflow */ initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u); if (named_entry->size < obj_offs + initial_size) { return KERN_INVALID_ARGUMENT; } /* for a vm_map_copy, we can only map it whole */ if (named_entry->is_copy && (obj_size != named_entry->size) && (vm_map_round_page(obj_size, map_mask) == named_entry->size)) { /* XXX FBDP use the rounded size... */ obj_end += named_entry->size - obj_size; obj_size = named_entry->size; } if (named_entry->offset) { /* * the callers parameter offset is defined to be the * offset from beginning of named entry offset in object * * Because we checked above that * obj_offs + obj_size < named_entry_size * these overflow checks should be redundant... */ result = vm_map_enter_adjust_offset(&obj_offs, &obj_end, named_entry->offset); if (__improbable(result)) { return result; } } if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) { /* * Let's not map more than requested; * vm_map_enter() will handle this "not map-aligned" * case. */ map_size = obj_size; } named_entry_lock(named_entry); // rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum) assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map); if (named_entry->is_sub_map) { vm_map_t submap; assert(!named_entry->is_copy); assert(!named_entry->is_object); if (vmk_flags.vmf_return_data_addr || vmk_flags.vmf_return_4k_data_addr) { panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap."); } submap = named_entry->backing.map; vm_map_reference(submap); named_entry_unlock(named_entry); vmk_flags.vmkf_submap = TRUE; result = vm_map_enter(target_map, &map_addr, map_size, mask, vmk_flags, (vm_object_t)(uintptr_t) submap, obj_offs, copy, cur_protection, max_protection, inheritance); if (result != KERN_SUCCESS) { vm_map_deallocate(submap); return result; } /* * No need to lock "submap" just to check its * "mapped" flag: that flag is never reset * once it's been set and if we race, we'll * just end up setting it twice, which is OK. */ if (submap->mapped_in_other_pmaps == FALSE && vm_map_pmap(submap) != PMAP_NULL && vm_map_pmap(submap) != vm_map_pmap(target_map)) { /* * This submap is being mapped in a map * that uses a different pmap. * Set its "mapped_in_other_pmaps" flag * to indicate that we now need to * remove mappings from all pmaps rather * than just the submap's pmap. 
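             *
             * (Once this flag is set, operations that would otherwise
             * touch only the submap's own pmap, e.g. protection or
             * wiring updates and page removals, have to be applied to
             * every pmap the submap is mapped in.)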
*/ vm_map_lock(submap); submap->mapped_in_other_pmaps = TRUE; vm_map_unlock(submap); } goto out; } if (named_entry->is_copy) { kern_return_t kr; vm_map_copy_t copy_map; vm_map_entry_t copy_entry; vm_map_offset_t copy_addr; vm_map_copy_t target_copy_map; vm_map_offset_t overmap_start, overmap_end; vm_map_offset_t trimmed_start; vm_map_size_t target_size; assert(!named_entry->is_object); assert(!named_entry->is_sub_map); if (!vm_map_kernel_flags_check_vmflags(vmk_flags, (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE | VM_FLAGS_OVERWRITE | VM_FLAGS_RETURN_4K_DATA_ADDR | VM_FLAGS_RETURN_DATA_ADDR))) { named_entry_unlock(named_entry); return KERN_INVALID_ARGUMENT; } copy_map = named_entry->backing.copy; assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST); if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) { /* unsupported type; should not happen */ printf("vm_map_enter_mem_object: " "memory_entry->backing.copy " "unsupported type 0x%x\n", copy_map->type); named_entry_unlock(named_entry); return KERN_INVALID_ARGUMENT; } if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) { DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map)); } if (vmk_flags.vmf_return_data_addr || vmk_flags.vmf_return_4k_data_addr) { offset_in_mapping = obj_offs & map_mask; if (vmk_flags.vmf_return_4k_data_addr) { offset_in_mapping &= ~((signed)(0xFFF)); } } target_copy_map = VM_MAP_COPY_NULL; target_size = copy_map->size; overmap_start = 0; overmap_end = 0; trimmed_start = 0; if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) { DEBUG4K_ADJUST("adjusting...\n"); kr = vm_map_copy_adjust_to_target( copy_map, obj_offs, initial_size, target_map, copy, &target_copy_map, &overmap_start, &overmap_end, &trimmed_start); if (kr != KERN_SUCCESS) { named_entry_unlock(named_entry); return kr; } target_size = target_copy_map->size; } else { /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy_map); target_copy_map = copy_map; } vm_map_kernel_flags_t rsv_flags = vmk_flags; vm_map_kernel_flags_and_vmflags(&rsv_flags, (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE | VM_FLAGS_OVERWRITE | VM_FLAGS_RETURN_4K_DATA_ADDR | VM_FLAGS_RETURN_DATA_ADDR)); /* reserve a contiguous range */ kr = vm_map_enter(target_map, &map_addr, vm_map_round_page(target_size, map_mask), mask, rsv_flags, VM_OBJECT_NULL, 0, FALSE, /* copy */ cur_protection, max_protection, inheritance); if (kr != KERN_SUCCESS) { DEBUG4K_ERROR("kr 0x%x\n", kr); if (target_copy_map != copy_map) { vm_map_copy_discard(target_copy_map); target_copy_map = VM_MAP_COPY_NULL; } named_entry_unlock(named_entry); return kr; } copy_addr = map_addr; for (copy_entry = vm_map_copy_first_entry(target_copy_map); copy_entry != vm_map_copy_to_entry(target_copy_map); copy_entry = copy_entry->vme_next) { vm_map_t copy_submap = VM_MAP_NULL; vm_object_t copy_object = VM_OBJECT_NULL; vm_map_size_t copy_size; vm_object_offset_t copy_offset; boolean_t do_copy = false; if (copy_entry->is_sub_map) { copy_submap = VME_SUBMAP(copy_entry); copy_object = (vm_object_t)copy_submap; } else { copy_object = VME_OBJECT(copy_entry); } copy_offset = VME_OFFSET(copy_entry); copy_size = (copy_entry->vme_end - copy_entry->vme_start); /* sanity check */ if ((copy_addr + copy_size) > (map_addr + overmap_start + overmap_end + named_entry->size /* XXX full size */)) { /* over-mapping too much !? 
*/ kr = KERN_INVALID_ARGUMENT; DEBUG4K_ERROR("kr 0x%x\n", kr); /* abort */ break; } /* take a reference on the object */ if (copy_entry->is_sub_map) { vm_map_reference(copy_submap); } else { if (!copy && copy_object != VM_OBJECT_NULL && copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { bool is_writable; /* * We need to resolve our side of this * "symmetric" copy-on-write now; we * need a new object to map and share, * instead of the current one which * might still be shared with the * original mapping. * * Note: A "vm_map_copy_t" does not * have a lock but we're protected by * the named entry's lock here. */ // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC); VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE); assert(copy_object != VME_OBJECT(copy_entry)); is_writable = false; if (copy_entry->protection & VM_PROT_WRITE) { is_writable = true; #if __arm64e__ } else if (copy_entry->used_for_tpro) { is_writable = true; #endif /* __arm64e__ */ } if (!copy_entry->needs_copy && is_writable) { vm_prot_t prot; prot = copy_entry->protection & ~VM_PROT_WRITE; vm_object_pmap_protect(copy_object, copy_offset, copy_size, PMAP_NULL, PAGE_SIZE, 0, prot); } copy_entry->needs_copy = FALSE; copy_entry->is_shared = TRUE; copy_object = VME_OBJECT(copy_entry); copy_offset = VME_OFFSET(copy_entry); vm_object_lock(copy_object); /* we're about to make a shared mapping of this object */ copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE); vm_object_unlock(copy_object); } if (copy_object != VM_OBJECT_NULL && copy_object->named && copy_object->pager != MEMORY_OBJECT_NULL && copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { memory_object_t pager; vm_prot_t pager_prot; /* * For "named" VM objects, let the pager know that the * memory object is being mapped. Some pagers need to keep * track of this, to know when they can reclaim the memory * object, for example. * VM calls memory_object_map() for each mapping (specifying * the protection of each mapping) and calls * memory_object_last_unmap() when all the mappings are gone. */ pager_prot = max_protection; if (copy) { /* * Copy-On-Write mapping: won't modify the * memory object. */ pager_prot &= ~VM_PROT_WRITE; } vm_object_lock(copy_object); pager = copy_object->pager; if (copy_object->named && pager != MEMORY_OBJECT_NULL && copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { assert(copy_object->pager_ready); vm_object_mapping_wait(copy_object, THREAD_UNINT); /* * Object might have lost its pager * while waiting. */ pager = copy_object->pager; if (copy_object->named && pager != MEMORY_OBJECT_NULL) { vm_object_mapping_begin(copy_object); vm_object_unlock(copy_object); kr = memory_object_map(pager, pager_prot); assert(kr == KERN_SUCCESS); vm_object_lock(copy_object); vm_object_mapping_end(copy_object); } } vm_object_unlock(copy_object); } /* * Perform the copy if requested */ if (copy && copy_object != VM_OBJECT_NULL) { vm_object_t new_object; vm_object_offset_t new_offset; result = vm_object_copy_strategically(copy_object, copy_offset, copy_size, false, /* forking */ &new_object, &new_offset, &do_copy); if (result == KERN_MEMORY_RESTART_COPY) { boolean_t success; boolean_t src_needs_copy; /* * XXX * We currently ignore src_needs_copy. * This really is the issue of how to make * MEMORY_OBJECT_COPY_SYMMETRIC safe for * non-kernel users to use. Solution forthcoming. 
* In the meantime, since we don't allow non-kernel * memory managers to specify symmetric copy, * we won't run into problems here. */ new_object = copy_object; new_offset = copy_offset; success = vm_object_copy_quickly(new_object, new_offset, copy_size, &src_needs_copy, &do_copy); assert(success); result = KERN_SUCCESS; } if (result != KERN_SUCCESS) { kr = result; break; } copy_object = new_object; copy_offset = new_offset; /* * No extra object reference for the mapping: * the mapping should be the only thing keeping * this new object alive. */ } else { /* * We already have the right object * to map. */ copy_object = VME_OBJECT(copy_entry); /* take an extra ref for the mapping below */ vm_object_reference(copy_object); } } /* * If the caller does not want a specific * tag for this new mapping: use * the tag of the original mapping. */ vm_map_kernel_flags_t vmk_remap_flags = { .vmkf_submap = copy_entry->is_sub_map, }; vm_map_kernel_flags_set_vmflags(&vmk_remap_flags, vm_map_kernel_flags_vmflags(vmk_flags), vmk_flags.vm_tag ?: VME_ALIAS(copy_entry)); /* over-map the object into destination */ vmk_remap_flags.vmf_fixed = true; vmk_remap_flags.vmf_overwrite = true; if (!copy && !copy_entry->is_sub_map) { /* * copy-on-write should have been * resolved at this point, or we would * end up sharing instead of copying. */ assert(!copy_entry->needs_copy); } #if XNU_TARGET_OS_OSX if (copy_entry->used_for_jit) { vmk_remap_flags.vmkf_map_jit = TRUE; } #endif /* XNU_TARGET_OS_OSX */ kr = vm_map_enter(target_map, ©_addr, copy_size, (vm_map_offset_t) 0, vmk_remap_flags, copy_object, copy_offset, ((copy_object == NULL) ? FALSE : (copy || copy_entry->needs_copy)), cur_protection, max_protection, inheritance); if (kr != KERN_SUCCESS) { DEBUG4K_SHARE("failed kr 0x%x\n", kr); if (copy_entry->is_sub_map) { vm_map_deallocate(copy_submap); } else { vm_object_deallocate(copy_object); } /* abort */ break; } /* next mapping */ copy_addr += copy_size; } named_entry_unlock(named_entry); if (target_copy_map != copy_map) { vm_map_copy_discard(target_copy_map); target_copy_map = VM_MAP_COPY_NULL; } if (kr == KERN_SUCCESS) { if (overmap_start) { DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start)); } offset_in_mapping += overmap_start; } else if (!vmk_flags.vmf_overwrite) { /* deallocate the contiguous range */ vm_map_remove(target_map, map_addr, map_addr + map_size); } result = kr; goto out; } if (named_entry->is_object) { unsigned int access; unsigned int wimg_mode; assert(!named_entry->is_copy); assert(!named_entry->is_sub_map); /* we are mapping a VM object */ access = named_entry->access; if (vmk_flags.vmf_return_data_addr || vmk_flags.vmf_return_4k_data_addr) { offset_in_mapping = obj_offs & map_mask; if (vmk_flags.vmf_return_4k_data_addr) { offset_in_mapping &= ~((signed)(0xFFF)); } obj_offs -= offset_in_mapping; map_size = vm_map_round_page(initial_size + offset_in_mapping, map_mask); } object = vm_named_entry_to_vm_object(named_entry); assert(object != VM_OBJECT_NULL); vm_object_lock(object); named_entry_unlock(named_entry); vm_object_reference_locked(object); wimg_mode = object->wimg_bits; vm_prot_to_wimg(access, &wimg_mode); if (object->wimg_bits != wimg_mode) { vm_object_change_wimg_mode(object, wimg_mode); } vm_object_unlock(object); } else { panic("invalid VM named entry %p", named_entry); } } else if 
(ip_kotype(port) == IKOT_MEMORY_OBJECT) { /* * JMM - This is temporary until we unify named entries * and raw memory objects. * * Detected fake ip_kotype for a memory object. In * this case, the port isn't really a port at all, but * instead is just a raw memory object. */ if (vmk_flags.vmf_return_data_addr || vmk_flags.vmf_return_4k_data_addr) { panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object."); } object = memory_object_to_vm_object((memory_object_t)port); if (object == VM_OBJECT_NULL) { return KERN_INVALID_OBJECT; } vm_object_reference(object); /* wait for object (if any) to be ready */ if (object != VM_OBJECT_NULL) { if (is_kernel_object(object)) { printf("Warning: Attempt to map kernel object" " by a non-private kernel entity\n"); return KERN_INVALID_OBJECT; } if (!object->pager_ready) { vm_object_lock(object); while (!object->pager_ready) { vm_object_sleep(object, VM_OBJECT_EVENT_PAGER_READY, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE); } vm_object_unlock(object); } } } else { return KERN_INVALID_OBJECT; } if (object != VM_OBJECT_NULL && object->named && object->pager != MEMORY_OBJECT_NULL && object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { memory_object_t pager; vm_prot_t pager_prot; kern_return_t kr; /* * For "named" VM objects, let the pager know that the * memory object is being mapped. Some pagers need to keep * track of this, to know when they can reclaim the memory * object, for example. * VM calls memory_object_map() for each mapping (specifying * the protection of each mapping) and calls * memory_object_last_unmap() when all the mappings are gone. */ pager_prot = max_protection; if (copy) { /* * Copy-On-Write mapping: won't modify the * memory object. */ pager_prot &= ~VM_PROT_WRITE; } vm_object_lock(object); pager = object->pager; if (object->named && pager != MEMORY_OBJECT_NULL && object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { assert(object->pager_ready); vm_object_mapping_wait(object, THREAD_UNINT); /* object might have lost its pager while waiting */ pager = object->pager; if (object->named && pager != MEMORY_OBJECT_NULL) { vm_object_mapping_begin(object); vm_object_unlock(object); kr = memory_object_map(pager, pager_prot); assert(kr == KERN_SUCCESS); vm_object_lock(object); vm_object_mapping_end(object); } } vm_object_unlock(object); } /* * Perform the copy if requested */ if (copy) { vm_object_t new_object; vm_object_offset_t new_offset; result = vm_object_copy_strategically(object, obj_offs, map_size, false, /* forking */ &new_object, &new_offset, &copy); if (result == KERN_MEMORY_RESTART_COPY) { boolean_t success; boolean_t src_needs_copy; /* * XXX * We currently ignore src_needs_copy. * This really is the issue of how to make * MEMORY_OBJECT_COPY_SYMMETRIC safe for * non-kernel users to use. Solution forthcoming. * In the meantime, since we don't allow non-kernel * memory managers to specify symmetric copy, * we won't run into problems here. */ new_object = object; new_offset = obj_offs; success = vm_object_copy_quickly(new_object, new_offset, map_size, &src_needs_copy, &copy); assert(success); result = KERN_SUCCESS; } /* * Throw away the reference to the * original object, as it won't be mapped. */ vm_object_deallocate(object); if (result != KERN_SUCCESS) { return result; } object = new_object; obj_offs = new_offset; } /* * If non-kernel users want to try to prefault pages, the mapping and prefault * needs to be atomic.
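 *
 * Concretely (illustrative note, not from the original source): for a
 * non-kernel target map we set vmkf_keep_map_locked below so that
 * vm_map_enter() returns with the map still locked; the prefault loop
 * then runs before any other thread can observe the new mapping, and
 * we drop the map lock ourselves once the loop is done.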
*/ kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map)); vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault); result = vm_map_enter(target_map, &map_addr, map_size, (vm_map_offset_t)mask, vmk_flags, object, obj_offs, copy, cur_protection, max_protection, inheritance); if (result != KERN_SUCCESS) { vm_object_deallocate(object); } /* * Try to prefault, and do not forget to release the vm map lock. */ if (result == KERN_SUCCESS && try_prefault) { mach_vm_address_t va = map_addr; kern_return_t kr = KERN_SUCCESS; unsigned int i = 0; int pmap_options; pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT; if (object->internal) { pmap_options |= PMAP_OPTIONS_INTERNAL; } for (i = 0; i < page_list_count; ++i) { if (!UPL_VALID_PAGE(page_list, i)) { if (kernel_prefault) { assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE"); result = KERN_MEMORY_ERROR; break; } } else { /* * If this function call failed, we should stop * trying to optimize, as other calls are likely * going to fail too. * * We are not going to report an error for such a * failure though. That's an optimization, not * something critical. */ kr = pmap_enter_options(target_map->pmap, va, UPL_PHYS_PAGE(page_list, i), cur_protection, VM_PROT_NONE, 0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER); if (kr != KERN_SUCCESS) { OSIncrementAtomic64(&vm_prefault_nb_bailout); if (kernel_prefault) { result = kr; } break; } OSIncrementAtomic64(&vm_prefault_nb_pages); } /* Next virtual address */ va += PAGE_SIZE; } if (vmk_flags.vmkf_keep_map_locked) { vm_map_unlock(target_map); } } out: if (result == KERN_SUCCESS) { #if KASAN if (target_map->pmap == kernel_pmap) { kasan_notify_address(map_addr, map_size); } #endif *address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping); } return result; } kern_return_t vm_map_enter_mem_object_prefault( vm_map_t target_map, vm_map_offset_ut *address, vm_map_size_ut initial_size, vm_map_offset_ut mask, vm_map_kernel_flags_t vmk_flags, ipc_port_t port, vm_object_offset_ut offset, vm_prot_ut cur_protection, vm_prot_ut max_protection, upl_page_list_ptr_t page_list, unsigned int page_list_count) { /* range_id is set by vm_map_enter_mem_object */ return vm_map_enter_mem_object(target_map, address, initial_size, mask, vmk_flags, port, offset, FALSE, cur_protection, max_protection, VM_INHERIT_DEFAULT, page_list, page_list_count); } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_enter_mem_object_control_sanitize( vm_map_t target_map, vm_map_offset_ut address_u, vm_map_size_ut initial_size_u, vm_map_offset_ut mask_u, vm_object_offset_ut offset_u, vm_prot_ut cur_protection_u, vm_prot_ut max_protection_u, vm_inherit_ut inheritance_u, vm_map_kernel_flags_t vmk_flags, vm_map_address_t *map_addr, vm_map_size_t *map_size, vm_map_offset_t *mask, vm_object_offset_t *obj_offs, vm_object_offset_t *obj_end, vm_object_size_t *obj_size, vm_prot_t *cur_protection, vm_prot_t *max_protection, vm_inherit_t *inheritance) { kern_return_t kr; kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map, cur_protection, max_protection); if (__improbable(kr != KERN_SUCCESS)) { return kr; } kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, inheritance); if (__improbable(kr != KERN_SUCCESS)) { return kr; } kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask); if (__improbable(kr != KERN_SUCCESS)) { return kr; } /* * Ensure arithmetic doesn't overflow in vm_object space
(kernel * pages). * We keep unaligned values for now. The call we eventually make to * vm_map_enter does guarantee that offset_u is page aligned for EITHER * target_map pages or kernel pages. But this isn't enough to guarantee * kernel space alignment. */ kr = vm_sanitize_addr_size(offset_u, initial_size_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK, VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, obj_offs, obj_end, obj_size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } /* * There is no vm_sanitize_addr_size variant that also adjusts for * a separate offset. Rather than create one for this one-off issue, * we sanitize map_addr and map_size individually, relying on * vm_sanitize_size to incorporate the offset. Then, we perform the * overflow check manually below. */ *map_addr = vm_sanitize_addr(target_map, address_u); kr = vm_sanitize_size(offset_u, initial_size_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map, VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } /* * Ensure arithmetic doesn't overflow in target_map space. * The computation of map_size above accounts for the possibility that * offset_u might be unaligned in target_map space. */ if (vmk_flags.vmf_fixed) { vm_map_address_t map_end; if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) { return KERN_INVALID_ARGUMENT; } } return KERN_SUCCESS; } kern_return_t vm_map_enter_mem_object_control( vm_map_t target_map, vm_map_offset_ut *address_u, vm_map_size_ut initial_size_u, vm_map_offset_ut mask_u, vm_map_kernel_flags_t vmk_flags, memory_object_control_t control, vm_object_offset_ut offset_u, boolean_t needs_copy, vm_prot_ut cur_protection_u, vm_prot_ut max_protection_u, vm_inherit_ut inheritance_u) { vm_map_offset_t mask; vm_prot_t cur_protection; vm_prot_t max_protection; vm_inherit_t inheritance; vm_map_address_t map_addr; vm_map_size_t map_size; vm_object_t object; vm_object_offset_t obj_offs, obj_end; vm_object_size_t obj_size; kern_return_t result; memory_object_t pager; vm_prot_t pager_prot; kern_return_t kr; /* * Check arguments for validity */ if (target_map == VM_MAP_NULL) { return KERN_INVALID_ARGUMENT; } /* * We only support vmf_return_data_addr-like behavior. */ vmk_flags.vmf_return_data_addr = true; /* * Sanitize any input parameters that are addr/size/prot/inherit */ kr = vm_map_enter_mem_object_control_sanitize(target_map, *address_u, initial_size_u, mask_u, offset_u, cur_protection_u, max_protection_u, inheritance_u, vmk_flags, &map_addr, &map_size, &mask, &obj_offs, &obj_end, &obj_size, &cur_protection, &max_protection, &inheritance); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } object = memory_object_control_to_vm_object(control); if (object == VM_OBJECT_NULL) { return KERN_INVALID_OBJECT; } if (is_kernel_object(object)) { printf("Warning: Attempt to map kernel object" " by a non-private kernel entity\n"); return KERN_INVALID_OBJECT; } vm_object_lock(object); os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp); /* * For "named" VM objects, let the pager know that the * memory object is being mapped. Some pagers need to keep * track of this, to know when they can reclaim the memory * object, for example. * VM calls memory_object_map() for each mapping (specifying * the protection of each mapping) and calls * memory_object_last_unmap() when all the mappings are gone. 
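 *
 * Illustrative sketch (not from the original source) of what a pager
 * observes over the lifetime of an object's mappings, assuming two
 * mappings are created and later torn down:
 *
 *	memory_object_map(pager, VM_PROT_READ);                 1st mapping
 *	memory_object_map(pager, VM_PROT_READ | VM_PROT_WRITE); 2nd mapping
 *	...both mappings removed...
 *	memory_object_last_unmap(pager);      one call, when none remain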
*/ pager_prot = max_protection; if (needs_copy) { pager_prot &= ~VM_PROT_WRITE; } pager = object->pager; if (object->named && pager != MEMORY_OBJECT_NULL && object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { assert(object->pager_ready); vm_object_mapping_wait(object, THREAD_UNINT); /* object might have lost its pager while waiting */ pager = object->pager; if (object->named && pager != MEMORY_OBJECT_NULL) { vm_object_mapping_begin(object); vm_object_unlock(object); kr = memory_object_map(pager, pager_prot); assert(kr == KERN_SUCCESS); vm_object_lock(object); vm_object_mapping_end(object); } } vm_object_unlock(object); /* * Perform the copy if requested */ if (needs_copy) { vm_object_t new_object; vm_object_offset_t new_offset; result = vm_object_copy_strategically(object, obj_offs, obj_size, false, /* forking */ &new_object, &new_offset, &needs_copy); if (result == KERN_MEMORY_RESTART_COPY) { boolean_t success; boolean_t src_needs_copy; /* * XXX * We currently ignore src_needs_copy. * This really is the issue of how to make * MEMORY_OBJECT_COPY_SYMMETRIC safe for * non-kernel users to use. Solution forthcoming. * In the meantime, since we don't allow non-kernel * memory managers to specify symmetric copy, * we won't run into problems here. */ new_object = object; new_offset = obj_offs; success = vm_object_copy_quickly(new_object, new_offset, obj_size, &src_needs_copy, &needs_copy); assert(success); result = KERN_SUCCESS; } /* * Throw away the reference to the * original object, as it won't be mapped. */ vm_object_deallocate(object); if (result != KERN_SUCCESS) { return result; } object = new_object; obj_offs = new_offset; } result = vm_map_enter(target_map, &map_addr, map_size, (vm_map_offset_t)mask, vmk_flags, object, obj_offs, needs_copy, cur_protection, max_protection, inheritance); if (result == KERN_SUCCESS) { *address_u = vm_sanitize_wrap_addr( map_addr + (obj_offs & vm_map_page_mask(target_map))); } else { vm_object_deallocate(object); } return result; } /* Not used without nested pmaps */ #ifndef NO_NESTED_PMAP /* * Clip and unnest a portion of a nested submap mapping. */ static void vm_map_clip_unnest( vm_map_t map, vm_map_entry_t entry, vm_map_offset_t start_unnest, vm_map_offset_t end_unnest) { vm_map_offset_t old_start_unnest = start_unnest; vm_map_offset_t old_end_unnest = end_unnest; assert(entry->is_sub_map); assert(VME_SUBMAP(entry) != NULL); assert(entry->use_pmap); /* * Query the platform for the optimal unnest range. * DRK: There's some duplication of effort here, since * callers may have adjusted the range to some extent. This * routine was introduced to support 1GiB subtree nesting * for x86 platforms, which can also nest on 2MiB boundaries * depending on size/alignment. 
*/ if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) { assert(VME_SUBMAP(entry)->is_nested_map); assert(!VME_SUBMAP(entry)->disable_vmentry_reuse); log_unnest_badness(map, old_start_unnest, old_end_unnest, VME_SUBMAP(entry)->is_nested_map, (entry->vme_start + VME_SUBMAP(entry)->lowest_unnestable_start - VME_OFFSET(entry))); } if (entry->vme_start > start_unnest || entry->vme_end < end_unnest) { panic("vm_map_clip_unnest(0x%llx,0x%llx): " "bad nested entry: start=0x%llx end=0x%llx\n", (long long)start_unnest, (long long)end_unnest, (long long)entry->vme_start, (long long)entry->vme_end); } if (start_unnest > entry->vme_start) { _vm_map_clip_start(&map->hdr, entry, start_unnest); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); } else { vm_map_store_update_first_free(map, map->first_free, FALSE); } } if (entry->vme_end > end_unnest) { _vm_map_clip_end(&map->hdr, entry, end_unnest); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); } else { vm_map_store_update_first_free(map, map->first_free, FALSE); } } pmap_unnest(map->pmap, entry->vme_start, entry->vme_end - entry->vme_start); if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, entry->vme_end, VME_SUBMAP(entry), VME_OFFSET(entry)); } entry->use_pmap = FALSE; if ((map->pmap != kernel_pmap) && (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) { VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP); } } #endif /* NO_NESTED_PMAP */ __abortlike static void __vm_map_clip_atomic_entry_panic( vm_map_t map, vm_map_entry_t entry, vm_map_offset_t where) { panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry " "%p [0x%llx:0x%llx] at 0x%llx", map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)where); } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ void vm_map_clip_start( vm_map_t map, vm_map_entry_t entry, vm_map_offset_t startaddr) { #ifndef NO_NESTED_PMAP if (entry->is_sub_map && entry->use_pmap && startaddr >= entry->vme_start) { vm_map_offset_t start_unnest, end_unnest; /* * Make sure "startaddr" is no longer in a nested range * before we clip. Unnest only the minimum range the platform * can handle. * vm_map_clip_unnest may perform additional adjustments to * the unnest range. 
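 *
 * Worked example (illustrative; the granule below is a hypothetical
 * value for pmap_shared_region_size_min()): with a 0x2000000 granule
 * and startaddr == 0x1234000, the computation just below gives
 *	start_unnest = 0x1234000 & ~0x1FFFFFF = 0x0000000;
 *	end_unnest   = 0x0000000 + 0x2000000  = 0x2000000;
 * i.e. we unnest exactly the one granule containing "startaddr".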
*/ start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1); end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap); vm_map_clip_unnest(map, entry, start_unnest, end_unnest); } #endif /* NO_NESTED_PMAP */ if (startaddr > entry->vme_start) { if (!entry->is_sub_map && VME_OBJECT(entry) && VME_OBJECT(entry)->phys_contiguous) { pmap_remove(map->pmap, (addr64_t)(entry->vme_start), (addr64_t)(entry->vme_end)); } if (entry->vme_atomic) { __vm_map_clip_atomic_entry_panic(map, entry, startaddr); } DTRACE_VM5( vm_map_clip_start, vm_map_t, map, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_map_offset_t, startaddr, int, VME_ALIAS(entry)); _vm_map_clip_start(&map->hdr, entry, startaddr); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); } else { vm_map_store_update_first_free(map, map->first_free, FALSE); } } } #define vm_map_copy_clip_start(copy, entry, startaddr) \ MACRO_BEGIN \ if ((startaddr) > (entry)->vme_start) \ _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \ MACRO_END /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start( struct vm_map_header *map_header, vm_map_entry_t entry, vm_map_offset_t start) { vm_map_entry_t new_entry; /* * Split off the front portion -- * note that we must insert the new * entry BEFORE this one, so that * this entry has the specified starting * address. */ if (entry->map_aligned) { assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_HDR_PAGE_MASK(map_header))); } new_entry = _vm_map_entry_create(map_header); vm_map_entry_copy_full(new_entry, entry); new_entry->vme_end = start; assert(new_entry->vme_start < new_entry->vme_end); VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start)); if (__improbable(start >= entry->vme_end)) { panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start); } assert(start < entry->vme_end); entry->vme_start = start; #if VM_BTLOG_TAGS if (new_entry->vme_kernel_object) { btref_retain(new_entry->vme_tag_btref); } #endif /* VM_BTLOG_TAGS */ _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry); if (entry->is_sub_map) { vm_map_reference(VME_SUBMAP(new_entry)); } else { vm_object_reference(VME_OBJECT(new_entry)); } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ void vm_map_clip_end( vm_map_t map, vm_map_entry_t entry, vm_map_offset_t endaddr) { if (endaddr > entry->vme_end) { /* * Within the scope of this clipping, limit "endaddr" to * the end of this map entry... */ endaddr = entry->vme_end; } #ifndef NO_NESTED_PMAP if (entry->is_sub_map && entry->use_pmap) { vm_map_offset_t start_unnest, end_unnest; /* * Make sure the range between the start of this entry and * the new "endaddr" is no longer nested before we clip. * Unnest only the minimum range the platform can handle. * vm_map_clip_unnest may perform additional adjustments to * the unnest range. 
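 *
 * Worked example (illustrative, hypothetical 0x2000000 granule as in
 * vm_map_clip_start() above): with endaddr == 0x2234000, the code
 * just below rounds up to
 *	end_unnest = (0x2234000 + 0x1FFFFFF) & ~0x1FFFFFF = 0x4000000;
 * so the unnested range covers every granule that the clip touches.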
*/ start_unnest = entry->vme_start; end_unnest = (endaddr + pmap_shared_region_size_min(map->pmap) - 1) & ~(pmap_shared_region_size_min(map->pmap) - 1); vm_map_clip_unnest(map, entry, start_unnest, end_unnest); } #endif /* NO_NESTED_PMAP */ if (endaddr < entry->vme_end) { if (!entry->is_sub_map && VME_OBJECT(entry) && VME_OBJECT(entry)->phys_contiguous) { pmap_remove(map->pmap, (addr64_t)(entry->vme_start), (addr64_t)(entry->vme_end)); } if (entry->vme_atomic) { __vm_map_clip_atomic_entry_panic(map, entry, endaddr); } DTRACE_VM5( vm_map_clip_end, vm_map_t, map, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_map_offset_t, endaddr, int, VME_ALIAS(entry)); _vm_map_clip_end(&map->hdr, entry, endaddr); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); } else { vm_map_store_update_first_free(map, map->first_free, FALSE); } } } #define vm_map_copy_clip_end(copy, entry, endaddr) \ MACRO_BEGIN \ if ((endaddr) < (entry)->vme_end) \ _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \ MACRO_END /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end( struct vm_map_header *map_header, vm_map_entry_t entry, vm_map_offset_t end) { vm_map_entry_t new_entry; /* * Create a new entry and insert it * AFTER the specified entry */ if (entry->map_aligned) { assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_HDR_PAGE_MASK(map_header))); } new_entry = _vm_map_entry_create(map_header); vm_map_entry_copy_full(new_entry, entry); if (__improbable(end <= entry->vme_start)) { panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end); } assert(entry->vme_start < end); new_entry->vme_start = entry->vme_end = end; VME_OFFSET_SET(new_entry, VME_OFFSET(new_entry) + (end - entry->vme_start)); assert(new_entry->vme_start < new_entry->vme_end); #if VM_BTLOG_TAGS if (new_entry->vme_kernel_object) { btref_retain(new_entry->vme_tag_btref); } #endif /* VM_BTLOG_TAGS */ _vm_map_store_entry_link(map_header, entry, new_entry); if (entry->is_sub_map) { vm_map_reference(VME_SUBMAP(new_entry)); } else { vm_object_reference(VME_OBJECT(new_entry)); } } /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ MACRO_BEGIN \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ MACRO_END /* * vm_map_range_check: [ internal use only ] * * Check that the region defined by the specified start and * end addresses is wholly contained within a single map * entry or set of adjacent map entries of the specified map, * i.e. the specified region contains no unmapped space. * If any or all of the region is unmapped, FALSE is returned. * Otherwise, TRUE is returned and if the output argument 'entry' * is not NULL it points to the map entry containing the start * of the region. * * The map is locked for reading on entry and is left locked. */ static boolean_t vm_map_range_check( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_map_entry_t *entry) { vm_map_entry_t cur; vm_map_offset_t prev; /* * Basic sanity checks first */ if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) { return FALSE; } /* * Check first if the region starts within a valid * mapping for the map.
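 *
 * Illustrative example (not from the original source): with adjacent
 * entries [0x1000,0x3000) and [0x3000,0x6000), checking
 * [0x2000,0x5000) returns TRUE (no gap at 0x3000), while
 * [0x2000,0x7000) returns FALSE because the walk below runs out of
 * contiguous entries before reaching "end".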
*/ if (!vm_map_lookup_entry(map, start, &cur)) { return FALSE; } /* * Optimize for the case that the region is contained * in a single map entry. */ if (entry != (vm_map_entry_t *) NULL) { *entry = cur; } if (end <= cur->vme_end) { return TRUE; } /* * If the region is not wholly contained within a * single entry, walk the entries looking for holes. */ prev = cur->vme_end; cur = cur->vme_next; while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) { if (end <= cur->vme_end) { return TRUE; } prev = cur->vme_end; cur = cur->vme_next; } return FALSE; } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_protect_sanitize( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut new_prot_u, vm_map_offset_t *start, vm_map_offset_t *end, vm_prot_t *new_prot) { kern_return_t kr; vm_map_size_t size; kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT, map, VM_PROT_COPY, new_prot); if (__improbable(kr != KERN_SUCCESS)) { return kr; } kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT, map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } return KERN_SUCCESS; } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ kern_return_t vm_map_protect( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, boolean_t set_max, vm_prot_ut new_prot_u) { vm_map_entry_t current; vm_map_offset_t prev; vm_map_entry_t entry; vm_prot_t new_prot; vm_prot_t new_max; int pmap_options = 0; kern_return_t kr; vm_map_offset_t start, original_start; vm_map_offset_t end; kr = vm_map_protect_sanitize(map, start_u, end_u, new_prot_u, &start, &end, &new_prot); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } original_start = start; if (new_prot & VM_PROT_COPY) { vm_map_offset_t new_start; vm_prot_t cur_prot, max_prot; vm_map_kernel_flags_t kflags; /* LP64todo - see below */ if (start >= map->max_offset) { return KERN_INVALID_ADDRESS; } if ((new_prot & VM_PROT_ALLEXEC) && map->pmap != kernel_pmap && (vm_map_cs_enforcement(map) #if XNU_TARGET_OS_OSX && __arm64__ || !VM_MAP_IS_EXOTIC(map) #endif /* XNU_TARGET_OS_OSX && __arm64__ */ ) && VM_MAP_POLICY_WX_FAIL(map)) { DTRACE_VM3(cs_wx, uint64_t, (uint64_t) start, uint64_t, (uint64_t) end, vm_prot_t, new_prot); printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, __LINE__, #if DEVELOPMENT || DEBUG (uint64_t)start, (uint64_t)end, #else /* DEVELOPMENT || DEBUG */ (uint64_t)0, (uint64_t)0, #endif /* DEVELOPMENT || DEBUG */ new_prot); return KERN_PROTECTION_FAILURE; } /* * Let vm_map_remap_extract() know that it will need to: * + make a copy of the mapping * + add VM_PROT_WRITE to the max protections * + remove any protections that are no longer allowed from the * max protections (to avoid any WRITE/EXECUTE conflict, for * example). * Note that "max_prot" is an IN/OUT parameter only for this * specific (VM_PROT_COPY) case. It's usually an OUT parameter * only. 
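 *
 * Illustrative example (not from the original source): a caller
 * requesting, in effect, vm_map_protect(map, start, end, FALSE,
 * VM_PROT_READ | VM_PROT_COPY) on a shared mapping first gets the
 * range replaced, via the vm_map_remap() call below with
 * vmkf_remap_prot_copy set, by a private copy-on-write copy;
 * VM_PROT_COPY is then stripped and the normal protection pass
 * applies VM_PROT_READ to the new mapping.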
*/ max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC); cur_prot = VM_PROT_NONE; kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true); kflags.vmkf_remap_prot_copy = true; kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map); new_start = start; kr = vm_map_remap(map, vm_sanitize_wrap_addr_ref(&new_start), end - start, 0, /* mask */ kflags, map, start, TRUE, /* copy-on-write remapping! */ vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */ vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */ VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { return kr; } new_prot &= ~VM_PROT_COPY; } vm_map_lock(map); restart_after_unlock: /* LP64todo - remove this check when vm_map_commpage64() * no longer has to stuff in a map_entry for the commpage * above the map's max_offset. */ if (start >= map->max_offset) { vm_map_unlock(map); return KERN_INVALID_ADDRESS; } while (1) { /* * Lookup the entry. If it doesn't start in a valid * entry, return an error. */ if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock(map); return KERN_INVALID_ADDRESS; } if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */ start = SUPERPAGE_ROUND_DOWN(start); continue; } break; } if (entry->superpage_size) { end = SUPERPAGE_ROUND_UP(end); } /* * Make a first pass to check for protection and address * violations. */ current = entry; prev = current->vme_start; while ((current != vm_map_to_entry(map)) && (current->vme_start < end)) { /* * If there is a hole, return an error. */ if (current->vme_start != prev) { vm_map_unlock(map); return KERN_INVALID_ADDRESS; } new_max = current->max_protection; #if defined(__x86_64__) /* Allow max mask to include execute prot bits if this map doesn't enforce CS */ if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) { new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC); } #elif CODE_SIGNING_MONITOR if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) { new_max |= VM_PROT_EXECUTE; } #endif if ((new_prot & new_max) != new_prot) { vm_map_unlock(map); return KERN_PROTECTION_FAILURE; } if (current->used_for_jit && pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) { vm_map_unlock(map); return KERN_PROTECTION_FAILURE; } #if __arm64e__ /* Disallow protecting hw assisted TPRO mappings */ if (current->used_for_tpro) { vm_map_unlock(map); return KERN_PROTECTION_FAILURE; } #endif /* __arm64e__ */ if ((new_prot & VM_PROT_WRITE) && (new_prot & VM_PROT_ALLEXEC) && #if XNU_TARGET_OS_OSX map->pmap != kernel_pmap && (vm_map_cs_enforcement(map) #if __arm64__ || !VM_MAP_IS_EXOTIC(map) #endif /* __arm64__ */ ) && #endif /* XNU_TARGET_OS_OSX */ #if CODE_SIGNING_MONITOR (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) && #endif !(current->used_for_jit)) { DTRACE_VM3(cs_wx, uint64_t, (uint64_t) current->vme_start, uint64_t, (uint64_t) current->vme_end, vm_prot_t, new_prot); printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? 
proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, __LINE__, #if DEVELOPMENT || DEBUG (uint64_t)current->vme_start, (uint64_t)current->vme_end, #else /* DEVELOPMENT || DEBUG */ (uint64_t)0, (uint64_t)0, #endif /* DEVELOPMENT || DEBUG */ new_prot); new_prot &= ~VM_PROT_ALLEXEC; if (VM_MAP_POLICY_WX_FAIL(map)) { vm_map_unlock(map); return KERN_PROTECTION_FAILURE; } } /* * If the task has requested executable lockdown, * deny both: * - adding executable protections OR * - adding write protections to an existing executable mapping. */ if (map->map_disallow_new_exec == TRUE) { if ((new_prot & VM_PROT_ALLEXEC) || ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) { vm_map_unlock(map); return KERN_PROTECTION_FAILURE; } } prev = current->vme_end; current = current->vme_next; } #if __arm64__ if (end > prev && end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) { vm_map_entry_t prev_entry; prev_entry = current->vme_prev; if (prev_entry != vm_map_to_entry(map) && !prev_entry->map_aligned && (vm_map_round_page(prev_entry->vme_end, VM_MAP_PAGE_MASK(map)) == end)) { /* * The last entry in our range is not "map-aligned" * but it would have reached all the way to "end" * if it had been map-aligned, so this is not really * a hole in the range and we can proceed. */ prev = end; } } #endif /* __arm64__ */ if (end > prev) { vm_map_unlock(map); return KERN_INVALID_ADDRESS; } /* * Go back and fix up protections. * Clip to start here if the range starts within * the entry. */ current = entry; if (current != vm_map_to_entry(map)) { /* clip and unnest if necessary */ vm_map_clip_start(map, current, start); } while ((current != vm_map_to_entry(map)) && (current->vme_start < end)) { vm_prot_t old_prot; if (current->in_transition) { wait_result_t wait_result; vm_map_offset_t current_start; /* * Another thread is wiring/unwiring this entry. * Let the other thread know we are waiting. */ current_start = current->vme_start; current->needs_wakeup = true; /* wait for the other thread to be done */ wait_result = vm_map_entry_wait(map, TH_UNINT); /* * We unlocked the map, so anything could have changed in the * range and we need to re-check from "current_start" to "end". * Our entries might no longer be valid. */ current = NULL; entry = NULL; /* * Re-lookup and re-clip "current_start". * If it's no longer mapped, we get the next entry. */ vm_map_lookup_entry_or_next(map, current_start, &current); if (current != vm_map_to_entry(map)) { vm_map_clip_start(map, current, current_start); } /* restart from this point */ start = current_start; goto restart_after_unlock; } vm_map_clip_end(map, current, end); #if DEVELOPMENT || DEBUG if (current->csm_associated && vm_log_xnu_user_debug) { printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n", proc_selfpid(), (get_bsdtask_info(current_task()) ?
proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, (uint64_t)start, (uint64_t)end, new_prot, map, current, current->vme_start, current->vme_end, current->protection, current->max_protection); } #endif /* DEVELOPMENT || DEBUG */ if (current->is_sub_map) { /* clipping did unnest if needed */ assert(!current->use_pmap); } old_prot = current->protection; if (set_max) { current->max_protection = new_prot; /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */ current->protection = (new_prot & old_prot); } else { current->protection = new_prot; } #if CODE_SIGNING_MONITOR if (!current->vme_xnu_user_debug && /* a !csm_associated mapping becoming executable */ ((!current->csm_associated && !(old_prot & VM_PROT_EXECUTE) && (current->protection & VM_PROT_EXECUTE)) || /* a csm_associated mapping becoming writable */ (current->csm_associated && !(old_prot & VM_PROT_WRITE) && (current->protection & VM_PROT_WRITE)))) { /* * This mapping has not already been marked as * "user_debug" and it is either: * 1. not code-signing-monitored and becoming executable * 2. code-signing-monitored and becoming writable, * so inform the CodeSigningMonitor and mark the * mapping as "user_debug" if appropriate. */ vm_map_kernel_flags_t vmk_flags; vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; /* pretend it's a vm_protect(VM_PROT_COPY)... */ vmk_flags.vmkf_remap_prot_copy = true; kr = vm_map_entry_cs_associate(map, current, vmk_flags); #if DEVELOPMENT || DEBUG if (vm_log_xnu_user_debug) { printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, __LINE__, map, current, current->vme_start, current->vme_end, old_prot, current->protection, kr, current->vme_xnu_user_debug); } #endif /* DEVELOPMENT || DEBUG */ } #endif /* CODE_SIGNING_MONITOR */ /* * Update physical map if necessary. * If the request is to turn off write protection, * we won't do it for real (in pmap). This is because * it would cause copy-on-write to fail. We've already * set the new protection in the map, so if a * write-protect fault occurs, it will be fixed up * properly, COW or not. */ if (current->protection != old_prot) { /* Look one level in: we support nested pmaps */ /* from mapped submaps which are direct entries */ /* in our map */ vm_prot_t prot; prot = current->protection; if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) { prot &= ~VM_PROT_WRITE; } else { assert(!VME_OBJECT(current)->code_signed); assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE); if (prot & VM_PROT_WRITE) { /* * For write requests on the * compressor, we will ask the * pmap layer to prevent us from * taking a write fault when we * attempt to access the mapping * next.
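 *
 * Illustrative example of the common, lazy path described above (not
 * from the original source): when a caller re-adds VM_PROT_WRITE, the
 * protection pushed into the pmap below has VM_PROT_WRITE stripped,
 * so the page stays read-only in hardware; the next store takes a
 * write fault and vm_fault performs the copy-on-write (if needed)
 * before granting write access. Only this compressor case asks the
 * pmap to apply write access eagerly.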
*/ pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE; } } if (override_nx(map, VME_ALIAS(current)) && prot) { prot |= VM_PROT_EXECUTE; } #if DEVELOPMENT || DEBUG if (!(old_prot & VM_PROT_EXECUTE) && (prot & VM_PROT_EXECUTE) && panic_on_unsigned_execute && (proc_selfcsflags() & CS_KILL)) { panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot); } #endif /* DEVELOPMENT || DEBUG */ if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) { if (current->wired_count) { panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count); } /* If the pmap layer cares about this * protection type, force a fault for * each page so that vm_fault will * repopulate the page with the full * set of protections. */ /* * TODO: We don't seem to need this, * but this is due to an internal * implementation detail of * pmap_protect. Do we want to rely * on this? */ prot = VM_PROT_NONE; } if (current->is_sub_map && current->use_pmap) { pmap_protect(VME_SUBMAP(current)->pmap, current->vme_start, current->vme_end, prot); } else { pmap_protect_options(map->pmap, current->vme_start, current->vme_end, prot, pmap_options, NULL); } } current = current->vme_next; } if (entry == VM_MAP_ENTRY_NULL) { /* * Re-lookup the original start of our range. * If it's no longer mapped, start with the next mapping. */ vm_map_lookup_entry_or_next(map, original_start, &entry); } current = entry; while ((current != vm_map_to_entry(map)) && (current->vme_start <= end)) { vm_map_simplify_entry(map, current); current = current->vme_next; } vm_map_unlock(map); return KERN_SUCCESS; } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_inherit_sanitize( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_inherit_ut new_inheritance_u, vm_map_offset_t *start, vm_map_offset_t *end, vm_inherit_t *new_inheritance) { kern_return_t kr; vm_map_size_t size; kr = vm_sanitize_inherit(new_inheritance_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance); if (__improbable(kr != KERN_SUCCESS)) { return kr; } kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT, map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } return KERN_SUCCESS; } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. */ kern_return_t vm_map_inherit( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_inherit_ut new_inheritance_u) { vm_map_entry_t entry; vm_map_entry_t temp_entry; kern_return_t kr; vm_map_offset_t start; vm_map_offset_t end; vm_inherit_t new_inheritance; kr = vm_map_inherit_sanitize(map, start_u, end_u, new_inheritance_u, &start, &end, &new_inheritance); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; } else { temp_entry = temp_entry->vme_next; entry = temp_entry; } /* first check entire range for submaps which can't support the */ /* given inheritance. 
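 *
 * Illustrative example (not from the original source): after
 * vm_map_inherit(map, start, end, VM_INHERIT_NONE), a vm_map_fork()
 * leaves that range unmapped in the child; VM_INHERIT_SHARE gives the
 * child a shared mapping of the same object, and VM_INHERIT_COPY a
 * copy-on-write copy (which is why submap entries reject it below).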
*/ while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { if (entry->is_sub_map) { if (new_inheritance == VM_INHERIT_COPY) { vm_map_unlock(map); return KERN_INVALID_ARGUMENT; } } entry = entry->vme_next; } entry = temp_entry; if (entry != vm_map_to_entry(map)) { /* clip and unnest if necessary */ vm_map_clip_start(map, entry, start); } while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { vm_map_clip_end(map, entry, end); if (entry->is_sub_map) { /* clip did unnest if needed */ assert(!entry->use_pmap); } entry->inheritance = new_inheritance; entry = entry->vme_next; } vm_map_unlock(map); return KERN_SUCCESS; } /* * Update the accounting for the amount of wired memory in this map. If the user has * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails. */ static kern_return_t add_wire_counts( vm_map_t map, vm_map_entry_t entry, boolean_t user_wire) { vm_map_size_t size; bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0; if (user_wire) { unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count; /* * We're wiring memory at the request of the user. Check if this is the first time the user is wiring * this map entry. */ if (entry->user_wired_count == 0) { size = entry->vme_end - entry->vme_start; /* * Since this is the first time the user is wiring this map entry, check to see if we're * exceeding the user wire limits. There is a per map limit which is the smaller of either * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also * a system-wide limit on the amount of memory all users can wire. If the user is over either * limit, then we fail. */ if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) || size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) { if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) { #if DEVELOPMENT || DEBUG if (panic_on_mlock_failure) { panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size); } #endif /* DEVELOPMENT || DEBUG */ os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed); } else { os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed); #if DEVELOPMENT || DEBUG if (panic_on_mlock_failure) { panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size); } #endif /* DEVELOPMENT || DEBUG */ } return KERN_RESOURCE_SHORTAGE; } /* * The first time the user wires an entry, we also increment the wired_count and add this to * the total that has been wired in the map. */ if (entry->wired_count >= MAX_WIRE_COUNT) { return KERN_FAILURE; } entry->wired_count++; map->user_wire_size += size; } if (entry->user_wired_count >= MAX_WIRE_COUNT) { return KERN_FAILURE; } entry->user_wired_count++; } else { /* * The kernel's wiring the memory. Just bump the count and continue. */ if (entry->wired_count >= MAX_WIRE_COUNT) { panic("vm_map_wire: too many wirings"); } entry->wired_count++; } if (first_wire) { vme_btref_consider_and_set(entry, __builtin_frame_address(0)); } return KERN_SUCCESS; } /* * Update the memory wiring accounting now that the given map entry is being unwired. */ static void subtract_wire_counts( vm_map_t map, vm_map_entry_t entry, boolean_t user_wire) { if (user_wire) { /* * We're unwiring memory at the request of the user. See if we're removing the last user wire reference. 
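 *
 * Illustrative example (not from the original source), matching the
 * bookkeeping in add_wire_counts() above: two user wires of the same
 * entry take user_wired_count 0 -> 1 -> 2 but wired_count only
 * 0 -> 1 (and user_wire_size grows once); the two matching unwires
 * take user_wired_count 2 -> 1 -> 0, and only the last one drops
 * wired_count and user_wire_size below.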
*/ if (entry->user_wired_count == 1) { /* * We're removing the last user wire reference. Decrement the wired_count and the total * user wired memory for this map. */ assert(entry->wired_count >= 1); entry->wired_count--; map->user_wire_size -= entry->vme_end - entry->vme_start; } assert(entry->user_wired_count >= 1); entry->user_wired_count--; } else { /* * The kernel is unwiring the memory. Just update the count. */ assert(entry->wired_count >= 1); entry->wired_count--; } vme_btref_consider_and_put(entry); } int cs_executable_wire = 0; static kern_return_t vm_map_wire_nested( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_prot_t caller_prot, vm_tag_t tag, boolean_t user_wire, pmap_t map_pmap, vm_map_offset_t pmap_addr, ppnum_t *physpage_p) { vm_map_entry_t entry; vm_prot_t access_type; struct vm_map_entry *first_entry, tmp_entry; vm_map_t real_map; vm_map_offset_t s, e; kern_return_t rc; boolean_t need_wakeup; boolean_t main_map = FALSE; wait_interrupt_t interruptible_state; thread_t cur_thread; unsigned int last_timestamp; vm_map_size_t size; boolean_t wire_and_extract; vm_prot_t extra_prots; extra_prots = VM_PROT_COPY; extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE; #if XNU_TARGET_OS_OSX if (map->pmap == kernel_pmap || !vm_map_cs_enforcement(map)) { extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE; } #endif /* XNU_TARGET_OS_OSX */ #if CODE_SIGNING_MONITOR if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) { extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE; } #endif /* CODE_SIGNING_MONITOR */ access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC)); wire_and_extract = FALSE; if (physpage_p != NULL) { /* * The caller wants the physical page number of the * wired page. We return only one physical page number * so this works for only one page at a time. * * The only caller (vm_map_wire_and_extract) * guarantees it. */ assert(end - start == VM_MAP_PAGE_SIZE(map)); wire_and_extract = TRUE; *physpage_p = 0; } VM_MAP_RANGE_CHECK(map, start, end); assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); if (start == end) { /* We wired what the caller asked for, zero pages */ return KERN_SUCCESS; } vm_map_lock(map); if (map_pmap == NULL) { main_map = TRUE; } last_timestamp = map->timestamp; need_wakeup = FALSE; cur_thread = current_thread(); s = start; rc = KERN_SUCCESS; if (vm_map_lookup_entry(map, s, &first_entry)) { entry = first_entry; /* * vm_map_clip_start will be done later. * We don't want to unnest any nested submaps here ! */ } else { /* Start address is not in map */ rc = KERN_INVALID_ADDRESS; goto done; } while ((entry != vm_map_to_entry(map)) && (s < end)) { /* * At this point, we have wired from "start" to "s". * We still need to wire from "s" to "end". * * "entry" hasn't been clipped, so it could start before "s" * and/or end after "end". */ /* "e" is how far we want to wire in this entry */ e = entry->vme_end; if (e > end) { e = end; } /* * If another thread is wiring/unwiring this entry then * block after informing other thread to wake us up. */ if (entry->in_transition) { wait_result_t wait_result; /* * We have not clipped the entry. Make sure that * the start address is in range so that the lookup * below will succeed. * "s" is the current starting point: we've already * wired from "start" to "s" and we still have * to wire from "s" to "end". */ entry->needs_wakeup = TRUE; /* * wake up anybody waiting on entries that we have * already wired. 
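 *
 * Handshake sketch (illustrative, not from the original source): we
 * set entry->needs_wakeup and sleep in vm_map_entry_wait(); the
 * thread that clears in_transition calls vm_map_entry_wakeup(); on
 * waking we must re-lookup "s" because the map was unlocked and the
 * entry may have been clipped or even freed in the meantime.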
*/ if (need_wakeup) { vm_map_entry_wakeup(map); need_wakeup = FALSE; } /* * User wiring is interruptible */ wait_result = vm_map_entry_wait(map, (user_wire) ? THREAD_ABORTSAFE : THREAD_UNINT); if (user_wire && wait_result == THREAD_INTERRUPTED) { /* * undo the wirings we have done so far. * We do not clear the needs_wakeup flag, * because we cannot tell if we were the * only one waiting. */ rc = KERN_FAILURE; goto done; } /* * Cannot avoid a lookup here. Reset the timestamp. */ last_timestamp = map->timestamp; /* * The entry could have been clipped, so look it up again. * The worst that can happen is that it may not exist anymore. */ if (!vm_map_lookup_entry(map, s, &first_entry)) { /* * User: undo everything up to the previous * entry. Let vm_map_unwire worry about * checking the validity of the range. */ rc = KERN_FAILURE; goto done; } entry = first_entry; continue; } if (entry->is_sub_map) { vm_map_offset_t sub_start; vm_map_offset_t sub_end; vm_map_offset_t local_start; vm_map_offset_t local_end; pmap_t pmap; if (wire_and_extract) { /* * Wiring would result in copy-on-write * which would not be compatible with * the sharing we have with the original * provider of this memory. */ rc = KERN_INVALID_ARGUMENT; goto done; } vm_map_clip_start(map, entry, s); vm_map_clip_end(map, entry, end); sub_start = VME_OFFSET(entry); sub_end = entry->vme_end; sub_end += VME_OFFSET(entry) - entry->vme_start; local_end = entry->vme_end; if (map_pmap == NULL) { vm_object_t object; vm_object_offset_t offset; vm_prot_t prot; boolean_t wired; vm_map_entry_t local_entry; vm_map_version_t version; vm_map_t lookup_map; if (entry->use_pmap) { pmap = VME_SUBMAP(entry)->pmap; /* ppc implementation requires that */ /* submaps pmap address ranges line */ /* up with parent map */ #ifdef notdef pmap_addr = sub_start; #endif pmap_addr = s; } else { pmap = map->pmap; pmap_addr = s; } if (entry->wired_count) { if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { goto done; } /* * The map was not unlocked: * no need to goto re-lookup. * Just go directly to next entry. */ entry = entry->vme_next; s = entry->vme_start; continue; } /* call vm_map_lookup_and_lock_object to */ /* cause any needs copy to be */ /* evaluated */ local_start = entry->vme_start; lookup_map = map; vm_map_lock_write_to_read(map); rc = vm_map_lookup_and_lock_object( &lookup_map, local_start, (access_type | extra_prots), OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired, NULL, &real_map, NULL); if (rc != KERN_SUCCESS) { vm_map_unlock_read(lookup_map); assert(map_pmap == NULL); vm_map_unwire_nested(map, start, s, user_wire, PMAP_NULL, 0); return rc; } vm_object_unlock(object); if (real_map != lookup_map) { vm_map_unlock(real_map); } vm_map_unlock_read(lookup_map); vm_map_lock(map); /* we unlocked, so must re-lookup */ if (!vm_map_lookup_entry(map, local_start, &local_entry)) { rc = KERN_FAILURE; goto done; } /* * entry could have been "simplified", * so re-clip */ entry = local_entry; assert(s == local_start); vm_map_clip_start(map, entry, s); vm_map_clip_end(map, entry, end); /* re-compute "e" */ e = entry->vme_end; if (e > end) { e = end; } /* did we have a change of type?
*/ if (!entry->is_sub_map) { last_timestamp = map->timestamp; continue; } } else { local_start = entry->vme_start; pmap = map_pmap; } if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { goto done; } entry->in_transition = TRUE; vm_map_unlock(map); rc = vm_map_wire_nested(VME_SUBMAP(entry), sub_start, sub_end, caller_prot, tag, user_wire, pmap, pmap_addr, NULL); vm_map_lock(map); /* * Find the entry again. It could have been clipped * after we unlocked the map. */ if (!vm_map_lookup_entry(map, local_start, &first_entry)) { panic("vm_map_wire: re-lookup failed"); } entry = first_entry; assert(local_start == s); /* re-compute "e" */ e = entry->vme_end; if (e > end) { e = end; } last_timestamp = map->timestamp; while ((entry != vm_map_to_entry(map)) && (entry->vme_start < e)) { assert(entry->in_transition); entry->in_transition = FALSE; if (entry->needs_wakeup) { entry->needs_wakeup = FALSE; need_wakeup = TRUE; } if (rc != KERN_SUCCESS) {/* from vm_*_wire */ subtract_wire_counts(map, entry, user_wire); } entry = entry->vme_next; } if (rc != KERN_SUCCESS) { /* from vm_*_wire */ goto done; } /* no need to relookup again */ s = entry->vme_start; continue; } /* * If this entry is already wired then increment * the appropriate wire reference count. */ if (entry->wired_count) { if ((entry->protection & access_type) != access_type) { /* found a protection problem */ /* * XXX FBDP * We should always return an error * in this case but since we didn't * enforce it before, let's do * it only for the new "wire_and_extract" * code path for now... */ if (wire_and_extract) { rc = KERN_PROTECTION_FAILURE; goto done; } } /* * entry is already wired down, get our reference * after clipping to our range. */ vm_map_clip_start(map, entry, s); vm_map_clip_end(map, entry, end); if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { goto done; } if (wire_and_extract) { vm_object_t object; vm_object_offset_t offset; vm_page_t m; /* * We don't have to "wire" the page again * but we still have to "extract" its * physical page number, after some sanity * checks. */ assert((entry->vme_end - entry->vme_start) == PAGE_SIZE); assert(!entry->needs_copy); assert(!entry->is_sub_map); assert(VME_OBJECT(entry)); if (((entry->vme_end - entry->vme_start) != PAGE_SIZE) || entry->needs_copy || entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) { rc = KERN_INVALID_ARGUMENT; goto done; } object = VME_OBJECT(entry); offset = VME_OFFSET(entry); /* need exclusive lock to update m->dirty */ if (entry->protection & VM_PROT_WRITE) { vm_object_lock(object); } else { vm_object_lock_shared(object); } m = vm_page_lookup(object, offset); assert(m != VM_PAGE_NULL); assert(VM_PAGE_WIRED(m)); if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) { *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (entry->protection & VM_PROT_WRITE) { vm_object_lock_assert_exclusive( object); m->vmp_dirty = TRUE; } } else { /* not already wired !? */ *physpage_p = 0; } vm_object_unlock(object); } /* map was not unlocked: no need to relookup */ entry = entry->vme_next; s = entry->vme_start; continue; } /* * Unwired entry or wire request transmitted via submap */ /* * Wiring would copy the pages to the shadow object. * The shadow object would not be code-signed so * attempting to execute code from these copied pages * would trigger a code-signing violation.
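 *
 * Example consequence (illustrative, not from the original source):
 * an mlock()-style user wire of a process's own executable pages on a
 * code-signing-enforced map fails here with KERN_PROTECTION_FAILURE
 * and bumps the cs_executable_wire counter below.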
*/ if ((entry->protection & VM_PROT_EXECUTE) #if XNU_TARGET_OS_OSX && map->pmap != kernel_pmap && (vm_map_cs_enforcement(map) #if __arm64__ || !VM_MAP_IS_EXOTIC(map) #endif /* __arm64__ */ ) #endif /* XNU_TARGET_OS_OSX */ #if CODE_SIGNING_MONITOR && (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) #endif ) { #if MACH_ASSERT printf("pid %d[%s] wiring executable range from " "0x%llx to 0x%llx: rejected to preserve " "code-signing\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), (uint64_t) entry->vme_start, (uint64_t) entry->vme_end); #endif /* MACH_ASSERT */ DTRACE_VM2(cs_executable_wire, uint64_t, (uint64_t)entry->vme_start, uint64_t, (uint64_t)entry->vme_end); cs_executable_wire++; rc = KERN_PROTECTION_FAILURE; goto done; } /* * Perform actions of vm_map_lookup that need the write * lock on the map: create a shadow object for a * copy-on-write region, or an object for a zero-fill * region. */ size = entry->vme_end - entry->vme_start; /* * If wiring a copy-on-write page, we need to copy it now * even if we're only (currently) requesting read access. * This is aggressive, but once it's wired we can't move it. */ if (entry->needs_copy) { if (wire_and_extract) { /* * We're supposed to share with the original * provider so should not be "needs_copy" */ rc = KERN_INVALID_ARGUMENT; goto done; } VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map)); entry->needs_copy = FALSE; } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) { if (wire_and_extract) { /* * We're supposed to share with the original * provider so should already have an object. */ rc = KERN_INVALID_ARGUMENT; goto done; } VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0); VME_OFFSET_SET(entry, (vm_object_offset_t)0); assert(entry->use_pmap); } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { if (wire_and_extract) { /* * We're supposed to share with the original * provider so should not be COPY_SYMMETRIC. */ rc = KERN_INVALID_ARGUMENT; goto done; } /* * Force an unrequested "copy-on-write" but only for * the range we're wiring. */ // printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract); vm_map_clip_start(map, entry, s); vm_map_clip_end(map, entry, end); /* recompute "size" */ size = entry->vme_end - entry->vme_start; /* make a shadow object */ vm_object_t orig_object; vm_object_offset_t orig_offset; orig_object = VME_OBJECT(entry); orig_offset = VME_OFFSET(entry); VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map)); if (VME_OBJECT(entry) != orig_object) { /* * This mapping has not been shared (or it would be * COPY_DELAY instead of COPY_SYMMETRIC) and it has * not been copied-on-write (or it would be marked * as "needs_copy" and would have been handled above * and also already write-protected). * We still need to write-protect here to prevent * other threads from modifying these pages while * we're in the process of copying and wiring * the copied pages. * Since the mapping is neither shared nor COWed, * we only need to write-protect the PTEs for this * mapping. */ vm_object_pmap_protect(orig_object, orig_offset, size, map->pmap, VM_MAP_PAGE_SIZE(map), entry->vme_start, entry->protection & ~VM_PROT_WRITE); } } if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { /* * Make the object COPY_DELAY to get a stable object * to wire. 
* That should avoid creating long shadow chains while * wiring/unwiring the same range repeatedly. * That also prevents part of the object from being * wired while another part is "needs_copy", which * could result in conflicting rules wrt copy-on-write. */ vm_object_t object; object = VME_OBJECT(entry); vm_object_lock(object); if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size, "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n", object, (uint64_t)object->vo_size, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)VME_OFFSET(entry), (uint64_t)size); assertf(os_ref_get_count_raw(&object->ref_count) == 1, "object %p ref_count %d\n", object, os_ref_get_count_raw(&object->ref_count)); assertf(!entry->needs_copy, "entry %p\n", entry); object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; VM_OBJECT_SET_TRUE_SHARE(object, TRUE); } vm_object_unlock(object); } vm_map_clip_start(map, entry, s); vm_map_clip_end(map, entry, end); /* re-compute "e" */ e = entry->vme_end; if (e > end) { e = end; } /* * Check for holes and protection mismatch. * Holes: Next entry should be contiguous unless this * is the end of the region. * Protection: Access requested must be allowed, unless * wiring is by protection class */ if ((entry->vme_end < end) && ((entry->vme_next == vm_map_to_entry(map)) || (entry->vme_next->vme_start > entry->vme_end))) { /* found a hole */ rc = KERN_INVALID_ADDRESS; goto done; } if ((entry->protection & access_type) != access_type) { /* found a protection problem */ rc = KERN_PROTECTION_FAILURE; goto done; } assert(entry->wired_count == 0 && entry->user_wired_count == 0); if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { goto done; } entry->in_transition = TRUE; /* * This entry might get split once we unlock the map. * In vm_fault_wire(), we need the current range as * defined by this entry. In order for this to work * along with a simultaneous clip operation, we make a * temporary copy of this entry and use that for the * wiring. Note that the underlying objects do not * change during a clip. */ tmp_entry = *entry; /* * The in_transition state guarantees that the entry * (or entries for this range, if a split occurred) will be * there when the map lock is acquired for the second time. */ vm_map_unlock(map); if (!user_wire && cur_thread != THREAD_NULL) { interruptible_state = thread_interrupt_level(THREAD_UNINT); } else { interruptible_state = THREAD_UNINT; } if (map_pmap) { rc = vm_fault_wire(map, &tmp_entry, caller_prot, tag, map_pmap, pmap_addr, physpage_p); } else { rc = vm_fault_wire(map, &tmp_entry, caller_prot, tag, map->pmap, tmp_entry.vme_start, physpage_p); } if (!user_wire && cur_thread != THREAD_NULL) { thread_interrupt_level(interruptible_state); } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Find the entry again. It could have been clipped * after we unlocked the map.
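 *
 * (Illustrative note, not from the original source: the timestamp
 * advances on every map unlock, so "last_timestamp + 1 ==
 * map->timestamp" means nobody else modified the map while we were
 * in vm_fault_wire() and "entry" can be trusted without a re-lookup.)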
*/ if (!vm_map_lookup_entry(map, tmp_entry.vme_start, &first_entry)) { panic("vm_map_wire: re-lookup failed"); } entry = first_entry; } last_timestamp = map->timestamp; while ((entry != vm_map_to_entry(map)) && (entry->vme_start < tmp_entry.vme_end)) { assert(entry->in_transition); entry->in_transition = FALSE; if (entry->needs_wakeup) { entry->needs_wakeup = FALSE; need_wakeup = TRUE; } if (rc != KERN_SUCCESS) { /* from vm_*_wire */ subtract_wire_counts(map, entry, user_wire); } entry = entry->vme_next; } if (rc != KERN_SUCCESS) { /* from vm_*_wire */ goto done; } if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */ (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */ (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */ /* found a "new" hole */ s = tmp_entry.vme_end; rc = KERN_INVALID_ADDRESS; goto done; } s = entry->vme_start; } /* end while loop through map entries */ done: if (rc == KERN_SUCCESS) { /* repair any damage we may have made to the VM map */ vm_map_simplify_range(map, start, end); } vm_map_unlock(map); /* * wake up anybody waiting on entries we wired. */ if (need_wakeup) { vm_map_entry_wakeup(map); } if (rc != KERN_SUCCESS) { /* undo what has been wired so far */ vm_map_unwire_nested(map, start, s, user_wire, map_pmap, pmap_addr); if (physpage_p) { *physpage_p = 0; } } return rc; } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_wire_sanitize( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut prot_u, vm_sanitize_caller_t vm_sanitize_caller, vm_map_offset_t *start, vm_map_offset_t *end, vm_map_size_t *size, vm_prot_t *prot) { kern_return_t kr; kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot); if (__improbable(kr != KERN_SUCCESS)) { return kr; } return KERN_SUCCESS; } /* * Validation function for vm_map_wire_nested(). 
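 *
 * (Illustrative aside, not part of the original source: from user
 * space these wire paths are typically reached via mlock(2), which
 * funnels into vm_map_wire_kernel() on the caller's map, while
 * munlock(2) ends up in vm_map_unwire().  A hedged sketch, assuming
 * a page-aligned buffer:
 *
 *	void *buf = valloc(len);	// hypothetical page-aligned buffer
 *	if (mlock(buf, len) == 0) {	// wires [buf, buf+len)
 *		...			// pages stay resident here
 *		munlock(buf, len);	// unwires the range again
 *	}
 *
 * so the sanitize/wire/unwire sequence below runs on behalf of
 * those syscalls.)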
*/ kern_return_t vm_map_wire_impl( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut prot_u, vm_tag_t tag, boolean_t user_wire, ppnum_t *physpage_p, vm_sanitize_caller_t vm_sanitize_caller) { vm_map_offset_t start, end; vm_map_size_t size; vm_prot_t prot; kern_return_t kr; /* * Sanitize any input parameters that are addr/size/prot/inherit */ kr = vm_map_wire_sanitize(map, start_u, end_u, prot_u, vm_sanitize_caller, &start, &end, &size, &prot); if (__improbable(kr != KERN_SUCCESS)) { if (physpage_p) { *physpage_p = 0; } return vm_sanitize_get_kr(kr); } return vm_map_wire_nested(map, start, end, prot, tag, user_wire, PMAP_NULL, 0, physpage_p); } kern_return_t vm_map_wire_external( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut prot_u, boolean_t user_wire) { vm_tag_t tag = vm_tag_bt(); return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire); } kern_return_t vm_map_wire_kernel( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut prot_u, vm_tag_t tag, boolean_t user_wire) { return vm_map_wire_impl(map, start_u, end_u, prot_u, tag, user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE); } #if XNU_PLATFORM_MacOSX kern_return_t vm_map_wire_and_extract( vm_map_t map, vm_map_offset_ut start_u, vm_prot_ut prot_u, boolean_t user_wire, ppnum_t *physpage_p) { vm_tag_t tag = vm_tag_bt(); vm_map_size_ut size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map)); vm_map_offset_ut end_u = vm_sanitize_compute_ut_end(start_u, size_u); return vm_map_wire_impl(map, start_u, end_u, prot_u, tag, user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE); } #endif /* XNU_PLATFORM_MacOSX */ static kern_return_t vm_map_unwire_nested( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, boolean_t user_wire, pmap_t map_pmap, vm_map_offset_t pmap_addr) { vm_map_entry_t entry; struct vm_map_entry *first_entry, tmp_entry; boolean_t need_wakeup; boolean_t main_map = FALSE; unsigned int last_timestamp; VM_MAP_RANGE_CHECK(map, start, end); assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); if (start == end) { /* We unwired what the caller asked for: zero pages */ return KERN_SUCCESS; } vm_map_lock(map); if (map_pmap == NULL) { main_map = TRUE; } last_timestamp = map->timestamp; if (vm_map_lookup_entry(map, start, &first_entry)) { entry = first_entry; /* * vm_map_clip_start will be done later. * We don't want to unnest any nested sub maps here ! */ } else { if (!user_wire) { panic("vm_map_unwire: start not found"); } /* Start address is not in map. */ vm_map_unlock(map); return KERN_INVALID_ADDRESS; } if (entry->superpage_size) { /* superpages are always wired */ vm_map_unlock(map); return KERN_INVALID_ADDRESS; } need_wakeup = FALSE; while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { if (entry->in_transition) { /* * 1) * Another thread is wiring down this entry. Note * that if it is not for the other thread we would * be unwiring an unwired entry. This is not * permitted. If we wait, we will be unwiring memory * we did not wire. * * 2) * Another thread is unwiring this entry. We did not * have a reference to it, because if we did, this * entry will not be getting unwired now. */ if (!user_wire) { /* * XXX FBDP * This could happen: there could be some * overlapping vslock/vsunlock operations * going on. 
* We should probably just wait and retry, * but then we have to be careful that this * entry could get "simplified" after * "in_transition" gets unset and before * we re-lookup the entry, so we would * have to re-clip the entry to avoid * re-unwiring what we have already unwired... * See vm_map_wire_nested(). * * Or we could just ignore "in_transition" * here and proceed to decrement the wired * count(s) on this entry. That should be fine * as long as "wired_count" doesn't drop all * the way to 0 (and we should panic if THAT * happens). */ panic("vm_map_unwire: in_transition entry"); } entry = entry->vme_next; continue; } if (entry->is_sub_map) { vm_map_offset_t sub_start; vm_map_offset_t sub_end; vm_map_offset_t local_end; pmap_t pmap; vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); sub_start = VME_OFFSET(entry); sub_end = entry->vme_end - entry->vme_start; sub_end += VME_OFFSET(entry); local_end = entry->vme_end; if (map_pmap == NULL) { if (entry->use_pmap) { pmap = VME_SUBMAP(entry)->pmap; pmap_addr = sub_start; } else { pmap = map->pmap; pmap_addr = start; } if (entry->wired_count == 0 || (user_wire && entry->user_wired_count == 0)) { if (!user_wire) { panic("vm_map_unwire: entry is unwired"); } entry = entry->vme_next; continue; } /* * Check for holes * Holes: Next entry should be contiguous unless * this is the end of the region. */ if (((entry->vme_end < end) && ((entry->vme_next == vm_map_to_entry(map)) || (entry->vme_next->vme_start > entry->vme_end)))) { if (!user_wire) { panic("vm_map_unwire: non-contiguous region"); } /* * entry = entry->vme_next; * continue; */ } subtract_wire_counts(map, entry, user_wire); if (entry->wired_count != 0) { entry = entry->vme_next; continue; } entry->in_transition = TRUE; tmp_entry = *entry;/* see comment in vm_map_wire() */ /* * We can unlock the map now. The in_transition state * guarantees existence of the entry. */ vm_map_unlock(map); vm_map_unwire_nested(VME_SUBMAP(entry), sub_start, sub_end, user_wire, pmap, pmap_addr); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Find the entry again. It could have been * clipped or deleted after we unlocked the map. */ if (!vm_map_lookup_entry(map, tmp_entry.vme_start, &first_entry)) { if (!user_wire) { panic("vm_map_unwire: re-lookup failed"); } entry = first_entry->vme_next; } else { entry = first_entry; } } last_timestamp = map->timestamp; /* * clear transition bit for all constituent entries * that were in the original entry (saved in * tmp_entry). Also check for waiters. */ while ((entry != vm_map_to_entry(map)) && (entry->vme_start < tmp_entry.vme_end)) { assert(entry->in_transition); entry->in_transition = FALSE; if (entry->needs_wakeup) { entry->needs_wakeup = FALSE; need_wakeup = TRUE; } entry = entry->vme_next; } continue; } else { tmp_entry = *entry; vm_map_unlock(map); vm_map_unwire_nested(VME_SUBMAP(entry), sub_start, sub_end, user_wire, map_pmap, pmap_addr); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Find the entry again. It could have been * clipped or deleted after we unlocked the map.
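 *
 * (Aside, not in the original: this re-lookup dance is the
 * map-timestamp protocol used throughout this file.  The common
 * shape, sketched:
 *
 *	last_timestamp = map->timestamp;
 *	vm_map_unlock(map);		// timestamp advances on unlock
 *	...				// work without the map lock
 *	vm_map_lock(map);
 *	if (last_timestamp + 1 != map->timestamp) {
 *		// someone else locked/unlocked the map in between:
 *		// the entry may have been clipped, merged or freed,
 *		// so it must be looked up again before being touched
 *	}
 *
 * A re-lookup is needed only when another thread took the lock
 * while we were away.)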
*/ if (!vm_map_lookup_entry(map, tmp_entry.vme_start, &first_entry)) { if (!user_wire) { panic("vm_map_unwire: re-lookup failed"); } entry = first_entry->vme_next; } else { entry = first_entry; } } last_timestamp = map->timestamp; } } if ((entry->wired_count == 0) || (user_wire && entry->user_wired_count == 0)) { if (!user_wire) { panic("vm_map_unwire: entry is unwired"); } entry = entry->vme_next; continue; } assert(entry->wired_count > 0 && (!user_wire || entry->user_wired_count > 0)); vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Check for holes * Holes: Next entry should be contiguous unless * this is the end of the region. */ if (((entry->vme_end < end) && ((entry->vme_next == vm_map_to_entry(map)) || (entry->vme_next->vme_start > entry->vme_end)))) { if (!user_wire) { panic("vm_map_unwire: non-contiguous region"); } entry = entry->vme_next; continue; } subtract_wire_counts(map, entry, user_wire); if (entry->wired_count != 0) { entry = entry->vme_next; continue; } if (entry->zero_wired_pages) { entry->zero_wired_pages = FALSE; } entry->in_transition = TRUE; tmp_entry = *entry; /* see comment in vm_map_wire() */ /* * We can unlock the map now. The in_transition state * guarantees existence of the entry. */ vm_map_unlock(map); if (map_pmap) { vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap, pmap_addr, tmp_entry.vme_end); } else { vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap, tmp_entry.vme_start, tmp_entry.vme_end); } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Find the entry again. It could have been clipped * or deleted after we unlocked the map. */ if (!vm_map_lookup_entry(map, tmp_entry.vme_start, &first_entry)) { if (!user_wire) { panic("vm_map_unwire: re-lookup failed"); } entry = first_entry->vme_next; } else { entry = first_entry; } } last_timestamp = map->timestamp; /* * clear transition bit for all constituent entries that * were in the original entry (saved in tmp_entry). Also * check for waiters. */ while ((entry != vm_map_to_entry(map)) && (entry->vme_start < tmp_entry.vme_end)) { assert(entry->in_transition); entry->in_transition = FALSE; if (entry->needs_wakeup) { entry->needs_wakeup = FALSE; need_wakeup = TRUE; } entry = entry->vme_next; } } /* * We might have fragmented the address space when we wired this * range of addresses. Attempt to re-coalesce these VM map entries * with their neighbors now that they're no longer wired. * Under some circumstances, address space fragmentation can * prevent VM object shadow chain collapsing, which can cause * swap space leaks. */ vm_map_simplify_range(map, start, end); vm_map_unlock(map); /* * wake up anybody waiting on entries that we have unwired.
*/ if (need_wakeup) { vm_map_entry_wakeup(map); } return KERN_SUCCESS; } kern_return_t vm_map_unwire( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, boolean_t user_wire) { return vm_map_unwire_impl(map, start_u, end_u, user_wire, VM_SANITIZE_CALLER_VM_MAP_UNWIRE); } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_unwire_sanitize( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_sanitize_caller_t vm_sanitize_caller, vm_map_offset_t *start, vm_map_offset_t *end, vm_map_size_t *size) { return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, size); } kern_return_t vm_map_unwire_impl( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, boolean_t user_wire, vm_sanitize_caller_t vm_sanitize_caller) { vm_map_offset_t start, end; vm_map_size_t size; kern_return_t kr; /* * Sanitize any input parameters that are addr/size/prot/inherit */ kr = vm_map_unwire_sanitize( map, start_u, end_u, vm_sanitize_caller, &start, &end, &size); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } return vm_map_unwire_nested(map, start, end, user_wire, (pmap_t)NULL, 0); } /* * vm_map_entry_zap: [ internal use only ] * * Remove the entry from the target map * and put it on a zap list. */ static void vm_map_entry_zap( vm_map_t map, vm_map_entry_t entry, vm_map_zap_t zap) { vm_map_offset_t s, e; s = entry->vme_start; e = entry->vme_end; assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK)); assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK)); if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) { assert(page_aligned(s)); assert(page_aligned(e)); } if (entry->map_aligned == TRUE) { assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))); assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map))); } assert(entry->wired_count == 0); assert(entry->user_wired_count == 0); assert(!entry->vme_permanent); vm_map_store_entry_unlink(map, entry, false); map->size -= e - s; vm_map_zap_append(zap, entry); } static void vm_map_submap_pmap_clean( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_map_t sub_map, vm_map_offset_t offset) { vm_map_offset_t submap_start; vm_map_offset_t submap_end; vm_map_size_t remove_size; vm_map_entry_t entry; submap_end = offset + (end - start); submap_start = offset; vm_map_lock_read(sub_map); if (vm_map_lookup_entry(sub_map, offset, &entry)) { remove_size = (entry->vme_end - entry->vme_start); if (offset > entry->vme_start) { remove_size -= offset - entry->vme_start; } if (submap_end < entry->vme_end) { remove_size -= entry->vme_end - submap_end; } if (entry->is_sub_map) { vm_map_submap_pmap_clean( sub_map, start, start + remove_size, VME_SUBMAP(entry), VME_OFFSET(entry)); } else { if (map->mapped_in_other_pmaps && os_ref_get_count_raw(&map->map_refcnt) != 0 && VME_OBJECT(entry) != NULL) { vm_object_pmap_protect_options( VME_OBJECT(entry), (VME_OFFSET(entry) + offset - entry->vme_start), remove_size, PMAP_NULL, PAGE_SIZE, entry->vme_start, VM_PROT_NONE, PMAP_OPTIONS_REMOVE); } else { pmap_remove(map->pmap, (addr64_t)start, (addr64_t)(start + remove_size)); } } } entry = entry->vme_next; while ((entry != vm_map_to_entry(sub_map)) && (entry->vme_start < submap_end)) { remove_size = (entry->vme_end - entry->vme_start); if (submap_end < entry->vme_end) { remove_size -= entry->vme_end - submap_end; } if (entry->is_sub_map) { vm_map_submap_pmap_clean( sub_map, (start + entry->vme_start) - offset, ((start + entry->vme_start) - offset) + remove_size, VME_SUBMAP(entry), 
VME_OFFSET(entry)); } else { if (map->mapped_in_other_pmaps && os_ref_get_count_raw(&map->map_refcnt) != 0 && VME_OBJECT(entry) != NULL) { vm_object_pmap_protect_options( VME_OBJECT(entry), VME_OFFSET(entry), remove_size, PMAP_NULL, PAGE_SIZE, entry->vme_start, VM_PROT_NONE, PMAP_OPTIONS_REMOVE); } else { pmap_remove(map->pmap, (addr64_t)((start + entry->vme_start) - offset), (addr64_t)(((start + entry->vme_start) - offset) + remove_size)); } } entry = entry->vme_next; } vm_map_unlock_read(sub_map); return; } /* * virt_memory_guard_ast: * * Handle the AST callout for a virtual memory guard. * raise an EXC_GUARD exception and terminate the task * if configured to do so. */ void virt_memory_guard_ast( thread_t thread, mach_exception_data_type_t code, mach_exception_data_type_t subcode) { task_t task = get_threadtask(thread); assert(task != kernel_task); assert(task == current_task()); kern_return_t sync_exception_result; uint32_t behavior; behavior = task->task_exc_guard; /* Is delivery enabled */ if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) { return; } /* If only once, make sure we're that once */ while (behavior & TASK_EXC_GUARD_VM_ONCE) { uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER; if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) { break; } behavior = task->task_exc_guard; if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) { return; } } const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL; /* Raise exception synchronously and see if handler claimed it */ sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal); if (fatal) { /* * If Synchronous EXC_GUARD delivery was successful then * kill the process and return, else kill the process * and deliver the exception via EXC_CORPSE_NOTIFY. */ int flags = PX_DEBUG_NO_HONOR; exception_info_t info = { .os_reason = OS_REASON_GUARD, .exception_type = EXC_GUARD, .mx_code = code, .mx_subcode = subcode }; if (sync_exception_result == KERN_SUCCESS) { flags |= PX_PSIGNAL; } exit_with_mach_exception(current_proc(), info, flags); } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) { /* * If the synchronous EXC_GUARD delivery was not successful, * raise a simulated crash. */ if (sync_exception_result != KERN_SUCCESS) { task_violated_guard(code, subcode, NULL, FALSE); } } } /* * vm_map_guard_exception: * * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception. * * Right now, we do this when we find nothing mapped, or a * gap in the mapping when a user address space deallocate * was requested. We report the address of the first gap found. */ static void vm_map_guard_exception( vm_map_offset_t gap_start, unsigned reason) { mach_exception_code_t code = 0; unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY; unsigned int target = 0; /* should we pass in pid associated with map? 
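 *
 * (Illustrative, not in the original: the 64-bit exception code is
 * assembled from three fields by the EXC_GUARD_ENCODE_* macros used
 * below, roughly
 *
 *	code = [ guard type | flavor (reason) | target ]
 *
 * with the offending address delivered separately in the subcode.)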
*/ mach_exception_data_type_t subcode = (uint64_t)gap_start; boolean_t fatal = FALSE; task_t task = current_task_early(); /* Can't deliver exceptions to a NULL task (early boot) or kernel task */ if (task == NULL || task == kernel_task) { return; } EXC_GUARD_ENCODE_TYPE(code, guard_type); EXC_GUARD_ENCODE_FLAVOR(code, reason); EXC_GUARD_ENCODE_TARGET(code, target); if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) { fatal = TRUE; } thread_guard_violation(current_thread(), code, subcode, fatal); } static kern_return_t vm_map_delete_submap_recurse( vm_map_t submap, vm_map_offset_t submap_start, vm_map_offset_t submap_end) { vm_map_entry_t submap_entry; /* * Verify that the submap does not contain any "permanent" entries * within the specified range. We permit TPRO ranges to be overwritten * as we only reach this path if TPRO const protection is disabled for a * given map. * * We do not care about gaps. */ vm_map_lock(submap); if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) { submap_entry = submap_entry->vme_next; } for (; submap_entry != vm_map_to_entry(submap) && submap_entry->vme_start < submap_end; submap_entry = submap_entry->vme_next) { if (submap_entry->vme_permanent #ifdef __arm64e__ /* allow TPRO submap entries to be overwritten */ && !submap_entry->used_for_tpro #endif ) { /* "permanent" entry -> fail */ vm_map_unlock(submap); return KERN_PROTECTION_FAILURE; } } /* no "permanent" entries in the range -> success */ vm_map_unlock(submap); return KERN_SUCCESS; } __abortlike static void __vm_map_delete_misaligned_panic( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x", map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map)); } __abortlike static void __vm_map_delete_failed_panic( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, kern_return_t kr) { panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d", map, (uint64_t)start, (uint64_t)end, kr); } __abortlike static void __vm_map_delete_gap_panic( vm_map_t map, vm_map_offset_t where, vm_map_offset_t start, vm_map_offset_t end) { panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx", map, (uint64_t)start, (uint64_t)end, (uint64_t)where); } __abortlike static void __vm_map_delete_permanent_panic( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_map_entry_t entry) { panic("vm_map_delete(%p,0x%llx,0x%llx): " "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]", map, (uint64_t)start, (uint64_t)end, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end); } __options_decl(vm_map_delete_state_t, uint32_t, { VMDS_NONE = 0x0000, VMDS_FOUND_GAP = 0x0001, VMDS_GAPS_OK = 0x0002, VMDS_KERNEL_PMAP = 0x0004, VMDS_NEEDS_LOOKUP = 0x0008, VMDS_NEEDS_WAKEUP = 0x0010, VMDS_KERNEL_KMEMPTR = 0x0020 }); /* * vm_map_clamp_to_pmap(map, start, end) * * Modify *start and *end so they fall within the bounds of map->pmap. 
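 *
 * Worked example (illustrative values only): with a pmap covering
 * [0x1000, 0x7ffffffff000), a request of [0x0, 0x2000) is clamped to
 * [0x1000, 0x2000), and a range lying entirely above the maximum
 * collapses to an empty range at the maximum.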
*/ #if MACH_ASSERT static void vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end) { vm_map_address_t min; vm_map_address_t max; #if __x86_64__ /* x86_64 struct pmap does not have min and max fields */ if (map->pmap == kernel_pmap) { min = VM_MIN_KERNEL_AND_KEXT_ADDRESS; max = VM_MAX_KERNEL_ADDRESS; } else { min = VM_MAP_MIN_ADDRESS; max = VM_MAP_MAX_ADDRESS; } #else min = map->pmap->min; max = map->pmap->max; #endif if (*start < min) { *start = min; } else if (*start > max) { *start = max; } if (*end < min) { *end = min; } else if (*end > max) { *end = max; } } #endif int vm_log_map_delete_permanent_prot_none = 0; /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target map. * Removes all user wirings. Unwires one kernel wiring if * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set. * * * When the map is a kernel map, then any error in removing mappings * will lead to a panic so that clients do not have to repeat the panic * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE * is also passed, then KERN_ABORTED will not lead to a panic. * * This routine is called with map locked and leaves map locked. */ static kmem_return_t vm_map_delete( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vmr_flags_t flags, kmem_guard_t guard, vm_map_zap_t zap_list) { vm_map_entry_t entry, next; int interruptible; vm_map_offset_t gap_start = 0; vm_map_offset_t clear_in_transition_end = 0; __unused vm_map_offset_t save_start = start; __unused vm_map_offset_t save_end = end; vm_map_delete_state_t state = VMDS_NONE; kmem_return_t ret = { }; vm_map_range_id_t range_id = 0; struct kmem_page_meta *meta = NULL; uint32_t size_idx, slot_idx; struct mach_vm_range slot; if (vm_map_pmap(map) == kernel_pmap) { state |= VMDS_KERNEL_PMAP; range_id = kmem_addr_get_range(start, end - start); if (kmem_is_ptr_range(range_id)) { state |= VMDS_KERNEL_KMEMPTR; slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta, &size_idx, &slot); } } if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) { state |= VMDS_GAPS_OK; } if (map->corpse_source && !(flags & VM_MAP_REMOVE_TO_OVERWRITE) && !map->terminated) { /* * The map is being used for corpse-related diagnostics. * So skip any entry removal to avoid perturbing the map state. * The cleanup will happen in task_terminate_internal after the * call to task_port_no_senders. */ goto out; } interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ? THREAD_ABORTSAFE : THREAD_UNINT; if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 && (start & VM_MAP_PAGE_MASK(map))) { __vm_map_delete_misaligned_panic(map, start, end); } if ((state & VMDS_GAPS_OK) == 0) { /* * If the map isn't terminated then all deletions must have * no gaps, and be within the [min, max) of the map. * * We got here without VM_MAP_RANGE_CHECK() being called, * and hence must validate bounds manually. * * It is worth noting that because vm_deallocate() will * round_page() the deallocation size, it's possible for "end" * to be 0 here due to overflow. We hence must treat it as being * beyond vm_map_max(map). * * Similarly, end < start means some wrap-around happened, * which should cause an error or panic.
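 *
 * (Concrete example, not in the original: with 4K pages,
 * vm_deallocate(map, 0xffffffffffffe000, 0x2000) yields
 * end == 0xffffffffffffe000 + 0x2000, which wraps to 0 in 64 bits;
 * treating that as "beyond vm_map_max(map)" is what makes the check
 * below reject it instead of mistaking it for an empty range.)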
*/ if (end == 0 || end > vm_map_max(map)) { state |= VMDS_FOUND_GAP; gap_start = vm_map_max(map); if (state & VMDS_KERNEL_PMAP) { __vm_map_delete_gap_panic(map, gap_start, start, end); } goto out; } if (end < start) { if (state & VMDS_KERNEL_PMAP) { __vm_map_delete_gap_panic(map, vm_map_max(map), start, end); } ret.kmr_return = KERN_INVALID_ARGUMENT; goto out; } if (start < vm_map_min(map)) { state |= VMDS_FOUND_GAP; gap_start = start; if (state & VMDS_KERNEL_PMAP) { __vm_map_delete_gap_panic(map, gap_start, start, end); } goto out; } } else { /* * If the map is terminated, we must accept start/end * being beyond the boundaries of the map as this is * how some of the mappings like commpage mappings * can be destroyed (they're outside of those bounds). * * end < start is still something we can't cope with, * so just bail. */ if (end < start) { goto out; } } /* * Find the start of the region. * * If in a superpage, extend the range * to include the start of the mapping. */ while (vm_map_lookup_entry_or_next(map, start, &entry)) { if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) { start = SUPERPAGE_ROUND_DOWN(start); } else { SAVE_HINT_MAP_WRITE(map, entry->vme_prev); break; } } if (entry->superpage_size) { end = SUPERPAGE_ROUND_UP(end); } /* * Step through all entries in this region */ for (vm_map_offset_t s = start; s < end;) { /* * At this point, we have deleted all the memory entries * in [start, s) and are proceeding with the [s, end) range. * * This loop might drop the map lock, and it is possible that * some memory was already reallocated within [start, s) * and we don't want to mess with those entries. * * Some of those entries could even have been re-assembled * with an entry after "s" (in vm_map_simplify_entry()), so * we may have to vm_map_clip_start() again. * * When clear_in_transition_end is set, we had marked * [start, clear_in_transition_end) as "in_transition" * during a previous iteration and we need to clear it. */ /* * Step 1: If needed (because we dropped locks), * look up the entry again. * * If we're coming back from unwiring (Step 5), * we also need to mark the entries as no longer * in transition after that. */ if (state & VMDS_NEEDS_LOOKUP) { state &= ~VMDS_NEEDS_LOOKUP; if (vm_map_lookup_entry_or_next(map, s, &entry)) { SAVE_HINT_MAP_WRITE(map, entry->vme_prev); } if (state & VMDS_KERNEL_KMEMPTR) { kmem_validate_slot(s, meta, size_idx, slot_idx); } } if (clear_in_transition_end) { for (vm_map_entry_t it = entry; it != vm_map_to_entry(map) && it->vme_start < clear_in_transition_end; it = it->vme_next) { assert(it->in_transition); it->in_transition = FALSE; if (it->needs_wakeup) { it->needs_wakeup = FALSE; state |= VMDS_NEEDS_WAKEUP; } } clear_in_transition_end = 0; } /* * Step 2: Perform various policy checks * before we do _anything_ to this entry. */ if (entry == vm_map_to_entry(map) || s < entry->vme_start) { if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) { /* * Either we found a gap already, * or we are tearing down a map, * keep going. */ } else if (state & VMDS_KERNEL_PMAP) { __vm_map_delete_gap_panic(map, s, start, end); } else if (s < end) { state |= VMDS_FOUND_GAP; gap_start = s; } if (entry == vm_map_to_entry(map) || end <= entry->vme_start) { break; } s = entry->vme_start; } if (state & VMDS_KERNEL_PMAP) { /* * In the kernel map and its submaps, * permanent entries never die, even * if VM_MAP_REMOVE_IMMUTABLE is passed.
*/ if (entry->vme_permanent) { __vm_map_delete_permanent_panic(map, start, end, entry); } if (flags & VM_MAP_REMOVE_GUESS_SIZE) { end = entry->vme_end; flags &= ~VM_MAP_REMOVE_GUESS_SIZE; } /* * In the kernel map and its submaps, * the removal of an atomic/guarded entry is strict. * * An atomic entry is processed only if it was * specifically targeted. * * We might have deleted non-atomic entries before * we reach this point, however... */ kmem_entry_validate_guard(map, entry, start, end - start, guard); } /* * Step 2.1: handle "permanent" and "submap" entries * *before* clipping to avoid triggering some unnecessary * un-nesting of the shared region. */ if (entry->vme_permanent && entry->is_sub_map) { // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__); /* * Un-mapping a "permanent" mapping of a user-space * submap is not allowed unless... */ if (flags & VM_MAP_REMOVE_IMMUTABLE) { /* * a. explicitly requested by the kernel caller. */ // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__); } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) && developer_mode_state()) { /* * b. we're in "developer" mode (for * breakpoints, dtrace probes, ...). */ // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__); } else if (map->terminated) { /* * c. this is the final address space cleanup. */ // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__); } else { vm_map_offset_t submap_start, submap_end; kern_return_t submap_kr; /* * Check if there are any "permanent" mappings * in this range in the submap. */ if (entry->in_transition) { /* can that even happen? */ goto in_transition; } /* compute the clipped range in the submap */ submap_start = s - entry->vme_start; submap_start += VME_OFFSET(entry); submap_end = end - entry->vme_start; submap_end += VME_OFFSET(entry); submap_kr = vm_map_delete_submap_recurse( VME_SUBMAP(entry), submap_start, submap_end); if (submap_kr != KERN_SUCCESS) { /* * There are some "permanent" mappings * in the submap: we are not allowed * to remove this range. */ printf("%d[%s] removing permanent submap entry " "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->protection, entry->max_protection); DTRACE_VM6(vm_map_delete_permanent_deny_submap, vm_map_entry_t, entry, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_prot_t, entry->protection, vm_prot_t, entry->max_protection, int, VME_ALIAS(entry)); ret.kmr_return = KERN_PROTECTION_FAILURE; goto out; } /* no permanent mappings: proceed */ } } /* * Step 3: Perform any clipping needed. * * After this, "entry" starts at "s", ends before "end" */ if (entry->vme_start < s) { if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && entry->map_aligned && !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) { /* * The entry will no longer be map-aligned * after clipping and the caller said it's OK. */ entry->map_aligned = FALSE; } vm_map_clip_start(map, entry, s); SAVE_HINT_MAP_WRITE(map, entry->vme_prev); } if (end < entry->vme_end) { if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && entry->map_aligned && !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) { /* * The entry will no longer be map-aligned * after clipping and the caller said it's OK.
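 *
 * (Worked example, illustrative: for a 16K task map on 4K
 * hardware, VM_MAP_PAGE_MASK(map) is 0x3fff; a caller passing
 * VM_MAP_REMOVE_NO_MAP_ALIGN may clip at a 4K boundary such as
 * 0x5000, which is not 16K-aligned, so the map_aligned flag must
 * be cleared before clipping.)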
*/ entry->map_aligned = FALSE; } vm_map_clip_end(map, entry, end); } if (entry->vme_permanent && entry->is_sub_map) { /* * We already went through step 2.1 which did not deny * the removal of this "permanent" and "is_sub_map" * entry. * Now that we've clipped what we actually want to * delete, undo the "permanent" part to allow the * removal to proceed. */ DTRACE_VM6(vm_map_delete_permanent_allow_submap, vm_map_entry_t, entry, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_prot_t, entry->protection, vm_prot_t, entry->max_protection, int, VME_ALIAS(entry)); entry->vme_permanent = false; } assert(s == entry->vme_start); assert(entry->vme_end <= end); /* * Step 4: If the entry is in flux, wait for this to resolve. */ if (entry->in_transition) { wait_result_t wait_result; in_transition: /* * Another thread is wiring/unwiring this entry. * Let the other thread know we are waiting. */ entry->needs_wakeup = TRUE; /* * wake up anybody waiting on entries that we have * already unwired/deleted. */ if (state & VMDS_NEEDS_WAKEUP) { vm_map_entry_wakeup(map); state &= ~VMDS_NEEDS_WAKEUP; } wait_result = vm_map_entry_wait(map, interruptible); if (interruptible && wait_result == THREAD_INTERRUPTED) { /* * We do not clear the needs_wakeup flag, * since we cannot tell if we were the only one. */ ret.kmr_return = KERN_ABORTED; return ret; } /* * The entry could have been clipped or it * may not exist anymore. Look it up again. */ state |= VMDS_NEEDS_LOOKUP; continue; } /* * Step 5: Handle wiring */ if (entry->wired_count) { struct vm_map_entry tmp_entry; boolean_t user_wire; unsigned int last_timestamp; user_wire = entry->user_wired_count > 0; /* * Remove a kernel wiring if requested */ if (flags & VM_MAP_REMOVE_KUNWIRE) { entry->wired_count--; vme_btref_consider_and_put(entry); } /* * Remove all user wirings for proper accounting */ while (entry->user_wired_count) { subtract_wire_counts(map, entry, user_wire); } /* * All our DMA I/O operations in IOKit are currently * done by wiring through the map entries of the task * requesting the I/O. * * Because of this, we must always wait for kernel wirings * to go away on the entries before deleting them. * * Any caller who wants to actually remove a kernel wiring * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to * properly remove one wiring instead of blasting through * them all. */ if (entry->wired_count != 0) { assert(map != kernel_map); /* * Cannot continue. Typical case is when * a user thread has physical io pending * on this page. Either wait for the * kernel wiring to go away or return an * error. */ wait_result_t wait_result; entry->needs_wakeup = TRUE; wait_result = vm_map_entry_wait(map, interruptible); if (interruptible && wait_result == THREAD_INTERRUPTED) { /* * We do not clear the * needs_wakeup flag, since we * cannot tell if we were the * only one. */ ret.kmr_return = KERN_ABORTED; return ret; } /* * The entry could have been clipped or * it may not exist anymore. Look it * up again. */ state |= VMDS_NEEDS_LOOKUP; continue; } /* * We can unlock the map now. * * The entry might be split once we unlock the map, * but we need the range as defined by this entry * to be stable. So we must make a local copy. * * The underlying objects do not change during clips, * and the in_transition state guarantees existence * of the entry.
*/ last_timestamp = map->timestamp; entry->in_transition = TRUE; tmp_entry = *entry; vm_map_unlock(map); if (tmp_entry.is_sub_map) { vm_map_t sub_map; vm_map_offset_t sub_start, sub_end; pmap_t pmap; vm_map_offset_t pmap_addr; sub_map = VME_SUBMAP(&tmp_entry); sub_start = VME_OFFSET(&tmp_entry); sub_end = sub_start + (tmp_entry.vme_end - tmp_entry.vme_start); if (tmp_entry.use_pmap) { pmap = sub_map->pmap; pmap_addr = tmp_entry.vme_start; } else { pmap = map->pmap; pmap_addr = tmp_entry.vme_start; } (void) vm_map_unwire_nested(sub_map, sub_start, sub_end, user_wire, pmap, pmap_addr); } else { vm_map_offset_t entry_end = tmp_entry.vme_end; vm_map_offset_t max_end; if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) { max_end = end - VM_MAP_PAGE_SIZE(map); if (entry_end > max_end) { entry_end = max_end; } } if (tmp_entry.vme_kernel_object) { pmap_protect_options( map->pmap, tmp_entry.vme_start, entry_end, VM_PROT_NONE, PMAP_OPTIONS_REMOVE, NULL); } vm_fault_unwire(map, &tmp_entry, tmp_entry.vme_kernel_object, map->pmap, tmp_entry.vme_start, entry_end); } vm_map_lock(map); /* * Unwiring happened, we can now go back to deleting * them (after we clear the in_transition bit for the range). */ if (last_timestamp + 1 != map->timestamp) { state |= VMDS_NEEDS_LOOKUP; } clear_in_transition_end = tmp_entry.vme_end; continue; } assert(entry->wired_count == 0); assert(entry->user_wired_count == 0); /* * Step 6: Entry is unwired and ready for us to delete ! */ if (!entry->vme_permanent) { /* * Typical case: the entry really shouldn't be permanent */ } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) && (entry->protection & VM_PROT_EXECUTE) && developer_mode_state()) { /* * Allow debuggers to undo executable mappings * when developer mode is on. */ #if 0 printf("FBDP %d[%s] removing permanent executable entry " "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n", proc_selfpid(), (current_task()->bsd_info ? proc_name_address(current_task()->bsd_info) : "?"), entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->protection, entry->max_protection); #endif entry->vme_permanent = FALSE; } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) { #if 0 printf("FBDP %d[%s] removing permanent entry " "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n", proc_selfpid(), (current_task()->bsd_info ? proc_name_address(current_task()->bsd_info) : "?"), entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->protection, entry->max_protection); #endif entry->vme_permanent = FALSE; #if CODE_SIGNING_MONITOR } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) { entry->vme_permanent = FALSE; printf("%d[%s] %s(0x%llx,0x%llx): " "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] " "prot 0x%x/0x%x\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? 
proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, (uint64_t)start, (uint64_t)end, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->protection, entry->max_protection); #endif } else { DTRACE_VM6(vm_map_delete_permanent, vm_map_entry_t, entry, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_prot_t, entry->protection, vm_prot_t, entry->max_protection, int, VME_ALIAS(entry)); } if (entry->is_sub_map) { assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map), "map %p (%d) entry %p submap %p (%d)\n", map, VM_MAP_PAGE_SHIFT(map), entry, VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry))); if (entry->use_pmap) { #ifndef NO_NESTED_PMAP int pmap_flags; if (map->terminated) { /* * This is the final cleanup of the * address space being terminated. * No new mappings are expected and * we don't really need to unnest the * shared region (and lose the "global" * pmap mappings, if applicable). * * Tell the pmap layer that we're * "clean" wrt nesting. */ pmap_flags = PMAP_UNNEST_CLEAN; } else { /* * We're unmapping part of the nested * shared region, so we can't keep the * nested pmap. */ pmap_flags = 0; } pmap_unnest_options( map->pmap, (addr64_t)entry->vme_start, entry->vme_end - entry->vme_start, pmap_flags); #endif /* NO_NESTED_PMAP */ if (map->mapped_in_other_pmaps && os_ref_get_count_raw(&map->map_refcnt) != 0) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, entry->vme_end, VME_SUBMAP(entry), VME_OFFSET(entry)); } } else { vm_map_submap_pmap_clean( map, entry->vme_start, entry->vme_end, VME_SUBMAP(entry), VME_OFFSET(entry)); } } else if (entry->vme_kernel_object || VME_OBJECT(entry) == compressor_object) { /* * nothing to do */ } else if (map->mapped_in_other_pmaps && os_ref_get_count_raw(&map->map_refcnt) != 0) { vm_object_pmap_protect_options( VME_OBJECT(entry), VME_OFFSET(entry), entry->vme_end - entry->vme_start, PMAP_NULL, PAGE_SIZE, entry->vme_start, VM_PROT_NONE, PMAP_OPTIONS_REMOVE); } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) || (state & VMDS_KERNEL_PMAP)) { /* Remove translations associated * with this range unless the entry * does not have an object, or * it's the kernel map or a descendant * since the platform could potentially * create "backdoor" mappings invisible * to the VM. It is expected that * objectless, non-kernel ranges * do not have such VM invisible * translations. */ vm_map_address_t remove_start = entry->vme_start; vm_map_address_t remove_end = entry->vme_end; #if MACH_ASSERT /* * Prevent panics in pmap_remove() from some vm test code * which uses virtual address ranges that pmap disallows. */ if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) { vm_map_clamp_to_pmap(map, &remove_start, &remove_end); } #endif /* MACH_ASSERT */ pmap_remove(map->pmap, remove_start, remove_end); } #if DEBUG /* * All pmap mappings for this map entry must have been * cleared by now. 
*/ assert(pmap_is_empty(map->pmap, entry->vme_start, entry->vme_end)); #endif /* DEBUG */ if (entry->iokit_acct) { /* alternate accounting */ DTRACE_VM4(vm_map_iokit_unmapped_region, vm_map_t, map, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, int, VME_ALIAS(entry)); vm_map_iokit_unmapped_region(map, (entry->vme_end - entry->vme_start)); entry->iokit_acct = FALSE; entry->use_pmap = FALSE; } /* move "s" forward */ s = entry->vme_end; next = entry->vme_next; if (!entry->map_aligned) { vm_map_offset_t rounded_s; /* * Skip artificial gap due to mis-aligned entry * on devices with a page size smaller than the * map's page size (i.e. 16k task on a 4k device). */ rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map)); if (next == vm_map_to_entry(map)) { s = rounded_s; } else if (s < rounded_s) { s = MIN(rounded_s, next->vme_start); } } ret.kmr_size += s - entry->vme_start; if (entry->vme_permanent) { /* * A permanent entry can not be removed, so leave it * in place but remove all access permissions. */ if (__improbable(vm_log_map_delete_permanent_prot_none)) { printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n", __FUNCTION__, __LINE__, proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->is_sub_map, entry->protection, entry->max_protection); } DTRACE_VM6(vm_map_delete_permanent_prot_none, vm_map_entry_t, entry, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_prot_t, entry->protection, vm_prot_t, entry->max_protection, int, VME_ALIAS(entry)); entry->protection = VM_PROT_NONE; entry->max_protection = VM_PROT_NONE; #ifdef __arm64e__ entry->used_for_tpro = FALSE; #endif } else { vm_map_entry_zap(map, entry, zap_list); } entry = next; next = VM_MAP_ENTRY_NULL; if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) { unsigned int last_timestamp = map->timestamp++; if (lck_rw_lock_yield_exclusive(&map->lock, LCK_RW_YIELD_ANY_WAITER)) { if (last_timestamp != map->timestamp + 1) { state |= VMDS_NEEDS_LOOKUP; } } else { /* we didn't yield, undo our change */ map->timestamp--; } } } if (map->wait_for_space) { thread_wakeup((event_t) map); } if (state & VMDS_NEEDS_WAKEUP) { vm_map_entry_wakeup(map); } out: if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) { __vm_map_delete_failed_panic(map, start, end, ret.kmr_return); } if (state & VMDS_KERNEL_KMEMPTR) { kmem_free_space(start, end, range_id, &slot); } if (state & VMDS_FOUND_GAP) { DTRACE_VM3(kern_vm_deallocate_gap, vm_map_offset_t, gap_start, vm_map_offset_t, save_start, vm_map_offset_t, save_end); if (flags & VM_MAP_REMOVE_GAPS_FAIL) { ret.kmr_return = KERN_INVALID_VALUE; } else { vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP); } } return ret; } kmem_return_t vm_map_remove_and_unlock( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vmr_flags_t flags, kmem_guard_t guard) { kmem_return_t ret; VM_MAP_ZAP_DECLARE(zap); ret = vm_map_delete(map, start, end, flags, guard, &zap); vm_map_unlock(map); vm_map_zap_dispose(&zap); return ret; } /* * vm_map_remove_guard: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ kmem_return_t vm_map_remove_guard( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vmr_flags_t flags, kmem_guard_t guard) { vm_map_lock(map); return vm_map_remove_and_unlock(map, start, end, flags, guard); } /* * vm_map_terminate: * * Clean out a task's map. 
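 *
 * Illustrative call sequence (a sketch, not from this file): task
 * teardown is expected to look roughly like
 *
 *	vm_map_terminate(map);	// mark terminated, empty the map
 *	vm_map_deallocate(map);	// drop the caller's map reference
 *
 * Setting map->terminated first is what lets vm_map_delete()
 * tolerate gaps and out-of-bounds special mappings (such as the
 * commpage) during the final cleanup.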
*/ kern_return_t vm_map_terminate( vm_map_t map) { vm_map_lock(map); map->terminated = TRUE; vm_map_disable_hole_optimization(map); (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset, VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE); return KERN_SUCCESS; } /* * Routine: vm_map_copy_allocate * * Description: * Allocates and initializes a map copy object. */ static vm_map_copy_t vm_map_copy_allocate(uint16_t type) { vm_map_copy_t new_copy; new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO); new_copy->type = type; if (type == VM_MAP_COPY_ENTRY_LIST) { new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; vm_map_store_init(&new_copy->cpy_hdr); } return new_copy; } /* * Routine: vm_map_copy_discard * * Description: * Dispose of a map copy object (returned by * vm_map_copyin). */ void vm_map_copy_discard( vm_map_copy_t copy) { if (copy == VM_MAP_COPY_NULL) { return; } /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy); switch (copy->type) { case VM_MAP_COPY_ENTRY_LIST: while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) { vm_map_entry_t entry = vm_map_copy_first_entry(copy); vm_map_copy_entry_unlink(copy, entry); if (entry->is_sub_map) { vm_map_deallocate(VME_SUBMAP(entry)); } else { vm_object_deallocate(VME_OBJECT(entry)); } vm_map_copy_entry_dispose(entry); } break; case VM_MAP_COPY_KERNEL_BUFFER: /* * The vm_map_copy_t and possibly the data buffer were * allocated by a single call to kalloc_data(), i.e. the * vm_map_copy_t was not allocated out of the zone. */ if (copy->size > msg_ool_size_small || copy->offset) { panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld", (long long)copy->size, (long long)copy->offset); } kfree_data(copy->cpy_kdata, copy->size); } zfree_id(ZONE_ID_VM_MAP_COPY, copy); } #if XNU_PLATFORM_MacOSX __exported extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy); /* * Routine: vm_map_copy_copy * * Description: * Move the information in a map copy object to * a new map copy object, leaving the old one * empty. * * This is used by kernel routines that need * to look at out-of-line data (in copyin form) * before deciding whether to return SUCCESS. * If the routine returns FAILURE, the original * copy object will be deallocated; therefore, * these routines must make a copy of the copy * object and leave the original empty so that * deallocation will not fail. */ vm_map_copy_t vm_map_copy_copy( vm_map_copy_t copy) { vm_map_copy_t new_copy; if (copy == VM_MAP_COPY_NULL) { return VM_MAP_COPY_NULL; } /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy); /* * Allocate a new copy object, and copy the information * from the old one into it. */ new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL); memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy)); #if __has_feature(ptrauth_calls) if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { new_copy->cpy_kdata = copy->cpy_kdata; } #endif if (copy->type == VM_MAP_COPY_ENTRY_LIST) { /* * The links in the entry chain must be * changed to point to the new copy object. */ vm_map_copy_first_entry(copy)->vme_prev = vm_map_copy_to_entry(new_copy); vm_map_copy_last_entry(copy)->vme_next = vm_map_copy_to_entry(new_copy); } /* * Change the old copy object into one that contains * nothing to be deallocated. */ bzero(copy, sizeof(struct vm_map_copy)); copy->type = VM_MAP_COPY_KERNEL_BUFFER; /* * Return the new object. 
*/ return new_copy; } #endif /* XNU_PLATFORM_MacOSX */ static boolean_t vm_map_entry_is_overwritable( vm_map_t dst_map __unused, vm_map_entry_t entry) { if (!(entry->protection & VM_PROT_WRITE)) { /* can't overwrite if not writable */ return FALSE; } #if !__x86_64__ if (entry->used_for_jit && vm_map_cs_enforcement(dst_map) && !dst_map->cs_debugged) { /* * Can't overwrite a JIT region while cs_enforced * and not cs_debugged. */ return FALSE; } #if __arm64e__ /* Do not allow overwriting HW-assisted TPRO entries */ if (entry->used_for_tpro) { return FALSE; } #endif /* __arm64e__ */ if (entry->vme_permanent) { if (entry->is_sub_map) { /* * We can't tell if the submap contains "permanent" * entries within the range targeted by the caller. * The caller will have to check for that with * vm_map_overwrite_submap_recurse() for example. */ } else { /* * Do not allow overwriting of a "permanent" * entry. */ DTRACE_VM6(vm_map_delete_permanent_deny_overwrite, vm_map_entry_t, entry, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_prot_t, entry->protection, vm_prot_t, entry->max_protection, int, VME_ALIAS(entry)); return FALSE; } } #endif /* !__x86_64__ */ if (entry->is_sub_map) { /* remember not to assume every entry has a VM object... */ } return TRUE; } static kern_return_t vm_map_overwrite_submap_recurse( vm_map_t dst_map, vm_map_offset_t dst_addr, vm_map_size_t dst_size) { vm_map_offset_t dst_end; vm_map_entry_t tmp_entry; vm_map_entry_t entry; kern_return_t result; boolean_t encountered_sub_map = FALSE; /* * Verify that the destination is all writeable * initially. We have to trunc the destination * address and round the copy size or we'll end up * splitting entries in strange ways. */ dst_end = vm_map_round_page(dst_addr + dst_size, VM_MAP_PAGE_MASK(dst_map)); vm_map_lock(dst_map); start_pass_1: if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } vm_map_clip_start(dst_map, tmp_entry, vm_map_trunc_page(dst_addr, VM_MAP_PAGE_MASK(dst_map))); if (tmp_entry->is_sub_map) { /* clipping did unnest if needed */ assert(!tmp_entry->use_pmap); } for (entry = tmp_entry;;) { vm_map_entry_t next; next = entry->vme_next; while (entry->is_sub_map) { vm_map_offset_t sub_start; vm_map_offset_t sub_end; vm_map_offset_t local_end; if (entry->in_transition) { /* * Say that we are waiting, and wait for entry. */ entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); goto start_pass_1; } encountered_sub_map = TRUE; sub_start = VME_OFFSET(entry); if (entry->vme_end < dst_end) { sub_end = entry->vme_end; } else { sub_end = dst_end; } sub_end -= entry->vme_start; sub_end += VME_OFFSET(entry); local_end = entry->vme_end; vm_map_unlock(dst_map); result = vm_map_overwrite_submap_recurse( VME_SUBMAP(entry), sub_start, sub_end - sub_start); if (result != KERN_SUCCESS) { return result; } if (dst_end <= entry->vme_end) { return KERN_SUCCESS; } vm_map_lock(dst_map); if (!vm_map_lookup_entry(dst_map, local_end, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } entry = tmp_entry; next = entry->vme_next; } assert(!entry->is_sub_map); if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock(dst_map); return KERN_PROTECTION_FAILURE; } if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock(dst_map); return KERN_PROTECTION_FAILURE; } /* * If the entry is in transition, we must wait * for it to exit that state. Anything could happen * when we unlock the map, so start over.
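 *
 * (Aside, not in the original: this wait protocol pairs three
 * pieces of state used all over this file --
 *
 *	entry->in_transition	set by the thread mutating the entry,
 *	entry->needs_wakeup	set by any thread deciding to wait,
 *	vm_map_entry_wait()	sleeps and drops the map lock,
 *
 * and the mutating thread calls vm_map_entry_wakeup() once it
 * clears in_transition.  Because the lock is dropped while
 * sleeping, every waiter restarts from a fresh lookup, as done
 * here with start_pass_1.)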
*/ if (entry->in_transition) { /* * Say that we are waiting, and wait for entry. */ entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); goto start_pass_1; } /* * our range is contained completely within this map entry */ if (dst_end <= entry->vme_end) { vm_map_unlock(dst_map); return KERN_SUCCESS; } /* * check that range specified is contiguous region */ if ((next == vm_map_to_entry(dst_map)) || (next->vme_start != entry->vme_end)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } /* * Check for permanent objects in the destination. */ assert(!entry->is_sub_map); if ((VME_OBJECT(entry) != VM_OBJECT_NULL) && ((!VME_OBJECT(entry)->internal) || (VME_OBJECT(entry)->true_share))) { if (encountered_sub_map) { vm_map_unlock(dst_map); return KERN_FAILURE; } } entry = next; }/* for */ vm_map_unlock(dst_map); return KERN_SUCCESS; } /* * Routine: vm_map_copy_overwrite * * Description: * Copy the memory described by the map copy * object (copy; returned by vm_map_copyin) onto * the specified destination region (dst_map, dst_addr). * The destination must be writeable. * * Unlike vm_map_copyout, this routine actually * writes over previously-mapped memory. If the * previous mapping was to a permanent (user-supplied) * memory object, it is preserved. * * The attributes (protection and inheritance) of the * destination region are preserved. * * If successful, consumes the copy object. * Otherwise, the caller is responsible for it. * * Implementation notes: * To overwrite aligned temporary virtual memory, it is * sufficient to remove the previous mapping and insert * the new copy. This replacement is done either on * the whole region (if no permanent virtual memory * objects are embedded in the destination region) or * in individual map entries. * * To overwrite permanent virtual memory, it is necessary * to copy each page, as the external memory management * interface currently does not provide any optimizations. * * Unaligned memory also has to be copied. It is possible * to use 'vm_trickery' to copy the aligned data. This is * not done but not hard to implement. * * Once a page of permanent memory has been overwritten, * it is impossible to interrupt this function; otherwise, * the call would be neither atomic nor location-independent. * The kernel-state portion of a user thread must be * interruptible. * * It may be expensive to forward all requests that might * overwrite permanent memory (vm_write, vm_copy) to * uninterruptible kernel threads. This routine may be * called by interruptible threads; however, success is * not guaranteed -- if the request cannot be performed * atomically and interruptibly, an error indication is * returned. * * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged. */ static kern_return_t vm_map_copy_overwrite_nested( vm_map_t dst_map, vm_map_address_t dst_addr, vm_map_copy_t copy, boolean_t interruptible, pmap_t pmap, boolean_t discard_on_success) { vm_map_offset_t dst_end; vm_map_entry_t tmp_entry; vm_map_entry_t entry; kern_return_t kr; boolean_t aligned = TRUE; boolean_t contains_permanent_objects = FALSE; boolean_t encountered_sub_map = FALSE; vm_map_offset_t base_addr; vm_map_size_t copy_size; vm_map_size_t total_size; uint16_t copy_page_shift; /* * Check for special kernel buffer allocated * by new_ipc_kmsg_copyin.
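 *
 * (Background, illustrative: small out-of-line message payloads --
 * up to msg_ool_size_small -- travel in a kalloc_data() buffer
 * attached to the vm_map_copy_t (VM_MAP_COPY_KERNEL_BUFFER)
 * instead of a chain of map entries, so they take a plain
 * copyout-style path here rather than the entry-list logic below.)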
*/ if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { kr = vm_map_copyout_kernel_buffer( dst_map, &dst_addr, copy, copy->size, TRUE, discard_on_success); return kr; } /* * Only works for entry lists at the moment. Will * support page lists later. */ assert(copy->type == VM_MAP_COPY_ENTRY_LIST); if (copy->size == 0) { if (discard_on_success) { vm_map_copy_discard(copy); } return KERN_SUCCESS; } copy_page_shift = copy->cpy_hdr.page_shift; /* * Verify that the destination is all writeable * initially. We have to trunc the destination * address and round the copy size or we'll end up * splitting entries in strange ways. */ if (!VM_MAP_PAGE_ALIGNED(copy->size, VM_MAP_PAGE_MASK(dst_map)) || !VM_MAP_PAGE_ALIGNED(copy->offset, VM_MAP_PAGE_MASK(dst_map)) || !VM_MAP_PAGE_ALIGNED(dst_addr, VM_MAP_PAGE_MASK(dst_map)) || copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) { aligned = FALSE; dst_end = vm_map_round_page(dst_addr + copy->size, VM_MAP_PAGE_MASK(dst_map)); } else { dst_end = dst_addr + copy->size; } vm_map_lock(dst_map); /* LP64todo - remove this check when vm_map_commpage64() * no longer has to stuff in a map_entry for the commpage * above the map's max_offset. */ if (dst_addr >= dst_map->max_offset) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } start_pass_1: if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } vm_map_clip_start(dst_map, tmp_entry, vm_map_trunc_page(dst_addr, VM_MAP_PAGE_MASK(dst_map))); for (entry = tmp_entry;;) { vm_map_entry_t next = entry->vme_next; while (entry->is_sub_map) { vm_map_offset_t sub_start; vm_map_offset_t sub_end; vm_map_offset_t local_end; if (entry->in_transition) { /* * Say that we are waiting, and wait for entry. */ entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); goto start_pass_1; } local_end = entry->vme_end; if (!(entry->needs_copy)) { /* if needs_copy we are a COW submap */ /* in such a case we just replace so */ /* there is no need for the follow- */ /* ing check. */ encountered_sub_map = TRUE; sub_start = VME_OFFSET(entry); if (entry->vme_end < dst_end) { sub_end = entry->vme_end; } else { sub_end = dst_end; } sub_end -= entry->vme_start; sub_end += VME_OFFSET(entry); vm_map_unlock(dst_map); kr = vm_map_overwrite_submap_recurse( VME_SUBMAP(entry), sub_start, sub_end - sub_start); if (kr != KERN_SUCCESS) { return kr; } vm_map_lock(dst_map); } if (dst_end <= entry->vme_end) { goto start_overwrite; } if (!vm_map_lookup_entry(dst_map, local_end, &entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } next = entry->vme_next; } assert(!entry->is_sub_map); if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock(dst_map); return KERN_PROTECTION_FAILURE; } if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock(dst_map); return KERN_PROTECTION_FAILURE; } /* * If the entry is in transition, we must wait * for it to exit that state. Anything could happen * when we unlock the map, so start over. */ if (entry->in_transition) { /* * Say that we are waiting, and wait for entry. */ entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); goto start_pass_1; } /* * our range is contained completely within this map entry */ if (dst_end <= entry->vme_end) { break; } /* * check that range specified is contiguous region */ if ((next == vm_map_to_entry(dst_map)) || (next->vme_start != entry->vme_end)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } /* * Check for permanent objects in the destination. 
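 *
 * ("Permanent" here means backed by a non-internal (pager-supplied)
 * or truly shared VM object: such pages cannot be replaced by
 * simply swapping map entries and must be physically copied, which
 * is why finding one later forces the non-interruptible path.)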
*/ assert(!entry->is_sub_map); if ((VME_OBJECT(entry) != VM_OBJECT_NULL) && ((!VME_OBJECT(entry)->internal) || (VME_OBJECT(entry)->true_share))) { contains_permanent_objects = TRUE; } entry = next; }/* for */ start_overwrite: /* * If there are permanent objects in the destination, then * the copy cannot be interrupted. */ if (interruptible && contains_permanent_objects) { vm_map_unlock(dst_map); return KERN_FAILURE; /* XXX */ } /* * * Make a second pass, overwriting the data. * At the beginning of each loop iteration, * the next entry to be overwritten is "tmp_entry" * (initially, the value returned from the lookup above), * and the starting address expected in that entry * is "start". */ total_size = copy->size; if (encountered_sub_map) { copy_size = 0; /* re-calculate tmp_entry since we've had the map */ /* unlocked */ if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } } else { copy_size = copy->size; } base_addr = dst_addr; while (TRUE) { /* deconstruct the copy object and do in parts */ /* only in sub_map, interruptible case */ vm_map_entry_t copy_entry; vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL; vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL; int nentries; int remaining_entries = 0; vm_map_offset_t new_offset = 0; for (entry = tmp_entry; copy_size == 0;) { vm_map_entry_t next; next = entry->vme_next; /* tmp_entry and base address are moved along */ /* each time we encounter a sub-map. Otherwise */ /* entry can outpace tmp_entry, and the copy_size */ /* may reflect the distance between them */ /* if the current entry is found to be in transition */ /* we will start over at the beginning or the last */ /* encounter of a submap as dictated by base_addr */ /* we will zero copy_size accordingly. */ if (entry->in_transition) { /* * Say that we are waiting, and wait for entry.
*/ entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); if (!vm_map_lookup_entry(dst_map, base_addr, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } copy_size = 0; entry = tmp_entry; continue; } if (entry->is_sub_map) { vm_map_offset_t sub_start; vm_map_offset_t sub_end; vm_map_offset_t local_end; if (entry->needs_copy) { /* if this is a COW submap */ /* just back the range with an */ /* anonymous entry */ assert(!entry->vme_permanent); if (entry->vme_end < dst_end) { sub_end = entry->vme_end; } else { sub_end = dst_end; } if (entry->vme_start < base_addr) { sub_start = base_addr; } else { sub_start = entry->vme_start; } vm_map_clip_end( dst_map, entry, sub_end); vm_map_clip_start( dst_map, entry, sub_start); assert(!entry->use_pmap); assert(!entry->iokit_acct); entry->use_pmap = TRUE; vm_map_deallocate(VME_SUBMAP(entry)); assert(!entry->vme_permanent); VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0); VME_OFFSET_SET(entry, 0); entry->is_shared = FALSE; entry->needs_copy = FALSE; entry->protection = VM_PROT_DEFAULT; entry->max_protection = VM_PROT_ALL; entry->wired_count = 0; entry->user_wired_count = 0; if (entry->inheritance == VM_INHERIT_SHARE) { entry->inheritance = VM_INHERIT_COPY; } continue; } /* first take care of any non-sub_map */ /* entries to send */ if (base_addr < entry->vme_start) { /* stuff to send */ copy_size = entry->vme_start - base_addr; break; } sub_start = VME_OFFSET(entry); if (entry->vme_end < dst_end) { sub_end = entry->vme_end; } else { sub_end = dst_end; } sub_end -= entry->vme_start; sub_end += VME_OFFSET(entry); local_end = entry->vme_end; vm_map_unlock(dst_map); copy_size = sub_end - sub_start; /* adjust the copy object */ if (total_size > copy_size) { vm_map_size_t local_size = 0; vm_map_size_t entry_size; nentries = 1; new_offset = copy->offset; copy_entry = vm_map_copy_first_entry(copy); while (copy_entry != vm_map_copy_to_entry(copy)) { entry_size = copy_entry->vme_end - copy_entry->vme_start; if ((local_size < copy_size) && ((local_size + entry_size) >= copy_size)) { vm_map_copy_clip_end(copy, copy_entry, copy_entry->vme_start + (copy_size - local_size)); entry_size = copy_entry->vme_end - copy_entry->vme_start; local_size += entry_size; new_offset += entry_size; } if (local_size >= copy_size) { next_copy = copy_entry->vme_next; copy_entry->vme_next = vm_map_copy_to_entry(copy); previous_prev = copy->cpy_hdr.links.prev; copy->cpy_hdr.links.prev = copy_entry; copy->size = copy_size; remaining_entries = copy->cpy_hdr.nentries; remaining_entries -= nentries; copy->cpy_hdr.nentries = nentries; break; } else { local_size += entry_size; new_offset += entry_size; nentries++; } copy_entry = copy_entry->vme_next; } } if ((entry->use_pmap) && (pmap == NULL)) { kr = vm_map_copy_overwrite_nested( VME_SUBMAP(entry), sub_start, copy, interruptible, VME_SUBMAP(entry)->pmap, TRUE); } else if (pmap != NULL) { kr = vm_map_copy_overwrite_nested( VME_SUBMAP(entry), sub_start, copy, interruptible, pmap, TRUE); } else { kr = vm_map_copy_overwrite_nested( VME_SUBMAP(entry), sub_start, copy, interruptible, dst_map->pmap, TRUE); } if (kr != KERN_SUCCESS) { if (next_copy != NULL) { copy->cpy_hdr.nentries += remaining_entries; copy->cpy_hdr.links.prev->vme_next = next_copy; copy->cpy_hdr.links.prev = previous_prev; copy->size = total_size; } return kr; } if (dst_end <= local_end) { return KERN_SUCCESS; } /* otherwise copy no longer exists, it was */ /* destroyed after successful copy_overwrite */ copy =
vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST); copy->offset = new_offset; copy->cpy_hdr.page_shift = copy_page_shift; total_size -= copy_size; copy_size = 0; /* put back remainder of copy in container */ if (next_copy != NULL) { copy->cpy_hdr.nentries = remaining_entries; copy->cpy_hdr.links.next = next_copy; copy->cpy_hdr.links.prev = previous_prev; copy->size = total_size; next_copy->vme_prev = vm_map_copy_to_entry(copy); next_copy = NULL; } base_addr = local_end; vm_map_lock(dst_map); if (!vm_map_lookup_entry(dst_map, local_end, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } entry = tmp_entry; continue; } assert(!entry->is_sub_map); if (dst_end <= entry->vme_end) { copy_size = dst_end - base_addr; break; } if ((next == vm_map_to_entry(dst_map)) || (next->vme_start != entry->vme_end)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } entry = next; }/* for */ next_copy = NULL; nentries = 1; /* adjust the copy object */ if (total_size > copy_size) { vm_map_size_t local_size = 0; vm_map_size_t entry_size; new_offset = copy->offset; copy_entry = vm_map_copy_first_entry(copy); while (copy_entry != vm_map_copy_to_entry(copy)) { entry_size = copy_entry->vme_end - copy_entry->vme_start; if ((local_size < copy_size) && ((local_size + entry_size) >= copy_size)) { vm_map_copy_clip_end(copy, copy_entry, copy_entry->vme_start + (copy_size - local_size)); entry_size = copy_entry->vme_end - copy_entry->vme_start; local_size += entry_size; new_offset += entry_size; } if (local_size >= copy_size) { next_copy = copy_entry->vme_next; copy_entry->vme_next = vm_map_copy_to_entry(copy); previous_prev = copy->cpy_hdr.links.prev; copy->cpy_hdr.links.prev = copy_entry; copy->size = copy_size; remaining_entries = copy->cpy_hdr.nentries; remaining_entries -= nentries; copy->cpy_hdr.nentries = nentries; break; } else { local_size += entry_size; new_offset += entry_size; nentries++; } copy_entry = copy_entry->vme_next; } } if (aligned) { pmap_t local_pmap; if (pmap) { local_pmap = pmap; } else { local_pmap = dst_map->pmap; } if ((kr = vm_map_copy_overwrite_aligned( dst_map, tmp_entry, copy, base_addr, local_pmap)) != KERN_SUCCESS) { if (next_copy != NULL) { copy->cpy_hdr.nentries += remaining_entries; copy->cpy_hdr.links.prev->vme_next = next_copy; copy->cpy_hdr.links.prev = previous_prev; copy->size += copy_size; } return kr; } vm_map_unlock(dst_map); } else { /* * Performance gain: * * if the copy and dst address are misaligned but the same * offset within the page we can copy_not_aligned the * misaligned parts and copy aligned the rest. If they are * aligned but len is unaligned we simply need to copy * the end bit unaligned. We'll need to split the misaligned * bits of the region in this case!
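 *
 * A hypothetical worked case (16K map pages, so the page mask is
 * 0x3fff): copy->offset = 0x2f00 and dst_addr = 0x10002f00 share the
 * same offset within the page (0x2f00), so the first
 * 0x4000 - 0x2f00 = 0x1100 bytes can be copied unaligned up to the
 * page boundary at 0x10004000, the whole pages that follow can go
 * through the aligned (virtual) path, and any trailing partial page
 * is again copied unaligned.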
*/ /* ALWAYS UNLOCKS THE dst_map MAP */ kr = vm_map_copy_overwrite_unaligned( dst_map, tmp_entry, copy, base_addr, discard_on_success); if (kr != KERN_SUCCESS) { if (next_copy != NULL) { copy->cpy_hdr.nentries += remaining_entries; copy->cpy_hdr.links.prev->vme_next = next_copy; copy->cpy_hdr.links.prev = previous_prev; copy->size += copy_size; } return kr; } } total_size -= copy_size; if (total_size == 0) { break; } base_addr += copy_size; copy_size = 0; copy->offset = new_offset; if (next_copy != NULL) { copy->cpy_hdr.nentries = remaining_entries; copy->cpy_hdr.links.next = next_copy; copy->cpy_hdr.links.prev = previous_prev; next_copy->vme_prev = vm_map_copy_to_entry(copy); copy->size = total_size; } vm_map_lock(dst_map); while (TRUE) { if (!vm_map_lookup_entry(dst_map, base_addr, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } if (tmp_entry->in_transition) { entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); } else { break; } } vm_map_clip_start(dst_map, tmp_entry, vm_map_trunc_page(base_addr, VM_MAP_PAGE_MASK(dst_map))); entry = tmp_entry; } /* while */ /* * Throw away the vm_map_copy object */ if (discard_on_success) { vm_map_copy_discard(copy); } return KERN_SUCCESS; }/* vm_map_copy_overwrite */ static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_copy_addr_size_sanitize( vm_map_t map, vm_map_offset_ut addr_u, vm_map_size_ut size_u, vm_sanitize_caller_t vm_sanitize_caller, vm_map_offset_t *addr, vm_map_offset_t *end, vm_map_size_t *size) { vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES; return vm_sanitize_addr_size(addr_u, size_u, vm_sanitize_caller, map, flags, addr, end, size); } kern_return_t vm_map_copy_overwrite( vm_map_t dst_map, vm_map_offset_ut dst_addr_u, vm_map_copy_t copy, vm_map_size_ut copy_size_u, boolean_t interruptible) { vm_map_offset_t dst_addr, dst_end; vm_map_size_t copy_size; vm_map_size_t head_size, tail_size; vm_map_copy_t head_copy, tail_copy; vm_map_offset_t head_addr, tail_addr; vm_map_entry_t entry; kern_return_t kr; vm_map_offset_t effective_page_mask, effective_page_size; uint16_t copy_page_shift; head_size = 0; tail_size = 0; head_copy = NULL; tail_copy = NULL; head_addr = 0; tail_addr = 0; /* * Check for null copy object. */ if (copy == VM_MAP_COPY_NULL) { return KERN_SUCCESS; } /* * Sanitize any input parameters that are addr/size/prot/inherit */ kr = vm_map_copy_addr_size_sanitize( dst_map, dst_addr_u, copy_size_u, VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE, &dst_addr, &dst_end, &copy_size); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy); if (interruptible || copy->type != VM_MAP_COPY_ENTRY_LIST) { /* * We can't split the "copy" map if we're interruptible * or if we don't have a "copy" map...
*/ blunt_copy: kr = vm_map_copy_overwrite_nested(dst_map, dst_addr, copy, interruptible, (pmap_t) NULL, TRUE); if (kr) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */); } return kr; } copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy); if (copy_page_shift < PAGE_SHIFT || VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) { goto blunt_copy; } if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) { effective_page_mask = VM_MAP_PAGE_MASK(dst_map); } else { effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK); effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy), effective_page_mask); } effective_page_size = effective_page_mask + 1; if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) { /* * Too small to bother with optimizing... */ goto blunt_copy; } if ((dst_addr & effective_page_mask) != (copy->offset & effective_page_mask)) { /* * Incompatible mis-alignment of source and destination... */ goto blunt_copy; } /* * Proper alignment or identical mis-alignment at the beginning. * Let's try and do a small unaligned copy first (if needed) * and then an aligned copy for the rest. */ if (!vm_map_page_aligned(dst_addr, effective_page_mask)) { head_addr = dst_addr; head_size = (effective_page_size - (copy->offset & effective_page_mask)); head_size = MIN(head_size, copy_size); } if (!vm_map_page_aligned(copy->offset + copy_size, effective_page_mask)) { /* * Mis-alignment at the end. * Do an aligned copy up to the last page and * then an unaligned copy for the remaining bytes. */ tail_size = ((copy->offset + copy_size) & effective_page_mask); tail_size = MIN(tail_size, copy_size); tail_addr = dst_addr + copy_size - tail_size; assert(tail_addr >= head_addr + head_size); } assert(head_size + tail_size <= copy_size); if (head_size + tail_size == copy_size) { /* * It's all unaligned, no optimization possible... */ goto blunt_copy; } /* * Can't optimize if there are any submaps in the * destination due to the way we free the "copy" map * progressively in vm_map_copy_overwrite_nested() * in that case. */ vm_map_lock_read(dst_map); if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) { vm_map_unlock_read(dst_map); goto blunt_copy; } for (; (entry != vm_map_to_entry(dst_map) && entry->vme_start < dst_addr + copy_size); entry = entry->vme_next) { if (entry->is_sub_map) { vm_map_unlock_read(dst_map); goto blunt_copy; } } vm_map_unlock_read(dst_map); if (head_size) { /* * Unaligned copy of the first "head_size" bytes, to reach * a page boundary. */ /* * Extract "head_copy" out of "copy". */ head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST); head_copy->cpy_hdr.entries_pageable = copy->cpy_hdr.entries_pageable; head_copy->cpy_hdr.page_shift = copy_page_shift; entry = vm_map_copy_first_entry(copy); if (entry->vme_end < copy->offset + head_size) { head_size = entry->vme_end - copy->offset; } head_copy->offset = copy->offset; head_copy->size = head_size; copy->offset += head_size; copy->size -= head_size; copy_size -= head_size; assert(copy_size > 0); vm_map_copy_clip_end(copy, entry, copy->offset); vm_map_copy_entry_unlink(copy, entry); vm_map_copy_entry_link(head_copy, vm_map_copy_to_entry(head_copy), entry); /* * Do the unaligned copy. 
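 *
 * For illustration (hypothetical values, 4K effective page size):
 * with copy->offset = 0x1f00 and copy_size = 0x3200, the split
 * computed above is
 *
 *	head_size = 0x1000 - (0x1f00 & 0xfff) = 0x100
 *	tail_size = (0x1f00 + 0x3200) & 0xfff = 0x100
 *
 * leaving the middle 0x3000 bytes for the aligned path.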
*/ kr = vm_map_copy_overwrite_nested(dst_map, head_addr, head_copy, interruptible, (pmap_t) NULL, FALSE); if (kr != KERN_SUCCESS) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */); goto done; } } if (tail_size) { /* * Extract "tail_copy" out of "copy". */ tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST); tail_copy->cpy_hdr.entries_pageable = copy->cpy_hdr.entries_pageable; tail_copy->cpy_hdr.page_shift = copy_page_shift; tail_copy->offset = copy->offset + copy_size - tail_size; tail_copy->size = tail_size; copy->size -= tail_size; copy_size -= tail_size; assert(copy_size > 0); entry = vm_map_copy_last_entry(copy); vm_map_copy_clip_start(copy, entry, tail_copy->offset); entry = vm_map_copy_last_entry(copy); vm_map_copy_entry_unlink(copy, entry); vm_map_copy_entry_link(tail_copy, vm_map_copy_last_entry(tail_copy), entry); } /* * If we are here from ipc_kmsg_copyout_ool_descriptor(), * we want to avoid TOCTOU issues w.r.t. copy->size but * we don't need to change vm_map_copy_overwrite_nested() * and all other vm_map_copy_overwrite variants. * * So we assign the original copy_size that was passed into * this routine back to copy. * * This use of local 'copy_size' passed into this routine is * to try and protect against TOCTOU attacks where the kernel * has been exploited. We don't expect this to be an issue * during normal system operation. */ assertf(copy->size == copy_size, "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size); copy->size = copy_size; /* * Copy most (or possibly all) of the data. */ kr = vm_map_copy_overwrite_nested(dst_map, dst_addr + head_size, copy, interruptible, (pmap_t) NULL, FALSE); if (kr != KERN_SUCCESS) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */); goto done; } if (tail_size) { kr = vm_map_copy_overwrite_nested(dst_map, tail_addr, tail_copy, interruptible, (pmap_t) NULL, FALSE); if (kr) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */); } } done: assert(copy->type == VM_MAP_COPY_ENTRY_LIST); if (kr == KERN_SUCCESS) { /* * Discard all the copy maps. */ if (head_copy) { vm_map_copy_discard(head_copy); head_copy = NULL; } vm_map_copy_discard(copy); if (tail_copy) { vm_map_copy_discard(tail_copy); tail_copy = NULL; } } else { /* * Re-assemble the original copy map. */ if (head_copy) { entry = vm_map_copy_first_entry(head_copy); vm_map_copy_entry_unlink(head_copy, entry); vm_map_copy_entry_link(copy, vm_map_copy_to_entry(copy), entry); copy->offset -= head_size; copy->size += head_size; vm_map_copy_discard(head_copy); head_copy = NULL; } if (tail_copy) { entry = vm_map_copy_last_entry(tail_copy); vm_map_copy_entry_unlink(tail_copy, entry); vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), entry); copy->size += tail_size; vm_map_copy_discard(tail_copy); tail_copy = NULL; } } return kr; } /* * Routine: vm_map_copy_overwrite_unaligned [internal use only] * * Description: * Physically copy unaligned data * * Implementation: * Unaligned parts of pages have to be physically copied. We use * a modified form of vm_fault_copy (which understands non-aligned * page offsets and sizes) to do the copy.
We attempt to copy as * much memory in one go as possible; however, vm_fault_copy copies * within 1 memory object so we have to find the smaller of "amount left", * "source object data size", and "target object data size". With * unaligned data we don't need to split regions, therefore the source * (copy) object should be one map entry, the target range may be split * over multiple map entries, however. In any event we are pessimistic * about these assumptions. * * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged. * * Assumptions: * dst_map is locked on entry and is returned locked on success, * unlocked on error. */ static kern_return_t vm_map_copy_overwrite_unaligned( vm_map_t dst_map, vm_map_entry_t entry, vm_map_copy_t copy, vm_map_offset_t start, boolean_t discard_on_success) { vm_map_entry_t copy_entry; vm_map_entry_t copy_entry_next; vm_map_version_t version; vm_object_t dst_object; vm_object_offset_t dst_offset; vm_object_offset_t src_offset; vm_object_offset_t entry_offset; vm_map_offset_t entry_end; vm_map_size_t src_size, dst_size, copy_size, amount_left; kern_return_t kr = KERN_SUCCESS; copy_entry = vm_map_copy_first_entry(copy); vm_map_lock_write_to_read(dst_map); src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy)); amount_left = copy->size; /* * unaligned so we never clipped this entry, we need the offset into * the vm_object not just the data. */ while (amount_left > 0) { if (entry == vm_map_to_entry(dst_map)) { vm_map_unlock_read(dst_map); return KERN_INVALID_ADDRESS; } /* "start" must be within the current map entry */ assert((start >= entry->vme_start) && (start < entry->vme_end)); /* * Check protection again */ if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock_read(dst_map); return KERN_PROTECTION_FAILURE; } if (entry->is_sub_map) { /* not implemented... */ vm_map_unlock_read(dst_map); return KERN_INVALID_ARGUMENT; } if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock_read(dst_map); return KERN_PROTECTION_FAILURE; } /* * If the entry is in transition, we must wait * for it to exit that state. Anything could happen * when we unlock the map, so start over. */ if (entry->in_transition) { /* * Say that we are waiting, and wait for entry. */ entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); goto RetryLookup; } dst_offset = start - entry->vme_start; dst_size = entry->vme_end - start; src_size = copy_entry->vme_end - (copy_entry->vme_start + src_offset); if (dst_size < src_size) { /* * we can only copy dst_size bytes before * we have to get the next destination entry */ copy_size = dst_size; } else { /* * we can only copy src_size bytes before * we have to get the next source copy entry */ copy_size = src_size; } if (copy_size > amount_left) { copy_size = amount_left; } /* * Entry needs copy, create a shadow object for the * copy-on-write region. */ assert(!entry->is_sub_map); if (entry->needs_copy) { if (vm_map_lock_read_to_write(dst_map)) { vm_map_lock_read(dst_map); goto RetryLookup; } VME_OBJECT_SHADOW(entry, (vm_map_size_t)(entry->vme_end - entry->vme_start), vm_map_always_shadow(dst_map)); entry->needs_copy = FALSE; vm_map_lock_write_to_read(dst_map); } dst_object = VME_OBJECT(entry); /* * unlike with the virtual (aligned) copy we're going * to fault on it, therefore we need a target object.
*/ if (dst_object == VM_OBJECT_NULL) { if (vm_map_lock_read_to_write(dst_map)) { vm_map_lock_read(dst_map); goto RetryLookup; } dst_object = vm_object_allocate((vm_map_size_t) entry->vme_end - entry->vme_start); VME_OBJECT_SET(entry, dst_object, false, 0); VME_OFFSET_SET(entry, 0); assert(entry->use_pmap); vm_map_lock_write_to_read(dst_map); } /* * Take an object reference and unlock map. The "entry" may * disappear or change when the map is unlocked. */ vm_object_reference(dst_object); version.main_timestamp = dst_map->timestamp; entry_offset = VME_OFFSET(entry); entry_end = entry->vme_end; vm_map_unlock_read(dst_map); /* * Copy as much as possible in one pass */ kr = vm_fault_copy( VME_OBJECT(copy_entry), VME_OFFSET(copy_entry) + src_offset, &copy_size, dst_object, entry_offset + dst_offset, dst_map, &version, THREAD_UNINT ); start += copy_size; src_offset += copy_size; amount_left -= copy_size; /* * Release the object reference */ vm_object_deallocate(dst_object); /* * If a hard error occurred, return it now */ if (kr != KERN_SUCCESS) { return kr; } if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end || amount_left == 0) { /* * all done with this copy entry, dispose. */ copy_entry_next = copy_entry->vme_next; if (discard_on_success) { vm_map_copy_entry_unlink(copy, copy_entry); assert(!copy_entry->is_sub_map); vm_object_deallocate(VME_OBJECT(copy_entry)); vm_map_copy_entry_dispose(copy_entry); } if (copy_entry_next == vm_map_copy_to_entry(copy) && amount_left) { /* * not finished copying but run out of source */ return KERN_INVALID_ADDRESS; } copy_entry = copy_entry_next; src_offset = 0; } if (amount_left == 0) { return KERN_SUCCESS; } vm_map_lock_read(dst_map); if (version.main_timestamp == dst_map->timestamp) { if (start == entry_end) { /* * destination region is split. Use the version * information to avoid a lookup in the normal * case. */ entry = entry->vme_next; /* * should be contiguous. Fail if we encounter * a hole in the destination. */ if (start != entry->vme_start) { vm_map_unlock_read(dst_map); return KERN_INVALID_ADDRESS; } } } else { /* * Map version check failed. * we must lookup the entry because somebody * might have changed the map behind our backs. */ RetryLookup: if (!vm_map_lookup_entry(dst_map, start, &entry)) { vm_map_unlock_read(dst_map); return KERN_INVALID_ADDRESS; } } }/* while */ return KERN_SUCCESS; }/* vm_map_copy_overwrite_unaligned */ /* * Routine: vm_map_copy_overwrite_aligned [internal use only] * * Description: * Does all the vm_trickery possible for whole pages. * * Implementation: * * If there are no permanent objects in the destination, * and the source and destination map entry zones match, * and the destination map entry is not shared, * then the map entries can be deleted and replaced * with those from the copy. The following code is the * basic idea of what to do, but there are lots of annoying * little details about getting protection and inheritance * right. Should add protection, inheritance, and sharing checks * to the above pass and make sure that no wiring is involved. * * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged.
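 *
 * A condensed sketch (illustrative only; the permanence, sharing and
 * accounting checks below are omitted) of the entry-substitution
 * fast path: the destination entry's backing object reference is
 * swapped for the copy entry's object instead of copying pages:
 *
 *	old_object = VME_OBJECT(entry);
 *	pmap_remove_options(dst_map->pmap, entry->vme_start,
 *	    entry->vme_end, PMAP_OPTIONS_REMOVE);
 *	VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
 *	VME_OFFSET_SET(entry, VME_OFFSET(copy_entry));
 *	vm_object_deallocate(old_object);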
*/ int vm_map_copy_overwrite_aligned_src_not_internal = 0; int vm_map_copy_overwrite_aligned_src_not_symmetric = 0; int vm_map_copy_overwrite_aligned_src_large = 0; static kern_return_t vm_map_copy_overwrite_aligned( vm_map_t dst_map, vm_map_entry_t tmp_entry, vm_map_copy_t copy, vm_map_offset_t start, __unused pmap_t pmap) { vm_object_t object; vm_map_entry_t copy_entry; vm_map_size_t copy_size; vm_map_size_t size; vm_map_entry_t entry; while ((copy_entry = vm_map_copy_first_entry(copy)) != vm_map_copy_to_entry(copy)) { copy_size = (copy_entry->vme_end - copy_entry->vme_start); entry = tmp_entry; if (entry->is_sub_map) { /* unnested when clipped earlier */ assert(!entry->use_pmap); } if (entry == vm_map_to_entry(dst_map)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } size = (entry->vme_end - entry->vme_start); /* * Make sure that no holes popped up in the * address map, and that the protection is * still valid, in case the map was unlocked * earlier. */ if ((entry->vme_start != start) || ((entry->is_sub_map) && !entry->needs_copy)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } assert(entry != vm_map_to_entry(dst_map)); /* * Check protection again */ if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock(dst_map); return KERN_PROTECTION_FAILURE; } if (entry->is_sub_map) { /* not properly implemented */ vm_map_unlock(dst_map); return KERN_PROTECTION_FAILURE; } if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock(dst_map); return KERN_PROTECTION_FAILURE; } /* * If the entry is in transition, we must wait * for it to exit that state. Anything could happen * when we unlock the map, so start over. */ if (entry->in_transition) { /* * Say that we are waiting, and wait for entry. */ entry->needs_wakeup = TRUE; vm_map_entry_wait(dst_map, THREAD_UNINT); goto RetryLookup; } /* * Adjust to source size first */ if (copy_size < size) { if (entry->map_aligned && !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size, VM_MAP_PAGE_MASK(dst_map))) { /* no longer map-aligned */ entry->map_aligned = FALSE; } vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size); size = copy_size; } /* * Adjust to destination size */ if (size < copy_size) { vm_map_copy_clip_end(copy, copy_entry, copy_entry->vme_start + size); copy_size = size; } assert((entry->vme_end - entry->vme_start) == size); assert((tmp_entry->vme_end - tmp_entry->vme_start) == size); assert((copy_entry->vme_end - copy_entry->vme_start) == size); /* * If the destination contains temporary unshared memory, * we can perform the copy by throwing it away and * installing the source data. * * Exceptions for mappings with special semantics: * + "permanent" entries, * + JIT regions, * + TPRO regions, * + pmap-specific protection policies, * + VM objects with COPY_NONE copy strategy. 
*/ object = VME_OBJECT(entry); if ((!entry->is_shared && !entry->vme_permanent && !entry->used_for_jit && #if __arm64e__ !entry->used_for_tpro && #endif /* __arm64e__ */ !(entry->protection & VM_PROT_EXECUTE) && !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) && ((object == VM_OBJECT_NULL) || (object->internal && !object->true_share && object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) || entry->needs_copy) { vm_object_t old_object = VME_OBJECT(entry); vm_object_offset_t old_offset = VME_OFFSET(entry); vm_object_offset_t offset; assert(!entry->is_sub_map); /* * Ensure that the source and destination aren't * identical */ if (old_object == VME_OBJECT(copy_entry) && old_offset == VME_OFFSET(copy_entry)) { vm_map_copy_entry_unlink(copy, copy_entry); vm_map_copy_entry_dispose(copy_entry); if (old_object != VM_OBJECT_NULL) { vm_object_deallocate(old_object); } start = tmp_entry->vme_end; tmp_entry = tmp_entry->vme_next; continue; } #if XNU_TARGET_OS_OSX #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */ #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */ if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL && VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE && copy_size <= __TRADEOFF1_COPY_SIZE) { /* * Virtual vs. Physical copy tradeoff #1. * * Copying only a few pages out of a large * object: do a physical copy instead of * a virtual copy, to avoid possibly keeping * the entire large object alive because of * those few copy-on-write pages. */ vm_map_copy_overwrite_aligned_src_large++; goto slow_copy; } #endif /* XNU_TARGET_OS_OSX */ if ((dst_map->pmap != kernel_pmap) && (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) && (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) { vm_object_t new_object, new_shadow; /* * We're about to map something over a mapping * established by malloc()... */ new_object = VME_OBJECT(copy_entry); if (new_object != VM_OBJECT_NULL) { vm_object_lock_shared(new_object); } while (new_object != VM_OBJECT_NULL && #if XNU_TARGET_OS_OSX !new_object->true_share && new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && #endif /* XNU_TARGET_OS_OSX */ new_object->internal) { new_shadow = new_object->shadow; if (new_shadow == VM_OBJECT_NULL) { break; } vm_object_lock_shared(new_shadow); vm_object_unlock(new_object); new_object = new_shadow; } if (new_object != VM_OBJECT_NULL) { if (!new_object->internal) { /* * The new mapping is backed * by an external object. We * don't want malloc'ed memory * to be replaced with such a * non-anonymous mapping, so * let's go off the optimized * path... */ vm_map_copy_overwrite_aligned_src_not_internal++; vm_object_unlock(new_object); goto slow_copy; } #if XNU_TARGET_OS_OSX if (new_object->true_share || new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { /* * Same if there's a "true_share" * object in the shadow chain, or * an object with a non-default * (SYMMETRIC) copy strategy. */ vm_map_copy_overwrite_aligned_src_not_symmetric++; vm_object_unlock(new_object); goto slow_copy; } #endif /* XNU_TARGET_OS_OSX */ vm_object_unlock(new_object); } /* * The new mapping is still backed by * anonymous (internal) memory, so it's * OK to substitute it for the original * malloc() mapping. 
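 *
 * In essence (simplified sketch of the lock-coupled walk above,
 * ignoring the macOS-only true_share/copy-strategy checks):
 *
 *	obj = VME_OBJECT(copy_entry);
 *	while (obj != VM_OBJECT_NULL && obj->internal &&
 *	    obj->shadow != VM_OBJECT_NULL)
 *		obj = obj->shadow;
 *	if (obj != VM_OBJECT_NULL && !obj->internal)
 *		goto slow_copy;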
*/ } if (old_object != VM_OBJECT_NULL) { assert(!entry->vme_permanent); if (entry->is_sub_map) { if (entry->use_pmap) { #ifndef NO_NESTED_PMAP pmap_unnest(dst_map->pmap, (addr64_t)entry->vme_start, entry->vme_end - entry->vme_start); #endif /* NO_NESTED_PMAP */ if (dst_map->mapped_in_other_pmaps) { /* clean up parent */ /* map/maps */ vm_map_submap_pmap_clean( dst_map, entry->vme_start, entry->vme_end, VME_SUBMAP(entry), VME_OFFSET(entry)); } } else { vm_map_submap_pmap_clean( dst_map, entry->vme_start, entry->vme_end, VME_SUBMAP(entry), VME_OFFSET(entry)); } vm_map_deallocate(VME_SUBMAP(entry)); } else { if (dst_map->mapped_in_other_pmaps) { vm_object_pmap_protect_options( VME_OBJECT(entry), VME_OFFSET(entry), entry->vme_end - entry->vme_start, PMAP_NULL, PAGE_SIZE, entry->vme_start, VM_PROT_NONE, PMAP_OPTIONS_REMOVE); } else { pmap_remove_options( dst_map->pmap, (addr64_t)(entry->vme_start), (addr64_t)(entry->vme_end), PMAP_OPTIONS_REMOVE); } vm_object_deallocate(old_object); } } if (entry->iokit_acct) { /* keep using iokit accounting */ entry->use_pmap = FALSE; } else { /* use pmap accounting */ entry->use_pmap = TRUE; } assert(!entry->vme_permanent); VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0); object = VME_OBJECT(entry); entry->needs_copy = copy_entry->needs_copy; entry->wired_count = 0; entry->user_wired_count = 0; offset = VME_OFFSET(copy_entry); VME_OFFSET_SET(entry, offset); vm_map_copy_entry_unlink(copy, copy_entry); vm_map_copy_entry_dispose(copy_entry); /* * we could try to push pages into the pmap at this point, BUT * this optimization only saved on average 2 us per page if ALL * the pages in the source were currently mapped * and ALL the pages in the dest were touched; if fewer than 2/3 * of the pages were touched, this optimization actually cost more * cycles. It also puts a lot of pressure on the pmap layer w.r.t. * mapping structures. */ /* * Set up for the next iteration. The map * has not been unlocked, so the next * address should be at the end of this * entry, and the next map entry should be * the one following it. */ start = tmp_entry->vme_end; tmp_entry = tmp_entry->vme_next; } else { vm_map_version_t version; vm_object_t dst_object; vm_object_offset_t dst_offset; kern_return_t r; slow_copy: if (entry->needs_copy) { VME_OBJECT_SHADOW(entry, (entry->vme_end - entry->vme_start), vm_map_always_shadow(dst_map)); entry->needs_copy = FALSE; } dst_object = VME_OBJECT(entry); dst_offset = VME_OFFSET(entry); /* * Take an object reference, and record * the map version information so that the * map can be safely unlocked. */ if (dst_object == VM_OBJECT_NULL) { /* * We would usually have just taken the * optimized path above if the destination * object has not been allocated yet. But we * now disable that optimization if the copy * entry's object is not backed by anonymous * memory to avoid replacing malloc'ed * (i.e. re-usable) anonymous memory with a * not-so-anonymous mapping. * So we have to handle this case here and * allocate a new VM object for this map entry.
*/ dst_object = vm_object_allocate( entry->vme_end - entry->vme_start); dst_offset = 0; VME_OBJECT_SET(entry, dst_object, false, 0); VME_OFFSET_SET(entry, dst_offset); assert(entry->use_pmap); } vm_object_reference(dst_object); /* account for unlock bumping up timestamp */ version.main_timestamp = dst_map->timestamp + 1; vm_map_unlock(dst_map); /* * Copy as much as possible in one pass */ copy_size = size; r = vm_fault_copy( VME_OBJECT(copy_entry), VME_OFFSET(copy_entry), &copy_size, dst_object, dst_offset, dst_map, &version, THREAD_UNINT ); /* * Release the object reference */ vm_object_deallocate(dst_object); /* * If a hard error occurred, return it now */ if (r != KERN_SUCCESS) { return r; } if (copy_size != 0) { /* * Dispose of the copied region */ vm_map_copy_clip_end(copy, copy_entry, copy_entry->vme_start + copy_size); vm_map_copy_entry_unlink(copy, copy_entry); vm_object_deallocate(VME_OBJECT(copy_entry)); vm_map_copy_entry_dispose(copy_entry); } /* * Pick up in the destination map where we left off. * * Use the version information to avoid a lookup * in the normal case. */ start += copy_size; vm_map_lock(dst_map); if (version.main_timestamp == dst_map->timestamp && copy_size != 0) { /* We can safely use saved tmp_entry value */ if (tmp_entry->map_aligned && !VM_MAP_PAGE_ALIGNED( start, VM_MAP_PAGE_MASK(dst_map))) { /* no longer map-aligned */ tmp_entry->map_aligned = FALSE; } vm_map_clip_end(dst_map, tmp_entry, start); tmp_entry = tmp_entry->vme_next; } else { /* Must do lookup of tmp_entry */ RetryLookup: if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) { vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } if (tmp_entry->map_aligned && !VM_MAP_PAGE_ALIGNED( start, VM_MAP_PAGE_MASK(dst_map))) { /* no longer map-aligned */ tmp_entry->map_aligned = FALSE; } vm_map_clip_start(dst_map, tmp_entry, start); } } }/* while */ return KERN_SUCCESS; }/* vm_map_copy_overwrite_aligned */ /* * Routine: vm_map_copyin_kernel_buffer [internal use only] * * Description: * Copy in data to a kernel buffer from space in the * source map. The original space may be optionally * deallocated. * * If successful, returns a new copy object. */ static kern_return_t vm_map_copyin_kernel_buffer( vm_map_t src_map, vm_map_offset_t src_addr, vm_map_size_t len, boolean_t src_destroy, vm_map_copy_t *copy_result) { kern_return_t kr; vm_map_copy_t copy; void *kdata; if (len > msg_ool_size_small) { return KERN_INVALID_ARGUMENT; } kdata = kalloc_data(len, Z_WAITOK); if (kdata == NULL) { return KERN_RESOURCE_SHORTAGE; } kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len); if (kr != KERN_SUCCESS) { kfree_data(kdata, len); return kr; } copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER); copy->cpy_kdata = kdata; copy->size = len; copy->offset = 0; if (src_destroy) { vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE; if (src_map == kernel_map) { flags |= VM_MAP_REMOVE_KUNWIRE; } (void)vm_map_remove_guard(src_map, vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)), vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)), flags, KMEM_GUARD_NONE); } *copy_result = copy; return KERN_SUCCESS; } /* * Routine: vm_map_copyout_kernel_buffer [internal use only] * * Description: * Copy out data from a kernel buffer into space in the * destination map. The space may be optionally dynamically * allocated. * * If successful, consumes the copy object. * Otherwise, the caller is responsible for it.
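 *
 * A minimal usage sketch (hypothetical caller within this file,
 * error handling elided) of the small-copy round trip formed by
 * vm_map_copyin_kernel_buffer() and this routine:
 *
 *	vm_map_copy_t copy;
 *	vm_map_address_t out_addr = 0;
 *
 *	if (vm_map_copyin_kernel_buffer(src_map, src_addr, len,
 *	    FALSE, &copy) == KERN_SUCCESS) {
 *		kr = vm_map_copyout_kernel_buffer(dst_map, &out_addr,
 *		    copy, copy->size, FALSE, TRUE);
 *	}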
* * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged. */ static int vm_map_copyout_kernel_buffer_failures = 0; static kern_return_t vm_map_copyout_kernel_buffer( vm_map_t map, vm_map_address_t *addr, /* IN/OUT */ vm_map_copy_t copy, vm_map_size_t copy_size, boolean_t overwrite, boolean_t consume_on_success) { kern_return_t kr = KERN_SUCCESS; thread_t thread = current_thread(); assert(copy->size == copy_size); /* * check for corrupted vm_map_copy structure */ if (copy_size > msg_ool_size_small || copy->offset) { panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld", (long long)copy->size, (long long)copy->offset); } if (!overwrite) { /* * Allocate space in the target map for the data */ vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE(); if (map == kernel_map) { vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA; } *addr = 0; kr = vm_map_enter(map, addr, vm_map_round_page(copy_size, VM_MAP_PAGE_MASK(map)), (vm_map_offset_t) 0, vmk_flags, VM_OBJECT_NULL, (vm_object_offset_t) 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { return kr; } #if KASAN if (map->pmap == kernel_pmap) { kasan_notify_address(*addr, copy->size); } #endif } /* * Copyout the data from the kernel buffer to the target map. */ if (thread->map == map) { /* * If the target map is the current map, just do * the copy. */ assert((vm_size_t)copy_size == copy_size); if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) { kr = KERN_INVALID_ADDRESS; } } else { vm_map_t oldmap; /* * If the target map is another map, assume the * target's address space identity for the duration * of the copy. */ vm_map_reference(map); oldmap = vm_map_switch(map); assert((vm_size_t)copy_size == copy_size); if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) { vm_map_copyout_kernel_buffer_failures++; kr = KERN_INVALID_ADDRESS; } (void) vm_map_switch(oldmap); vm_map_deallocate(map); } if (kr != KERN_SUCCESS) { /* the copy failed, clean up */ if (!overwrite) { /* * Deallocate the space we allocated in the target map. */ (void) vm_map_remove(map, vm_map_trunc_page(*addr, VM_MAP_PAGE_MASK(map)), vm_map_round_page((*addr + vm_map_round_page(copy_size, VM_MAP_PAGE_MASK(map))), VM_MAP_PAGE_MASK(map))); *addr = 0; } } else { /* copy was successful, discard the copy structure */ if (consume_on_success) { kfree_data(copy->cpy_kdata, copy_size); zfree_id(ZONE_ID_VM_MAP_COPY, copy); } } return kr; } /* * Routine: vm_map_copy_insert [internal use only] * * Description: * Link a copy chain ("copy") into a map at the * specified location (after "where"). * * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged. * Side effects: * The copy chain is destroyed. */ static void vm_map_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy) { vm_map_entry_t entry; while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) { entry = vm_map_copy_first_entry(copy); vm_map_copy_entry_unlink(copy, entry); vm_map_store_entry_link(map, after_where, entry, VM_MAP_KERNEL_FLAGS_NONE); after_where = entry; } zfree_id(ZONE_ID_VM_MAP_COPY, copy); } /* * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged.
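 *
 * For illustration (hypothetical numbers): a copy entry spanning
 * [0x1000, 0x3000) remapped with adjustment = 0x7f0000000000 yields
 * a new entry spanning [0x7f0000001000, 0x7f0000003000); the
 * "adjustment" below is simply added to both vme_start and vme_end.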
*/ void vm_map_copy_remap( vm_map_t map, vm_map_entry_t where, vm_map_copy_t copy, vm_map_offset_t adjustment, vm_prot_t cur_prot, vm_prot_t max_prot, vm_inherit_t inheritance) { vm_map_entry_t copy_entry, new_entry; for (copy_entry = vm_map_copy_first_entry(copy); copy_entry != vm_map_copy_to_entry(copy); copy_entry = copy_entry->vme_next) { /* get a new VM map entry for the map */ new_entry = vm_map_entry_create(map); /* copy the "copy entry" to the new entry */ vm_map_entry_copy(map, new_entry, copy_entry); /* adjust "start" and "end" */ new_entry->vme_start += adjustment; new_entry->vme_end += adjustment; /* clear some attributes */ new_entry->inheritance = inheritance; new_entry->protection = cur_prot; new_entry->max_protection = max_prot; new_entry->behavior = VM_BEHAVIOR_DEFAULT; /* take an extra reference on the entry's "object" */ if (new_entry->is_sub_map) { assert(!new_entry->use_pmap); /* not nested */ vm_map_reference(VME_SUBMAP(new_entry)); } else { vm_object_reference(VME_OBJECT(new_entry)); } /* insert the new entry in the map */ vm_map_store_entry_link(map, where, new_entry, VM_MAP_KERNEL_FLAGS_NONE); /* continue inserting the "copy entries" after the new entry */ where = new_entry; } } /* * Returns true if *size matches (or is in the range of) copy->size. * Upon returning true, the *size field is updated with the actual size of the * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types) */ boolean_t vm_map_copy_validate_size( vm_map_t dst_map, vm_map_copy_t copy, vm_map_size_t *size) { if (copy == VM_MAP_COPY_NULL) { return FALSE; } /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy); vm_map_size_t copy_sz = copy->size; vm_map_size_t sz = *size; switch (copy->type) { case VM_MAP_COPY_KERNEL_BUFFER: if (sz == copy_sz) { return TRUE; } break; case VM_MAP_COPY_ENTRY_LIST: /* * potential page-size rounding prevents us from exactly * validating this flavor of vm_map_copy, but we can at least * assert that it's within a range. */ if (copy_sz >= sz && copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) { *size = copy_sz; return TRUE; } break; default: break; } return FALSE; } static kern_return_t vm_map_copyout_internal( vm_map_t dst_map, vm_map_address_t *dst_addr, /* OUT */ vm_map_copy_t copy, vm_map_size_ut copy_size_u, boolean_t consume_on_success, vm_prot_t cur_protection, vm_prot_t max_protection, vm_inherit_t inheritance) { vm_map_size_t size, copy_size; vm_map_size_t adjustment; vm_map_offset_t start; vm_object_offset_t vm_copy_start; vm_map_entry_t last; vm_map_entry_t entry; vm_map_copy_t original_copy; kern_return_t kr; vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE(); /* * Check for null copy object. */ if (copy == VM_MAP_COPY_NULL) { *dst_addr = 0; return KERN_SUCCESS; } /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy); if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) { *dst_addr = 0; ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */); return KERN_FAILURE; } copy_size = copy->size; /* * Check for special kernel buffer allocated * by new_ipc_kmsg_copyin. 
*/ if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr, copy, copy_size, FALSE, consume_on_success); if (kr) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */); } return kr; } original_copy = copy; if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) { vm_map_copy_t target_copy; vm_map_offset_t overmap_start, overmap_end, trimmed_start; target_copy = VM_MAP_COPY_NULL; DEBUG4K_ADJUST("adjusting...\n"); kr = vm_map_copy_adjust_to_target( copy, 0, /* offset */ copy->size, /* size */ dst_map, TRUE, /* copy */ &target_copy, &overmap_start, &overmap_end, &trimmed_start); if (kr != KERN_SUCCESS) { DEBUG4K_COPY("adjust failed 0x%x\n", kr); ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */); return kr; } DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start); if (target_copy != copy) { copy = target_copy; } copy_size = copy->size; } /* * Find space for the data */ vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset, VM_MAP_COPY_PAGE_MASK(copy)); size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size, VM_MAP_COPY_PAGE_MASK(copy)) - vm_copy_start; vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size); vm_map_lock(dst_map); kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags, &start, &last); if (kr != KERN_SUCCESS) { vm_map_unlock(dst_map); ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */); return kr; } adjustment = start - vm_copy_start; if (!consume_on_success) { /* * We're not allowed to consume "copy", so we'll have to * copy its map entries into the destination map below. * No need to re-allocate map entries from the correct * (pageable or not) zone, since we'll get new map entries * during the transfer. * We'll also adjust the map entries' "start" and "end" * during the transfer, to keep "copy"'s entries consistent * with its "offset". */ goto after_adjustments; } /* * Since we're going to just drop the map * entries from the copy into the destination * map, they must come from the same pool. */ if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) { /* * Mismatches occur when dealing with the default * pager. */ vm_map_entry_t next, new; /* * Find the zone that the copies were allocated from */ entry = vm_map_copy_first_entry(copy); /* * Reinitialize the copy so that vm_map_copy_entry_link * will work. */ vm_map_store_copy_reset(copy, entry); copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable; /* * Copy each entry.
*/ while (entry != vm_map_copy_to_entry(copy)) { new = vm_map_copy_entry_create(copy); vm_map_entry_copy_full(new, entry); new->vme_no_copy_on_read = FALSE; assert(!new->iokit_acct); if (new->is_sub_map) { /* clr address space specifics */ new->use_pmap = FALSE; } vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), new); next = entry->vme_next; vm_map_entry_dispose(entry); entry = next; } } /* * Adjust the addresses in the copy chain, and * reset the region attributes. */ for (entry = vm_map_copy_first_entry(copy); entry != vm_map_copy_to_entry(copy); entry = entry->vme_next) { if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) { /* * We're injecting this copy entry into a map that * has the standard page alignment, so clear * "map_aligned" (which might have been inherited * from the original map entry). */ entry->map_aligned = FALSE; } entry->vme_start += adjustment; entry->vme_end += adjustment; if (entry->map_aligned) { assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, VM_MAP_PAGE_MASK(dst_map))); assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, VM_MAP_PAGE_MASK(dst_map))); } entry->inheritance = VM_INHERIT_DEFAULT; entry->protection = VM_PROT_DEFAULT; entry->max_protection = VM_PROT_ALL; entry->behavior = VM_BEHAVIOR_DEFAULT; /* * If the entry is now wired, * map the pages into the destination map. */ if (entry->wired_count != 0) { vm_map_offset_t va; vm_object_offset_t offset; vm_object_t object; vm_prot_t prot; int type_of_fault; uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE; /* TODO4K would need to use actual page size */ assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT); object = VME_OBJECT(entry); offset = VME_OFFSET(entry); va = entry->vme_start; pmap_pageable(dst_map->pmap, entry->vme_start, entry->vme_end, TRUE); while (va < entry->vme_end) { vm_page_t m; struct vm_object_fault_info fault_info = {}; /* * Look up the page in the object. * Assert that the page will be found in the * top object: * either * the object was newly created by * vm_object_copy_slowly, and has * copies of all of the pages from * the source object * or * the object was moved from the old * map entry; because the old map * entry was wired, all of the pages * were in the top-level object. * (XXX not true if we wire pages for * reading) */ vm_object_lock(object); m = vm_page_lookup(object, offset); if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) || m->vmp_absent) { panic("vm_map_copyout: wiring %p", m); } prot = entry->protection; if (override_nx(dst_map, VME_ALIAS(entry)) && prot) { prot |= VM_PROT_EXECUTE; } type_of_fault = DBG_CACHE_HIT_FAULT; fault_info.user_tag = VME_ALIAS(entry); fault_info.pmap_options = 0; if (entry->iokit_acct || (!entry->is_sub_map && !entry->use_pmap)) { fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; } if (entry->vme_xnu_user_debug && !VM_PAGE_OBJECT(m)->code_signed) { /* * Modified code-signed executable * region: this page does not belong * to a code-signed VM object, so it * must have been copied and should * therefore be typed XNU_USER_DEBUG * rather than XNU_USER_EXEC. */ fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG; } vm_fault_enter(m, dst_map->pmap, va, PAGE_SIZE, 0, prot, prot, VM_PAGE_WIRED(m), FALSE, /* change_wiring */ VM_KERN_MEMORY_NONE, /* tag - not wiring */ &fault_info, NULL, /* need_retry */ &type_of_fault, &object_lock_type); /*Exclusive mode lock. 
Will remain unchanged.*/ vm_object_unlock(object); offset += PAGE_SIZE_64; va += PAGE_SIZE; } } } after_adjustments: /* * Correct the page alignment for the result */ *dst_addr = start + (copy->offset - vm_copy_start); #if KASAN kasan_notify_address(*dst_addr, size); #endif /* * Update the hints and the map size */ if (consume_on_success) { SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy)); } else { SAVE_HINT_MAP_WRITE(dst_map, last); } dst_map->size += size; /* * Link in the copy */ if (consume_on_success) { vm_map_copy_insert(dst_map, last, copy); if (copy != original_copy) { vm_map_copy_discard(original_copy); original_copy = VM_MAP_COPY_NULL; } } else { vm_map_copy_remap(dst_map, last, copy, adjustment, cur_protection, max_protection, inheritance); if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) { vm_map_copy_discard(copy); copy = original_copy; } } vm_map_unlock(dst_map); /* * XXX If wiring_required, call vm_map_pageable */ return KERN_SUCCESS; } /* * Routine: vm_map_copyout_size * * Description: * Copy out a copy chain ("copy") into newly-allocated * space in the destination map. Uses a prevalidated * size for the copy object (vm_map_copy_validate_size). * * If successful, consumes the copy object. * Otherwise, the caller is responsible for it. */ kern_return_t vm_map_copyout_size( vm_map_t dst_map, vm_map_address_t *dst_addr, /* OUT */ vm_map_copy_t copy, vm_map_size_ut copy_size) { return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size, TRUE, /* consume_on_success */ VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); } /* * Routine: vm_map_copyout * * Description: * Copy out a copy chain ("copy") into newly-allocated * space in the destination map. * * If successful, consumes the copy object. * Otherwise, the caller is responsible for it. */ kern_return_t vm_map_copyout( vm_map_t dst_map, vm_map_address_t *dst_addr, /* OUT */ vm_map_copy_t copy) { return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0, TRUE, /* consume_on_success */ VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); } /* * Routine: vm_map_copyin * * Description: * see vm_map_copyin_common. Exported via Unsupported.exports. * */ kern_return_t vm_map_copyin( vm_map_t src_map, vm_map_address_ut src_addr, vm_map_size_ut len, boolean_t src_destroy, vm_map_copy_t *copy_result) /* OUT */ { return vm_map_copyin_common(src_map, src_addr, len, src_destroy, FALSE, copy_result, FALSE); } /* * Routine: vm_map_copyin_common * * Description: * Copy the specified region (src_addr, len) from the * source address space (src_map), possibly removing * the region from the source address space (src_destroy). * * Returns: * A vm_map_copy_t object (copy_result), suitable for * insertion into another address space (using vm_map_copyout), * copying over another address space region (using * vm_map_copy_overwrite). If the copy is unused, it * should be destroyed (using vm_map_copy_discard). * * In/out conditions: * The source map should not be locked on entry. 
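 *
 * A minimal usage sketch (hypothetical maps and addresses, error
 * handling elided) of the lifecycle described above:
 *
 *	vm_map_copy_t copy;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_copyout(dst_map, &dst_addr, copy);
 *		if (kr != KERN_SUCCESS) {
 *			vm_map_copy_discard(copy);
 *		}
 *	}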
*/ typedef struct submap_map { vm_map_t parent_map; vm_map_offset_t base_start; vm_map_offset_t base_end; vm_map_size_t base_len; struct submap_map *next; } submap_map_t; kern_return_t vm_map_copyin_common( vm_map_t src_map, vm_map_address_ut src_addr, vm_map_size_ut len, boolean_t src_destroy, __unused boolean_t src_volatile, vm_map_copy_t *copy_result, /* OUT */ boolean_t use_maxprot) { int flags; flags = 0; if (src_destroy) { flags |= VM_MAP_COPYIN_SRC_DESTROY; } if (use_maxprot) { flags |= VM_MAP_COPYIN_USE_MAXPROT; } return vm_map_copyin_internal(src_map, src_addr, len, flags, copy_result); } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_copyin_sanitize( vm_map_t src_map, vm_map_address_ut src_addr_u, vm_map_size_ut len_u, vm_map_offset_t *src_start, vm_map_offset_t *src_end, vm_map_size_t *len, vm_map_offset_t *src_addr_unaligned) { kern_return_t kr; vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES; if (src_map->pmap == kernel_pmap) { flags |= VM_SANITIZE_FLAGS_CANONICALIZE; } kr = vm_sanitize_addr_size(src_addr_u, len_u, VM_SANITIZE_CALLER_VM_MAP_COPYIN, src_map, flags, src_start, src_end, len); if (__improbable(kr != KERN_SUCCESS)) { return kr; } /* * Compute (page aligned) start and end of region */ *src_addr_unaligned = *src_start; /* remember unaligned value */ *src_start = vm_map_trunc_page(*src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)); *src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map)); return KERN_SUCCESS; } kern_return_t vm_map_copyin_internal( vm_map_t src_map, vm_map_address_ut src_addr_u, vm_map_size_ut len_u, int flags, vm_map_copy_t *copy_result) /* OUT */ { vm_map_entry_t tmp_entry; /* Result of last map lookup -- * in multi-level lookup, this * entry contains the actual * vm_object/offset. */ vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */ vm_map_offset_t src_start; /* Start of current entry -- * where copy is taking place now */ vm_map_offset_t src_end; /* End of entire region to be * copied */ vm_map_offset_t src_addr_unaligned; vm_map_offset_t src_base; vm_map_size_t len; vm_map_t base_map = src_map; boolean_t map_share = FALSE; submap_map_t *parent_maps = NULL; vm_map_copy_t copy; /* Resulting copy */ vm_map_address_t copy_addr; vm_map_size_t copy_size; boolean_t src_destroy; boolean_t use_maxprot; boolean_t preserve_purgeable; boolean_t entry_was_shared; vm_map_entry_t saved_src_entry; kern_return_t kr; if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) { return KERN_INVALID_ARGUMENT; } /* * Check for copies of zero bytes. */ if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) { *copy_result = VM_MAP_COPY_NULL; return KERN_SUCCESS; } /* * Sanitize any input parameters that are addr/size/prot/inherit */ kr = vm_map_copyin_sanitize( src_map, src_addr_u, len_u, &src_start, &src_end, &len, &src_addr_unaligned); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE; use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE; preserve_purgeable = (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE; /* * If the copy is sufficiently small, use a kernel buffer instead * of making a virtual copy. The theory being that the cost of * setting up VM (and taking C-O-W faults) dominates the copy costs * for small regions. 
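 *
 * Concretely (the threshold value is illustrative): with
 * msg_ool_size_small at, say, 32K, a 1K copy is satisfied by a
 * simple copyinmap() into a kernel buffer, while a 1M copy builds
 * a copy-on-write entry list instead.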
*/ if ((len <= msg_ool_size_small) && !use_maxprot && !preserve_purgeable && !(flags & VM_MAP_COPYIN_ENTRY_LIST) && /* * Since the "msg_ool_size_small" threshold was increased and * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the * address space limits, we revert to doing a virtual copy if the * copied range goes beyond those limits. Otherwise, mach_vm_read() * of the commpage would now fail when it used to work. */ (src_start >= vm_map_min(src_map) && src_start < vm_map_max(src_map) && src_end >= vm_map_min(src_map) && src_end < vm_map_max(src_map))) { return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len, src_destroy, copy_result); } /* * Allocate a header element for the list. * * Use the start and end in the header to * remember the endpoints prior to rounding. */ copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST); copy->cpy_hdr.entries_pageable = TRUE; copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map); copy->offset = src_addr_unaligned; copy->size = len; new_entry = vm_map_copy_entry_create(copy); #define RETURN(x) \ MACRO_BEGIN \ vm_map_unlock(src_map); \ if(src_map != base_map) \ vm_map_deallocate(src_map); \ if (new_entry != VM_MAP_ENTRY_NULL) \ vm_map_copy_entry_dispose(new_entry); \ vm_map_copy_discard(copy); \ { \ submap_map_t *_ptr; \ \ for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \ parent_maps=parent_maps->next; \ if (_ptr->parent_map != base_map) \ vm_map_deallocate(_ptr->parent_map); \ kfree_type(submap_map_t, _ptr); \ } \ } \ MACRO_RETURN(x); \ MACRO_END /* * Find the beginning of the region. */ vm_map_lock(src_map); /* * Lookup the original "src_addr_unaligned" rather than the truncated * "src_start", in case "src_start" falls in a non-map-aligned * map entry *before* the map entry that contains "src_addr_unaligned"... */ if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) { RETURN(KERN_INVALID_ADDRESS); } if (!tmp_entry->is_sub_map) { /* * ... but clip to the map-rounded "src_start" rather than * "src_addr_unaligned" to preserve map-alignment. We'll adjust the * first copy entry at the end, if needed. */ vm_map_clip_start(src_map, tmp_entry, src_start); } if (src_start < tmp_entry->vme_start) { /* * Move "src_start" up to the start of the * first map entry to copy. */ src_start = tmp_entry->vme_start; } /* set for later submap fix-up */ copy_addr = src_start; /* * Go through entries until we get to the end. */ while (TRUE) { vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */ vm_map_size_t src_size; /* Size of source * map entry (in both * maps) */ vm_object_t src_object; /* Object to copy */ vm_object_offset_t src_offset; vm_object_t new_copy_object;/* vm_object_copy_* result */ boolean_t src_needs_copy; /* Should source map * be made read-only * for copy-on-write? */ boolean_t new_entry_needs_copy; /* Will new entry be COW? */ boolean_t was_wired; /* Was source wired? */ boolean_t saved_used_for_jit; /* Saved used_for_jit. */ vm_map_version_t version; /* Version before locks * dropped to make copy */ kern_return_t result; /* Return value from * copy_strategically. 
*/ while (tmp_entry->is_sub_map) { vm_map_size_t submap_len; submap_map_t *ptr; ptr = kalloc_type(submap_map_t, Z_WAITOK); ptr->next = parent_maps; parent_maps = ptr; ptr->parent_map = src_map; ptr->base_start = src_start; ptr->base_end = src_end; submap_len = tmp_entry->vme_end - src_start; if (submap_len > (src_end - src_start)) { submap_len = src_end - src_start; } ptr->base_len = submap_len; src_start -= tmp_entry->vme_start; src_start += VME_OFFSET(tmp_entry); src_end = src_start + submap_len; src_map = VME_SUBMAP(tmp_entry); vm_map_lock(src_map); /* keep an outstanding reference for all maps in */ /* the parent maps tree except the base map */ vm_map_reference(src_map); vm_map_unlock(ptr->parent_map); if (!vm_map_lookup_entry( src_map, src_start, &tmp_entry)) { RETURN(KERN_INVALID_ADDRESS); } map_share = TRUE; if (!tmp_entry->is_sub_map) { vm_map_clip_start(src_map, tmp_entry, src_start); } src_entry = tmp_entry; } /* we are now in the lowest level submap... */ if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) && (VME_OBJECT(tmp_entry)->phys_contiguous)) { /* This is not supported for now. In the future */ /* we will need to detect the phys_contig */ /* condition and then upgrade copy_slowly */ /* to do physical copy from the device mem */ /* based object. We can piggy-back off of */ /* the was_wired boolean to set up the */ /* proper handling */ RETURN(KERN_PROTECTION_FAILURE); } /* * Create a new address map entry to hold the result. * Fill in the fields from the appropriate source entries. * We must unlock the source map to do this if we need * to allocate a map entry. */ if (new_entry == VM_MAP_ENTRY_NULL) { version.main_timestamp = src_map->timestamp; vm_map_unlock(src_map); new_entry = vm_map_copy_entry_create(copy); vm_map_lock(src_map); if ((version.main_timestamp + 1) != src_map->timestamp) { if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { RETURN(KERN_INVALID_ADDRESS); } if (!tmp_entry->is_sub_map) { vm_map_clip_start(src_map, tmp_entry, src_start); } continue; /* restart w/ new tmp_entry */ } } /* * Verify that the region can be read. */ if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE && !use_maxprot) || (src_entry->max_protection & VM_PROT_READ) == 0) { RETURN(KERN_PROTECTION_FAILURE); } /* * Clip against the endpoints of the entire region. */ vm_map_clip_end(src_map, src_entry, src_end); src_size = src_entry->vme_end - src_start; src_object = VME_OBJECT(src_entry); src_offset = VME_OFFSET(src_entry); was_wired = (src_entry->wired_count != 0); vm_map_entry_copy(src_map, new_entry, src_entry); if (new_entry->is_sub_map) { /* clear address space specifics */ new_entry->use_pmap = FALSE; } else { /* * We're dealing with a copy-on-write operation, * so the resulting mapping should not inherit the * original mapping's accounting settings. * "iokit_acct" should have been cleared in * vm_map_entry_copy(). * "use_pmap" should be reset to its default (TRUE) * so that the new mapping gets accounted for in * the task's memory footprint. */ assert(!new_entry->iokit_acct); new_entry->use_pmap = TRUE; } /* * Attempt non-blocking copy-on-write optimizations. */ /* * If we are destroying the source, and the object * is internal, we could move the object reference * from the source to the copy. The copy is * copy-on-write only if the source is. * We make another reference to the object, because * destroying the source entry will deallocate it.
* * This memory transfer has to be atomic, (to prevent * the VM object from being shared or copied while * it's being moved here), so we could only do this * if we won't have to unlock the VM map until the * original mapping has been fully removed. */ RestartCopy: if ((src_object == VM_OBJECT_NULL || (!was_wired && !map_share && !tmp_entry->is_shared && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) && vm_object_copy_quickly( VME_OBJECT(new_entry), src_offset, src_size, &src_needs_copy, &new_entry_needs_copy)) { new_entry->needs_copy = new_entry_needs_copy; /* * Handle copy-on-write obligations */ if (src_needs_copy && !tmp_entry->needs_copy) { vm_prot_t prot; prot = src_entry->protection & ~VM_PROT_WRITE; if (override_nx(src_map, VME_ALIAS(src_entry)) && prot) { prot |= VM_PROT_EXECUTE; } vm_object_pmap_protect( src_object, src_offset, src_size, (src_entry->is_shared ? PMAP_NULL : src_map->pmap), VM_MAP_PAGE_SIZE(src_map), src_entry->vme_start, prot); assert(tmp_entry->wired_count == 0); tmp_entry->needs_copy = TRUE; } /* * The map has never been unlocked, so it's safe * to move to the next entry rather than doing * another lookup. */ goto CopySuccessful; } entry_was_shared = tmp_entry->is_shared; /* * Take an object reference, so that we may * release the map lock(s). */ assert(src_object != VM_OBJECT_NULL); vm_object_reference(src_object); /* * Record the timestamp for later verification. * Unlock the map. */ version.main_timestamp = src_map->timestamp; vm_map_unlock(src_map); /* Increments timestamp once! */ saved_src_entry = src_entry; tmp_entry = VM_MAP_ENTRY_NULL; src_entry = VM_MAP_ENTRY_NULL; /* * Perform the copy */ if (was_wired || (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK && !(flags & VM_MAP_COPYIN_FORK)) || (debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) { CopySlowly: vm_object_lock(src_object); result = vm_object_copy_slowly( src_object, src_offset, src_size, THREAD_UNINT, &new_copy_object); /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */ saved_used_for_jit = new_entry->used_for_jit; VME_OBJECT_SET(new_entry, new_copy_object, false, 0); new_entry->used_for_jit = saved_used_for_jit; VME_OFFSET_SET(new_entry, src_offset - vm_object_trunc_page(src_offset)); new_entry->needs_copy = FALSE; } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && (entry_was_shared || map_share)) { vm_object_t new_object; vm_object_lock_shared(src_object); new_object = vm_object_copy_delayed( src_object, src_offset, src_size, TRUE); if (new_object == VM_OBJECT_NULL) { goto CopySlowly; } VME_OBJECT_SET(new_entry, new_object, false, 0); assert(new_entry->wired_count == 0); new_entry->needs_copy = TRUE; assert(!new_entry->iokit_acct); assert(new_object->purgable == VM_PURGABLE_DENY); assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry); result = KERN_SUCCESS; } else { vm_object_offset_t new_offset; new_offset = VME_OFFSET(new_entry); result = vm_object_copy_strategically(src_object, src_offset, src_size, (flags & VM_MAP_COPYIN_FORK), &new_copy_object, &new_offset, &new_entry_needs_copy); /* VME_OBJECT_SET will reset used_for_jit, so preserve it. 
*/ saved_used_for_jit = new_entry->used_for_jit; VME_OBJECT_SET(new_entry, new_copy_object, false, 0); new_entry->used_for_jit = saved_used_for_jit; if (new_offset != VME_OFFSET(new_entry)) { VME_OFFSET_SET(new_entry, new_offset); } new_entry->needs_copy = new_entry_needs_copy; } if (result == KERN_SUCCESS && ((preserve_purgeable && src_object->purgable != VM_PURGABLE_DENY) || new_entry->used_for_jit)) { /* * Purgeable objects should be COPY_NONE, true share; * this should be propagated to the copy. * * Also force mappings the pmap specially protects to * be COPY_NONE; trying to COW these mappings would * change the effective protections, which could have * side effects if the pmap layer relies on the * specified protections. */ vm_object_t new_object; new_object = VME_OBJECT(new_entry); assert(new_object != src_object); vm_object_lock(new_object); assert(os_ref_get_count_raw(&new_object->ref_count) == 1); assert(new_object->shadow == VM_OBJECT_NULL); assert(new_object->vo_copy == VM_OBJECT_NULL); assert(new_object->vo_owner == NULL); new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; if (preserve_purgeable && src_object->purgable != VM_PURGABLE_DENY) { VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE); /* start as non-volatile with no owner... */ VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE); vm_purgeable_nonvolatile_enqueue(new_object, NULL); /* ... and move to src_object's purgeable state */ if (src_object->purgable != VM_PURGABLE_NONVOLATILE) { int state; state = src_object->purgable; vm_object_purgable_control( new_object, VM_PURGABLE_SET_STATE_FROM_KERNEL, &state); } /* no pmap accounting for purgeable objects */ new_entry->use_pmap = FALSE; } vm_object_unlock(new_object); new_object = VM_OBJECT_NULL; } /* * Throw away the extra reference */ vm_object_deallocate(src_object); if (result != KERN_SUCCESS && result != KERN_MEMORY_RESTART_COPY) { vm_map_lock(src_map); RETURN(result); } /* * Verify that the map has not substantially * changed while the copy was being made. */ vm_map_lock(src_map); if ((version.main_timestamp + 1) == src_map->timestamp) { /* src_map hasn't changed: src_entry is still valid */ src_entry = saved_src_entry; goto VerificationSuccessful; } /* * Simple version comparison failed. * * Retry the lookup and verify that the * same object/offset are still present. * * [Note: a memory manager that colludes with * the calling task can detect that we have * cheated. While the map was unlocked, the * mapping could have been changed and restored.] */ if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { if (result != KERN_MEMORY_RESTART_COPY) { vm_object_deallocate(VME_OBJECT(new_entry)); VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0); /* reset accounting state */ new_entry->iokit_acct = FALSE; new_entry->use_pmap = TRUE; } RETURN(KERN_INVALID_ADDRESS); } src_entry = tmp_entry; vm_map_clip_start(src_map, src_entry, src_start); if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) && !use_maxprot) || ((src_entry->max_protection & VM_PROT_READ) == 0)) { goto VerificationFailed; } if (src_entry->vme_end < new_entry->vme_end) { /* * This entry might have been shortened * (vm_map_clip_end) or been replaced with * an entry that ends closer to "src_start" * than before. * Adjust "new_entry" accordingly; copying * less memory would be correct but we also * redo the copy (see below) if the new entry * no longer points at the same object/offset.
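 *
 * For example (hypothetical numbers): if "new_entry" covered
 * [0x4000, 0xc000) but the re-lookup finds "src_entry" now ending
 * at 0x8000, "new_entry->vme_end" is pulled back to 0x8000 and
 * "src_size" shrinks to match; the remaining [0x8000, 0xc000) is
 * picked up by the next loop iteration.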
*/ assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end, VM_MAP_COPY_PAGE_MASK(copy))); new_entry->vme_end = src_entry->vme_end; src_size = new_entry->vme_end - src_start; } else if (src_entry->vme_end > new_entry->vme_end) { /* * This entry might have been extended * (vm_map_entry_simplify() or coalesce) * or been replaced with an entry that ends farther * from "src_start" than before. * * We've called vm_object_copy_*() only on * the previous range, so we can't * just extend new_entry. We have to re-do * the copy based on the new entry as if it was * pointing at a different object/offset (see * "Verification failed" below). */ } if ((VME_OBJECT(src_entry) != src_object) || (VME_OFFSET(src_entry) != src_offset) || (src_entry->vme_end > new_entry->vme_end)) { /* * Verification failed. * * Start over with this top-level entry. */ VerificationFailed: ; vm_object_deallocate(VME_OBJECT(new_entry)); tmp_entry = src_entry; continue; } /* * Verification succeeded. */ VerificationSuccessful:; if (result == KERN_MEMORY_RESTART_COPY) { goto RestartCopy; } /* * Copy succeeded. */ CopySuccessful: ; /* * Link in the new copy entry. */ vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), new_entry); /* * Determine whether the entire region * has been copied. */ src_base = src_start; src_start = new_entry->vme_end; new_entry = VM_MAP_ENTRY_NULL; while ((src_start >= src_end) && (src_end != 0)) { submap_map_t *ptr; if (src_map == base_map) { /* back to the top */ break; } ptr = parent_maps; assert(ptr != NULL); parent_maps = parent_maps->next; /* fix up the damage we did in that submap */ vm_map_simplify_range(src_map, src_base, src_end); vm_map_unlock(src_map); vm_map_deallocate(src_map); vm_map_lock(ptr->parent_map); src_map = ptr->parent_map; src_base = ptr->base_start; src_start = ptr->base_start + ptr->base_len; src_end = ptr->base_end; if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry) && (src_end > src_start)) { RETURN(KERN_INVALID_ADDRESS); } kfree_type(submap_map_t, ptr); if (parent_maps == NULL) { map_share = FALSE; } src_entry = tmp_entry->vme_prev; } if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) && (src_start >= src_addr_unaligned + len) && (src_addr_unaligned + len != 0)) { /* * Stop copying now, even though we haven't reached * "src_end". We'll adjust the end of the last copy * entry at the end, if needed. * * If src_map's alignment is different from the * system's page-alignment, there could be * extra non-map-aligned map entries between * the original (non-rounded) "src_addr_unaligned + len" * and the rounded "src_end". * We do not want to copy those map entries since * they're not part of the copied range. */ break; } if ((src_start >= src_end) && (src_end != 0)) { break; } /* * Verify that there are no gaps in the region */ tmp_entry = src_entry->vme_next; if ((tmp_entry->vme_start != src_start) || (tmp_entry == vm_map_to_entry(src_map))) { RETURN(KERN_INVALID_ADDRESS); } } /* * If the source should be destroyed, do it now, since the * copy was successful.
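 *
 * In effect, a copyin with VM_MAP_COPYIN_SRC_DESTROY behaves like a
 * move: a (hypothetical) caller passing src_destroy == TRUE to
 * vm_map_copyin() gets a copy object back and the source range is
 * unmapped below, so a later vm_map_copyout() relocates the memory
 * rather than duplicating it.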
*/ if (src_destroy) { vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS; if (src_map == kernel_map) { remove_flags |= VM_MAP_REMOVE_KUNWIRE; } (void)vm_map_remove_and_unlock(src_map, vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)), src_end, remove_flags, KMEM_GUARD_NONE); } else { /* fix up the damage we did in the base map */ vm_map_simplify_range( src_map, vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)), vm_map_round_page(src_end, VM_MAP_PAGE_MASK(src_map))); vm_map_unlock(src_map); } tmp_entry = VM_MAP_ENTRY_NULL; if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT && VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) { vm_map_offset_t original_start, original_offset, original_end; assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK); /* adjust alignment of first copy_entry's "vme_start" */ tmp_entry = vm_map_copy_first_entry(copy); if (tmp_entry != vm_map_copy_to_entry(copy)) { vm_map_offset_t adjustment; original_start = tmp_entry->vme_start; original_offset = VME_OFFSET(tmp_entry); /* map-align the start of the first copy entry... */ adjustment = (tmp_entry->vme_start - vm_map_trunc_page( tmp_entry->vme_start, VM_MAP_PAGE_MASK(src_map))); tmp_entry->vme_start -= adjustment; VME_OFFSET_SET(tmp_entry, VME_OFFSET(tmp_entry) - adjustment); copy_addr -= adjustment; assert(tmp_entry->vme_start < tmp_entry->vme_end); /* ... adjust for mis-aligned start of copy range */ adjustment = (vm_map_trunc_page(copy->offset, PAGE_MASK) - vm_map_trunc_page(copy->offset, VM_MAP_PAGE_MASK(src_map))); if (adjustment) { assert(page_aligned(adjustment)); assert(adjustment < VM_MAP_PAGE_SIZE(src_map)); tmp_entry->vme_start += adjustment; VME_OFFSET_SET(tmp_entry, (VME_OFFSET(tmp_entry) + adjustment)); copy_addr += adjustment; assert(tmp_entry->vme_start < tmp_entry->vme_end); } /* * Assert that the adjustments haven't exposed * more than was originally copied... */ assert(tmp_entry->vme_start >= original_start); assert(VME_OFFSET(tmp_entry) >= original_offset); /* * ... and that it did not adjust outside of a * single 16K page. */ assert(vm_map_trunc_page(tmp_entry->vme_start, VM_MAP_PAGE_MASK(src_map)) == vm_map_trunc_page(original_start, VM_MAP_PAGE_MASK(src_map))); } /* adjust alignment of last copy_entry's "vme_end" */ tmp_entry = vm_map_copy_last_entry(copy); if (tmp_entry != vm_map_copy_to_entry(copy)) { vm_map_offset_t adjustment; original_end = tmp_entry->vme_end; /* map-align the end of the last copy entry... */ tmp_entry->vme_end = vm_map_round_page(tmp_entry->vme_end, VM_MAP_PAGE_MASK(src_map)); /* ... adjust for mis-aligned end of copy range */ adjustment = (vm_map_round_page((copy->offset + copy->size), VM_MAP_PAGE_MASK(src_map)) - vm_map_round_page((copy->offset + copy->size), PAGE_MASK)); if (adjustment) { assert(page_aligned(adjustment)); assert(adjustment < VM_MAP_PAGE_SIZE(src_map)); tmp_entry->vme_end -= adjustment; assert(tmp_entry->vme_start < tmp_entry->vme_end); } /* * Assert that the adjustments haven't exposed * more than was originally copied... */ assert(tmp_entry->vme_end <= original_end); /* * ... and that it did not adjust outside of a * single 16K page. */ assert(vm_map_round_page(tmp_entry->vme_end, VM_MAP_PAGE_MASK(src_map)) == vm_map_round_page(original_end, VM_MAP_PAGE_MASK(src_map))); } } /* Fix-up start and end points in copy.
This is necessary */ /* when the various entries in the copy object were picked */ /* up from different sub-maps */ tmp_entry = vm_map_copy_first_entry(copy); copy_size = 0; /* compute actual size */ while (tmp_entry != vm_map_copy_to_entry(copy)) { assert(VM_MAP_PAGE_ALIGNED( copy_addr + (tmp_entry->vme_end - tmp_entry->vme_start), MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK))); assert(VM_MAP_PAGE_ALIGNED( copy_addr, MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK))); /* * The copy_entries will be injected directly into the * destination map and might not be "map aligned" there... */ tmp_entry->map_aligned = FALSE; tmp_entry->vme_end = copy_addr + (tmp_entry->vme_end - tmp_entry->vme_start); tmp_entry->vme_start = copy_addr; assert(tmp_entry->vme_start < tmp_entry->vme_end); copy_addr += tmp_entry->vme_end - tmp_entry->vme_start; copy_size += tmp_entry->vme_end - tmp_entry->vme_start; tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next; } if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT && copy_size < copy->size) { /* * The actual size of the VM map copy is smaller than what * was requested by the caller. This must be because some * PAGE_SIZE-sized pages are missing at the end of the last * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range. * The caller might not have been aware of those missing * pages and might not want to be aware of it, which is * fine as long as they don't try to access (and crash on) * those missing pages. * Let's adjust the size of the "copy", to avoid failing * in vm_map_copyout() or vm_map_copy_overwrite(). */ assert(vm_map_round_page(copy_size, VM_MAP_PAGE_MASK(src_map)) == vm_map_round_page(copy->size, VM_MAP_PAGE_MASK(src_map))); copy->size = copy_size; } *copy_result = copy; return KERN_SUCCESS; #undef RETURN } kern_return_t vm_map_copy_extract( vm_map_t src_map, vm_map_address_t src_addr, vm_map_size_t len, boolean_t do_copy, vm_map_copy_t *copy_result, /* OUT */ vm_prot_t *cur_prot, /* IN/OUT */ vm_prot_t *max_prot, /* IN/OUT */ vm_inherit_t inheritance, vm_map_kernel_flags_t vmk_flags) { vm_map_copy_t copy; kern_return_t kr; vm_prot_t required_cur_prot, required_max_prot; /* * Check for copies of zero bytes. */ if (len == 0) { *copy_result = VM_MAP_COPY_NULL; return KERN_SUCCESS; } /* * Check that the end address doesn't overflow */ if (src_addr + len < src_addr) { return KERN_INVALID_ADDRESS; } if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) { return KERN_INVALID_ADDRESS; } if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) { DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len)); } required_cur_prot = *cur_prot; required_max_prot = *max_prot; /* * Allocate a header element for the list. * * Use the start and end in the header to * remember the endpoints prior to rounding. 
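 *
 * Illustrative note: "cur_prot" and "max_prot" are IN/OUT. A caller
 * (hypothetical) passing *cur_prot = VM_PROT_READ requires at least
 * read access on every entry of the range; on success the fields hold
 * the protections actually found, and the assertions below verify
 * that the requirement was honored.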
*/ copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST); copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable; copy->offset = 0; copy->size = len; kr = vm_map_remap_extract(src_map, src_addr, len, do_copy, /* copy */ copy, cur_prot, /* IN/OUT */ max_prot, /* IN/OUT */ inheritance, vmk_flags); if (kr != KERN_SUCCESS) { vm_map_copy_discard(copy); if ((kr == KERN_INVALID_ADDRESS || kr == KERN_INVALID_ARGUMENT) && src_map->terminated) { /* tell the caller that this address space is gone */ kr = KERN_TERMINATED; } return kr; } if (required_cur_prot != VM_PROT_NONE) { assert((*cur_prot & required_cur_prot) == required_cur_prot); assert((*max_prot & required_max_prot) == required_max_prot); } *copy_result = copy; return KERN_SUCCESS; } static void vm_map_fork_share( vm_map_t old_map, vm_map_entry_t old_entry, vm_map_t new_map) { vm_object_t object; vm_map_entry_t new_entry; /* * New sharing code. New map entry * references original object. Internal * objects use asynchronous copy algorithm for * future copies. First make sure we have * the right object. If we need a shadow, * or someone else already has one, then * make a new shadow and share it. */ if (!old_entry->is_sub_map) { object = VME_OBJECT(old_entry); } if (old_entry->is_sub_map) { assert(old_entry->wired_count == 0); #ifndef NO_NESTED_PMAP #if !PMAP_FORK_NEST if (old_entry->use_pmap) { kern_return_t result; result = pmap_nest(new_map->pmap, (VME_SUBMAP(old_entry))->pmap, (addr64_t)old_entry->vme_start, (uint64_t)(old_entry->vme_end - old_entry->vme_start)); if (result) { panic("vm_map_fork_share: pmap_nest failed!"); } } #endif /* !PMAP_FORK_NEST */ #endif /* NO_NESTED_PMAP */ } else if (object == VM_OBJECT_NULL) { object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end - old_entry->vme_start)); VME_OFFSET_SET(old_entry, 0); VME_OBJECT_SET(old_entry, object, false, 0); old_entry->use_pmap = TRUE; // assert(!old_entry->needs_copy); } else if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { /* * We are already using an asymmetric * copy, and therefore we already have * the right object. */ assert(!old_entry->needs_copy); } else if (old_entry->needs_copy || /* case 1 */ object->shadowed || /* case 2 */ (!object->true_share && /* case 3 */ !old_entry->is_shared && (object->vo_size > (vm_map_size_t)(old_entry->vme_end - old_entry->vme_start)))) { bool is_writable; /* * We need to create a shadow. * There are three cases here. * In the first case, we need to * complete a deferred symmetrical * copy that we participated in. * In the second and third cases, * we need to create the shadow so * that changes that we make to the * object do not interfere with * any symmetrical copies which * have occurred (case 2) or which * might occur (case 3). * * The first case is when we had * deferred shadow object creation * via the entry->needs_copy mechanism. * This mechanism only works when * only one entry points to the source * object, and we are about to create * a second entry pointing to the * same object. The problem is that * there is no way of mapping from * an object to the entries pointing * to it. (Deferred shadow creation * works with one entry because it occurs * at fault time, and we walk from the * entry to the object when handling * the fault.) * * The second case is when the object * to be shared has already been copied * with a symmetric copy, but we point * directly to the object without * needs_copy set in our entry. (This * can happen because different ranges * of an object can be pointed to by * different entries.
In particular, * a single entry pointing to an object * can be split by a call to vm_inherit, * which, combined with task_create, can * result in the different entries * having different needs_copy values.) * The shadowed flag in the object allows * us to detect this case. The problem * with this case is that if this object * has or will have shadows, then we * must not perform an asymmetric copy * of this object, since such a copy * allows the object to be changed, which * will break the previous symmetrical * copies (which rely upon the object * not changing). In a sense, the shadowed * flag says "don't change this object". * We fix this by creating a shadow * object for this object, and sharing * that. This works because we are free * to change the shadow object (and thus * to use an asymmetric copy strategy); * this is also semantically correct, * since this object is temporary, and * therefore a copy of the object is * as good as the object itself. (This * is not true for permanent objects, * since the pager needs to see changes, * which won't happen if the changes * are made to a copy.) * * The third case is when the object * to be shared has parts sticking * outside of the entry we're working * with, and thus may in the future * be subject to a symmetrical copy. * (This is a preemptive version of * case 2.) */ VME_OBJECT_SHADOW(old_entry, (vm_map_size_t) (old_entry->vme_end - old_entry->vme_start), vm_map_always_shadow(old_map)); /* * If we're making a shadow for other than * copy on write reasons, then we have * to remove write permission. */ is_writable = false; if (old_entry->protection & VM_PROT_WRITE) { is_writable = true; #if __arm64e__ } else if (old_entry->used_for_tpro) { is_writable = true; #endif /* __arm64e__ */ } if (!old_entry->needs_copy && is_writable) { vm_prot_t prot; if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, old_map, old_map->pmap, old_entry, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end, old_entry->protection); } prot = old_entry->protection & ~VM_PROT_WRITE; if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, old_map, old_map->pmap, old_entry, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end, prot); } if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) { prot |= VM_PROT_EXECUTE; } if (old_map->mapped_in_other_pmaps) { vm_object_pmap_protect( VME_OBJECT(old_entry), VME_OFFSET(old_entry), (old_entry->vme_end - old_entry->vme_start), PMAP_NULL, PAGE_SIZE, old_entry->vme_start, prot); } else { pmap_protect(old_map->pmap, old_entry->vme_start, old_entry->vme_end, prot); } } old_entry->needs_copy = FALSE; object = VME_OBJECT(old_entry); } /* * If object was using a symmetric copy strategy, * change its copy strategy to the default * asymmetric copy strategy, which is copy_delay * in the non-norma case and copy_call in the * norma case. Bump the reference count for the * new entry. */ if (old_entry->is_sub_map) { vm_map_reference(VME_SUBMAP(old_entry)); } else { vm_object_lock(object); vm_object_reference_locked(object); if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; } vm_object_unlock(object); } /* * Clone the entry, using object ref from above. * Mark both entries as shared. 
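 *
 * This is the path taken at fork() for VM_INHERIT_SHARE regions. A
 * (hypothetical) user-level setup that routes a range through here:
 *
 *	vm_inherit(mach_task_self(), addr, size, VM_INHERIT_SHARE);
 *
 * after which parent and child reference the same object and both
 * entries are marked is_shared.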
*/ new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */ vm_map_entry_copy(old_map, new_entry, old_entry); old_entry->is_shared = TRUE; new_entry->is_shared = TRUE; /* * We're dealing with a shared mapping, so the resulting mapping * should inherit some of the original mapping's accounting settings. * "iokit_acct" should have been cleared in vm_map_entry_copy(). * "use_pmap" should stay the same as before (if it hasn't been reset * to TRUE when we cleared "iokit_acct"). */ assert(!new_entry->iokit_acct); /* * If the old entry's inheritance is VM_INHERIT_NONE, * the new entry is for a corpse fork; remove the * write permission from the new entry. */ if (old_entry->inheritance == VM_INHERIT_NONE) { new_entry->protection &= ~VM_PROT_WRITE; new_entry->max_protection &= ~VM_PROT_WRITE; } /* * Insert the entry into the new map -- we * know we're inserting at the end of the new * map. */ vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry, VM_MAP_KERNEL_FLAGS_NONE); /* * Update the physical map */ if (old_entry->is_sub_map) { /* Bill Angell pmap support goes here */ } else { pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start, old_entry->vme_end - old_entry->vme_start, old_entry->vme_start); } } static boolean_t vm_map_fork_copy( vm_map_t old_map, vm_map_entry_t *old_entry_p, vm_map_t new_map, int vm_map_copyin_flags) { vm_map_entry_t old_entry = *old_entry_p; vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start; vm_map_offset_t start = old_entry->vme_start; vm_map_copy_t copy; vm_map_entry_t last = vm_map_last_entry(new_map); vm_map_unlock(old_map); /* * Use maxprot version of copyin because we * care about whether this memory can ever * be accessed, not just whether it's accessible * right now. */ vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT; if (vm_map_copyin_internal(old_map, start, entry_size, vm_map_copyin_flags, &copy) != KERN_SUCCESS) { /* * The map might have changed while it * was unlocked, check it again. Skip * any blank space or permanently * unreadable region. */ vm_map_lock(old_map); if (!vm_map_lookup_entry(old_map, start, &last) || (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) { last = last->vme_next; } *old_entry_p = last; /* * XXX For some error returns, want to * XXX skip to the next element. Note * that INVALID_ADDRESS and * PROTECTION_FAILURE are handled above. */ return FALSE; } /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy); /* * Insert the copy into the new map */ vm_map_copy_insert(new_map, last, copy); /* * Pick up the traversal at the end of * the copied region. */ vm_map_lock(old_map); start += entry_size; if (!vm_map_lookup_entry(old_map, start, &last)) { last = last->vme_next; } else { if (last->vme_start == start) { /* * No need to clip here and we don't * want to cause any unnecessary * unnesting...
*/ } else { vm_map_clip_start(old_map, last, start); } } *old_entry_p = last; return TRUE; } #if PMAP_FORK_NEST #define PMAP_FORK_NEST_DEBUG 0 static inline void vm_map_fork_unnest( pmap_t new_pmap, vm_map_offset_t pre_nested_start, vm_map_offset_t pre_nested_end, vm_map_offset_t start, vm_map_offset_t end) { kern_return_t kr; vm_map_offset_t nesting_mask, start_unnest, end_unnest; assertf(pre_nested_start <= pre_nested_end, "pre_nested start 0x%llx end 0x%llx", (uint64_t)pre_nested_start, (uint64_t)pre_nested_end); assertf(start <= end, "start 0x%llx end 0x%llx", (uint64_t) start, (uint64_t)end); if (pre_nested_start == pre_nested_end) { /* nothing was pre-nested: done */ return; } if (end <= pre_nested_start) { /* fully before pre-nested range: done */ return; } if (start >= pre_nested_end) { /* fully after pre-nested range: done */ return; } /* ignore parts of range outside of pre_nested range */ if (start < pre_nested_start) { start = pre_nested_start; } if (end > pre_nested_end) { end = pre_nested_end; } nesting_mask = pmap_shared_region_size_min(new_pmap) - 1; start_unnest = start & ~nesting_mask; end_unnest = (end + nesting_mask) & ~nesting_mask; kr = pmap_unnest(new_pmap, (addr64_t)start_unnest, (uint64_t)(end_unnest - start_unnest)); #if PMAP_FORK_NEST_DEBUG printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr); #endif /* PMAP_FORK_NEST_DEBUG */ assertf(kr == KERN_SUCCESS, "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x", (uint64_t)start, (uint64_t)end, new_pmap, (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest), kr); } #endif /* PMAP_FORK_NEST */ void vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map) { new_map->size_limit = old_map->size_limit; new_map->data_limit = old_map->data_limit; new_map->user_wire_limit = old_map->user_wire_limit; new_map->reserved_regions = old_map->reserved_regions; } /* * vm_map_fork: * * Create and return a new map based on the old * map, according to the inheritance values on the * regions in that map and the options. * * The source map must not be locked. */ vm_map_t vm_map_fork( ledger_t ledger, vm_map_t old_map, int options) { pmap_t new_pmap; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_size_t new_size = 0, entry_size; vm_map_entry_t new_entry; boolean_t src_needs_copy; boolean_t new_entry_needs_copy; boolean_t pmap_is64bit; int vm_map_copyin_flags; vm_inherit_t old_entry_inheritance; int map_create_options; kern_return_t footprint_collect_kr; if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE | VM_MAP_FORK_PRESERVE_PURGEABLE | VM_MAP_FORK_CORPSE_FOOTPRINT | VM_MAP_FORK_SHARE_IF_OWNED)) { /* unsupported option */ return VM_MAP_NULL; } pmap_is64bit = #if defined(__i386__) || defined(__x86_64__) old_map->pmap->pm_task_map != TASK_MAP_32BIT; #elif defined(__arm64__) old_map->pmap->is_64bit; #else #error Unknown architecture. #endif unsigned int pmap_flags = 0; pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0; #if defined(HAS_APPLE_PAC) pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0; #endif #if CONFIG_ROSETTA pmap_flags |= old_map->pmap->is_rosetta ? 
PMAP_CREATE_ROSETTA : 0; #endif #if PMAP_CREATE_FORCE_4K_PAGES if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE && PAGE_SIZE != FOURK_PAGE_SIZE) { pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES; } #endif /* PMAP_CREATE_FORCE_4K_PAGES */ new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags); if (new_pmap == NULL) { return VM_MAP_NULL; } vm_map_reference(old_map); vm_map_lock(old_map); map_create_options = 0; if (old_map->hdr.entries_pageable) { map_create_options |= VM_MAP_CREATE_PAGEABLE; } if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) { map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT; footprint_collect_kr = KERN_SUCCESS; } new_map = vm_map_create_options(new_pmap, old_map->min_offset, old_map->max_offset, map_create_options); /* inherit cs_enforcement */ vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement); vm_map_lock(new_map); vm_commit_pagezero_status(new_map); /* inherit the parent map's page size */ vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map)); /* inherit the parent rlimits */ vm_map_inherit_limits(new_map, old_map); #if CONFIG_MAP_RANGES /* inherit the parent map's VM ranges */ vm_map_range_fork(new_map, old_map); #endif #if CODE_SIGNING_MONITOR /* Prepare the monitor for the fork */ csm_fork_prepare(old_map->pmap, new_pmap); #endif #if PMAP_FORK_NEST /* * Pre-nest the shared region's pmap. */ vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0; pmap_fork_nest(old_map->pmap, new_pmap, &pre_nested_start, &pre_nested_end); #if PMAP_FORK_NEST_DEBUG printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end); #endif /* PMAP_FORK_NEST_DEBUG */ #endif /* PMAP_FORK_NEST */ for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) { /* * Abort any corpse collection if the system is shutting down. */ if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) && get_system_inshutdown()) { #if PMAP_FORK_NEST new_entry = vm_map_last_entry(new_map); if (new_entry == vm_map_to_entry(new_map)) { /* unnest all that was pre-nested */ vm_map_fork_unnest(new_pmap, pre_nested_start, pre_nested_end, vm_map_min(new_map), vm_map_max(new_map)); } else if (new_entry->vme_end < vm_map_max(new_map)) { /* unnest hole at the end, if pre-nested */ vm_map_fork_unnest(new_pmap, pre_nested_start, pre_nested_end, new_entry->vme_end, vm_map_max(new_map)); } #endif /* PMAP_FORK_NEST */ vm_map_corpse_footprint_collect_done(new_map); vm_map_unlock(new_map); vm_map_unlock(old_map); vm_map_deallocate(new_map); vm_map_deallocate(old_map); printf("Aborting corpse map due to system shutdown\n"); return VM_MAP_NULL; } entry_size = old_entry->vme_end - old_entry->vme_start; #if PMAP_FORK_NEST /* * Undo any unnecessary pre-nesting. 
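 *
 * The walk below compares each parent entry against the pre-nested
 * range: holes between entries, and entries that are not nested
 * submaps, get unnested from the child pmap; only "is_sub_map &&
 * use_pmap" entries keep their pre-nested translations.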
*/ vm_map_offset_t prev_end; if (old_entry == vm_map_first_entry(old_map)) { prev_end = vm_map_min(old_map); } else { prev_end = old_entry->vme_prev->vme_end; } if (prev_end < old_entry->vme_start) { /* unnest hole before this entry, if pre-nested */ vm_map_fork_unnest(new_pmap, pre_nested_start, pre_nested_end, prev_end, old_entry->vme_start); } if (old_entry->is_sub_map && old_entry->use_pmap) { /* keep this entry nested in the child */ #if PMAP_FORK_NEST_DEBUG printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end); #endif /* PMAP_FORK_NEST_DEBUG */ } else { /* undo nesting for this entry, if pre-nested */ vm_map_fork_unnest(new_pmap, pre_nested_start, pre_nested_end, old_entry->vme_start, old_entry->vme_end); } #endif /* PMAP_FORK_NEST */ old_entry_inheritance = old_entry->inheritance; /* * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option * share VM_INHERIT_NONE entries that are not backed by a * device pager. */ if (old_entry_inheritance == VM_INHERIT_NONE && (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) && (old_entry->protection & VM_PROT_READ) && !(!old_entry->is_sub_map && VME_OBJECT(old_entry) != NULL && VME_OBJECT(old_entry)->pager != NULL && is_device_pager_ops( VME_OBJECT(old_entry)->pager->mo_pager_ops))) { old_entry_inheritance = VM_INHERIT_SHARE; } if (old_entry_inheritance == VM_INHERIT_COPY && (options & VM_MAP_FORK_SHARE_IF_OWNED) && !old_entry->is_sub_map && VME_OBJECT(old_entry) != VM_OBJECT_NULL) { vm_object_t object; task_t owner; object = VME_OBJECT(old_entry); owner = VM_OBJECT_OWNER(object); if (owner != TASK_NULL && owner->map == old_map) { /* * This mapping points at a VM object owned * by the task being forked. * Some tools reporting memory accounting * info rely on the object ID, so share this * mapping instead of copying, to make the * corpse look exactly like the original * task in that respect. */ assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC); old_entry_inheritance = VM_INHERIT_SHARE; } } if (old_entry_inheritance != VM_INHERIT_NONE && (options & VM_MAP_FORK_CORPSE_FOOTPRINT) && footprint_collect_kr == KERN_SUCCESS) { /* * The corpse won't have old_map->pmap to query * footprint information, so collect that data now * and store it in new_map->vmmap_corpse_footprint * for later autopsy. */ footprint_collect_kr = vm_map_corpse_footprint_collect(old_map, old_entry, new_map); } switch (old_entry_inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: vm_map_fork_share(old_map, old_entry, new_map); new_size += entry_size; break; case VM_INHERIT_COPY: /* * Inline the copy_quickly case; * upon failure, fall back on call * to vm_map_fork_copy. */ if (old_entry->is_sub_map) { break; } if ((old_entry->wired_count != 0) || ((VME_OBJECT(old_entry) != NULL) && (VME_OBJECT(old_entry)->true_share))) { goto slow_vm_map_fork_copy; } new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */ vm_map_entry_copy(old_map, new_entry, old_entry); if (old_entry->vme_permanent) { /* inherit "permanent" on fork() */ new_entry->vme_permanent = TRUE; } if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) { new_map->jit_entry_exists = TRUE; } if (new_entry->is_sub_map) { /* clear address space specifics */ new_entry->use_pmap = FALSE; } else { /* * We're dealing with a copy-on-write operation, * so the resulting mapping should not inherit * the original mapping's accounting settings. 
* "iokit_acct" should have been cleared in * vm_map_entry_copy(). * "use_pmap" should be reset to its default * (TRUE) so that the new mapping gets * accounted for in the task's memory footprint. */ assert(!new_entry->iokit_acct); new_entry->use_pmap = TRUE; } if (!vm_object_copy_quickly( VME_OBJECT(new_entry), VME_OFFSET(old_entry), (old_entry->vme_end - old_entry->vme_start), &src_needs_copy, &new_entry_needs_copy)) { vm_map_entry_dispose(new_entry); goto slow_vm_map_fork_copy; } /* * Handle copy-on-write obligations */ if (src_needs_copy && !old_entry->needs_copy) { vm_prot_t prot; if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, old_map, old_map->pmap, old_entry, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end, old_entry->protection); } prot = old_entry->protection & ~VM_PROT_WRITE; if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) { prot |= VM_PROT_EXECUTE; } if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, old_map, old_map->pmap, old_entry, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end, prot); } vm_object_pmap_protect( VME_OBJECT(old_entry), VME_OFFSET(old_entry), (old_entry->vme_end - old_entry->vme_start), ((old_entry->is_shared || old_map->mapped_in_other_pmaps) ? PMAP_NULL : old_map->pmap), VM_MAP_PAGE_SIZE(old_map), old_entry->vme_start, prot); assert(old_entry->wired_count == 0); old_entry->needs_copy = TRUE; } new_entry->needs_copy = new_entry_needs_copy; /* * Insert the entry at the end * of the map. */ vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry, VM_MAP_KERNEL_FLAGS_NONE); new_size += entry_size; break; slow_vm_map_fork_copy: vm_map_copyin_flags = VM_MAP_COPYIN_FORK; if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) { vm_map_copyin_flags |= VM_MAP_COPYIN_PRESERVE_PURGEABLE; } if (vm_map_fork_copy(old_map, &old_entry, new_map, vm_map_copyin_flags)) { new_size += entry_size; } continue; } old_entry = old_entry->vme_next; } #if PMAP_FORK_NEST new_entry = vm_map_last_entry(new_map); if (new_entry == vm_map_to_entry(new_map)) { /* unnest all that was pre-nested */ vm_map_fork_unnest(new_pmap, pre_nested_start, pre_nested_end, vm_map_min(new_map), vm_map_max(new_map)); } else if (new_entry->vme_end < vm_map_max(new_map)) { /* unnest hole at the end, if pre-nested */ vm_map_fork_unnest(new_pmap, pre_nested_start, pre_nested_end, new_entry->vme_end, vm_map_max(new_map)); } #endif /* PMAP_FORK_NEST */ #if defined(__arm64__) pmap_insert_commpage(new_map->pmap); #endif /* __arm64__ */ new_map->size = new_size; if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) { vm_map_corpse_footprint_collect_done(new_map); } /* Propagate JIT entitlement for the pmap layer. */ if (pmap_get_jit_entitled(old_map->pmap)) { /* Tell the pmap that it supports JIT. */ pmap_set_jit_entitled(new_map->pmap); } /* Propagate TPRO settings for the pmap layer */ if (pmap_get_tpro(old_map->pmap)) { /* Tell the pmap that it supports TPRO */ pmap_set_tpro(new_map->pmap); } vm_map_unlock(new_map); vm_map_unlock(old_map); vm_map_deallocate(old_map); return new_map; } /* * vm_map_exec: * * Setup the "new_map" with the proper execution environment according * to the type of executable (platform, 64bit, chroot environment). * Map the comm page and shared region, etc... 
*/ kern_return_t vm_map_exec( vm_map_t new_map, task_t task, boolean_t is64bit, void *fsroot, cpu_type_t cpu, cpu_subtype_t cpu_subtype, boolean_t reslide, boolean_t is_driverkit, uint32_t rsr_version) { SHARED_REGION_TRACE_DEBUG( ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n", (void *)VM_KERNEL_ADDRPERM(current_task()), (void *)VM_KERNEL_ADDRPERM(new_map), (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, cpu_subtype)); (void) vm_commpage_enter(new_map, task, is64bit); (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version); SHARED_REGION_TRACE_DEBUG( ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n", (void *)VM_KERNEL_ADDRPERM(current_task()), (void *)VM_KERNEL_ADDRPERM(new_map), (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, cpu_subtype)); /* * Some devices have region(s) of memory that shouldn't get allocated by * user processes. The following code creates dummy vm_map_entry_t's for each * of the regions that needs to be reserved to prevent any allocations in * those regions. */ kern_return_t kr = KERN_FAILURE; vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(); vmk_flags.vmkf_beyond_max = true; const struct vm_reserved_region *regions = NULL; size_t num_regions = ml_get_vm_reserved_regions(is64bit, ®ions); assert((num_regions == 0) || (num_regions > 0 && regions != NULL)); for (size_t i = 0; i < num_regions; ++i) { vm_map_offset_t address = regions[i].vmrr_addr; kr = vm_map_enter( new_map, &address, regions[i].vmrr_size, (vm_map_offset_t)0, vmk_flags, VM_OBJECT_NULL, (vm_object_offset_t)0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_COPY); if (kr != KERN_SUCCESS) { panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr); } } new_map->reserved_regions = (num_regions ? TRUE : FALSE); return KERN_SUCCESS; } uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0; uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0; uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0; uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0; uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0; uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0; uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0; uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0; uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0; uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0; uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0; uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0; uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0; /* * vm_map_lookup_and_lock_object: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Returns the (object, offset, protection) for * this address, whether it is wired down, and whether * this map has the only reference to the data in question. * In order to later verify this lookup, a "version" * is returned. * If contended != NULL, *contended will be set to * true iff the thread had to spin or block to acquire * an exclusive lock. * * The map MUST be locked by the caller and WILL be * locked on exit. In order to guarantee the * existence of the returned object, it is returned * locked. 
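 *
 * Illustrative (hypothetical, simplified) fault-path usage; the exact
 * unlock sequence in the real fault handler is more involved:
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
 *	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot,
 *	    &wired, &fault_info, &real_map, NULL);
 *	if (kr == KERN_SUCCESS) {
 *		... resolve the fault against (object, offset) ...
 *		vm_object_unlock(object);
 *		if (real_map != map) {
 *			vm_map_unlock(real_map);
 *		}
 *		vm_map_unlock_read(map);
 *	}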
* * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ kern_return_t vm_map_lookup_and_lock_object( vm_map_t *var_map, /* IN/OUT */ vm_map_offset_t vaddr, vm_prot_t fault_type, int object_lock_type, vm_map_version_t *out_version, /* OUT */ vm_object_t *object, /* OUT */ vm_object_offset_t *offset, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired, /* OUT */ vm_object_fault_info_t fault_info, /* OUT */ vm_map_t *real_map, /* OUT */ bool *contended) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_map_t old_map = *var_map; vm_map_t cow_sub_map_parent = VM_MAP_NULL; vm_map_offset_t cow_parent_vaddr = 0; vm_map_offset_t old_start = 0; vm_map_offset_t old_end = 0; vm_prot_t prot; boolean_t mask_protections; boolean_t force_copy; boolean_t no_force_copy_if_executable; boolean_t submap_needed_copy; vm_prot_t original_fault_type; vm_map_size_t fault_page_mask; /* * VM_PROT_MASK means that the caller wants us to use "fault_type" * as a mask against the mapping's actual protections, not as an * absolute value. */ mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE; force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE; no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE; fault_type &= VM_PROT_ALL; original_fault_type = fault_type; if (contended) { *contended = false; } *real_map = map; fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK); vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask); RetryLookup: fault_type = original_fault_type; /* * If the map has an interesting hint, try it before calling * full blown lookup routine. */ entry = map->hint; if ((entry == vm_map_to_entry(map)) || (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) { vm_map_entry_t tmp_entry; /* * Entry was either not a valid hint, or the vaddr * was not contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) { if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { vm_map_unlock(cow_sub_map_parent); } if ((*real_map != map) && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } return KERN_INVALID_ADDRESS; } entry = tmp_entry; } if (map == old_map) { old_start = entry->vme_start; old_end = entry->vme_end; } /* * Handle submaps. Drop lock on upper map, submap is * returned locked. 
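 *
 * Lock discipline for the descent below: the submap is read-locked
 * before the parent lock is given up, "*real_map" tracks the map that
 * the pmap update should target, and "cow_sub_map_parent" keeps the
 * parent locked when a copy-on-write of a submap entry may have to be
 * installed back into it.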
*/ submap_needed_copy = FALSE; submap_recurse: if (entry->is_sub_map) { vm_map_offset_t local_vaddr; vm_map_offset_t end_delta; vm_map_offset_t start_delta; vm_map_offset_t top_entry_saved_start; vm_object_offset_t top_entry_saved_offset; vm_map_entry_t submap_entry, saved_submap_entry; vm_object_offset_t submap_entry_offset; vm_object_size_t submap_entry_size; vm_prot_t subentry_protection; vm_prot_t subentry_max_protection; boolean_t subentry_no_copy_on_read; boolean_t subentry_permanent; boolean_t subentry_csm_associated; #if __arm64e__ boolean_t subentry_used_for_tpro; #endif /* __arm64e__ */ boolean_t mapped_needs_copy = FALSE; vm_map_version_t version; assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map), "map %p (%d) entry %p submap %p (%d)\n", map, VM_MAP_PAGE_SHIFT(map), entry, VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry))); local_vaddr = vaddr; top_entry_saved_start = entry->vme_start; top_entry_saved_offset = VME_OFFSET(entry); if ((entry->use_pmap && !((fault_type & VM_PROT_WRITE) || force_copy))) { /* if real_map equals map we unlock below */ if ((*real_map != map) && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } *real_map = VME_SUBMAP(entry); } if (entry->needs_copy && ((fault_type & VM_PROT_WRITE) || force_copy)) { if (!mapped_needs_copy) { if (vm_map_lock_read_to_write(map)) { vm_map_lock_read(map); *real_map = map; goto RetryLookup; } vm_map_lock_read(VME_SUBMAP(entry)); *var_map = VME_SUBMAP(entry); cow_sub_map_parent = map; /* reset base to map before cow object */ /* this is the map which will accept */ /* the new cow object */ old_start = entry->vme_start; old_end = entry->vme_end; cow_parent_vaddr = vaddr; mapped_needs_copy = TRUE; } else { vm_map_lock_read(VME_SUBMAP(entry)); *var_map = VME_SUBMAP(entry); if ((cow_sub_map_parent != map) && (*real_map != map)) { vm_map_unlock(map); } } } else { if (entry->needs_copy) { submap_needed_copy = TRUE; } vm_map_lock_read(VME_SUBMAP(entry)); *var_map = VME_SUBMAP(entry); /* leave map locked if it is a target */ /* cow sub_map above otherwise, just */ /* follow the maps down to the object */ /* here we unlock knowing we are not */ /* revisiting the map. */ if ((*real_map != map) && (map != cow_sub_map_parent)) { vm_map_unlock_read(map); } } entry = NULL; map = *var_map; /* calculate the offset in the submap for vaddr */ local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset; assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask), "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n", (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask); RetrySubMap: if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) { if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { vm_map_unlock(cow_sub_map_parent); } if ((*real_map != map) && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } *real_map = map; return KERN_INVALID_ADDRESS; } /* find the attenuated shadow of the underlying object */ /* on our target map */ /* In English: the submap object may extend beyond the */ /* region mapped by the entry, or may only fill a portion */ /* of it. For our purposes, we only care if the object */ /* doesn't fill. In this case the area which will */ /* ultimately be clipped in the top map will only need */ /* to be as big as the portion of the underlying entry */ /* which is mapped */ start_delta = submap_entry->vme_start > top_entry_saved_offset ?
submap_entry->vme_start - top_entry_saved_offset : 0; end_delta = (top_entry_saved_offset + start_delta + (old_end - old_start)) <= submap_entry->vme_end ? 0 : (top_entry_saved_offset + (old_end - old_start)) - submap_entry->vme_end; old_start += start_delta; old_end -= end_delta; if (submap_entry->is_sub_map) { entry = submap_entry; vaddr = local_vaddr; goto submap_recurse; } if (((fault_type & VM_PROT_WRITE) || force_copy) && cow_sub_map_parent) { vm_object_t sub_object, copy_object; vm_object_offset_t copy_offset; vm_map_offset_t local_start; vm_map_offset_t local_end; boolean_t object_copied = FALSE; vm_object_offset_t object_copied_offset = 0; boolean_t object_copied_needs_copy = FALSE; kern_return_t kr = KERN_SUCCESS; if (vm_map_lock_read_to_write(map)) { vm_map_lock_read(map); old_start -= start_delta; old_end += end_delta; goto RetrySubMap; } sub_object = VME_OBJECT(submap_entry); if (sub_object == VM_OBJECT_NULL) { sub_object = vm_object_allocate( (vm_map_size_t) (submap_entry->vme_end - submap_entry->vme_start)); VME_OBJECT_SET(submap_entry, sub_object, false, 0); VME_OFFSET_SET(submap_entry, 0); assert(!submap_entry->is_sub_map); assert(submap_entry->use_pmap); } local_start = local_vaddr - (cow_parent_vaddr - old_start); local_end = local_vaddr + (old_end - cow_parent_vaddr); vm_map_clip_start(map, submap_entry, local_start); vm_map_clip_end(map, submap_entry, local_end); if (submap_entry->is_sub_map) { /* unnesting was done when clipping */ assert(!submap_entry->use_pmap); } /* This is the COW case, let's connect */ /* an entry in our space to the underlying */ /* object in the submap, bypassing the */ /* submap. */ submap_entry_offset = VME_OFFSET(submap_entry); submap_entry_size = submap_entry->vme_end - submap_entry->vme_start; if ((submap_entry->wired_count != 0 || sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) && (submap_entry->protection & VM_PROT_EXECUTE) && no_force_copy_if_executable) { // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy); if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { vm_map_unlock(cow_sub_map_parent); } if ((*real_map != map) && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } *real_map = map; ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */); vm_map_lock_write_to_read(map); kr = KERN_PROTECTION_FAILURE; DTRACE_VM4(submap_no_copy_executable, vm_map_t, map, vm_object_offset_t, submap_entry_offset, vm_object_size_t, submap_entry_size, int, kr); return kr; } if (submap_entry->wired_count != 0) { vm_object_reference(sub_object); assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)), "submap_entry %p offset 0x%llx\n", submap_entry, VME_OFFSET(submap_entry)); DTRACE_VM6(submap_copy_slowly, vm_map_t, cow_sub_map_parent, vm_map_offset_t, vaddr, vm_map_t, map, vm_object_size_t, submap_entry_size, int, submap_entry->wired_count, int, sub_object->copy_strategy); saved_submap_entry = submap_entry; version.main_timestamp = map->timestamp; vm_map_unlock(map); /* Increments timestamp by 1 */ submap_entry = VM_MAP_ENTRY_NULL; vm_object_lock(sub_object); kr = vm_object_copy_slowly(sub_object, submap_entry_offset, submap_entry_size, FALSE, &copy_object); object_copied = TRUE; object_copied_offset = 0; /* 4k: account for extra offset in physical page */
object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset); object_copied_needs_copy = FALSE; vm_object_deallocate(sub_object); vm_map_lock(map); if (kr != KERN_SUCCESS && kr != KERN_MEMORY_RESTART_COPY) { if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { vm_map_unlock(cow_sub_map_parent); } if ((*real_map != map) && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } *real_map = map; vm_object_deallocate(copy_object); copy_object = VM_OBJECT_NULL; ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */); vm_map_lock_write_to_read(map); DTRACE_VM4(submap_copy_error_slowly, vm_object_t, sub_object, vm_object_offset_t, submap_entry_offset, vm_object_size_t, submap_entry_size, int, kr); vm_map_lookup_and_lock_object_copy_slowly_error++; return kr; } if ((kr == KERN_SUCCESS) && (version.main_timestamp + 1) == map->timestamp) { submap_entry = saved_submap_entry; } else { saved_submap_entry = NULL; old_start -= start_delta; old_end += end_delta; vm_object_deallocate(copy_object); copy_object = VM_OBJECT_NULL; vm_map_lock_write_to_read(map); vm_map_lookup_and_lock_object_copy_slowly_restart++; goto RetrySubMap; } vm_map_lookup_and_lock_object_copy_slowly_count++; vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size; if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) { vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size; } } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { submap_entry_offset = VME_OFFSET(submap_entry); copy_object = VM_OBJECT_NULL; object_copied_offset = submap_entry_offset; object_copied_needs_copy = FALSE; DTRACE_VM6(submap_copy_strategically, vm_map_t, cow_sub_map_parent, vm_map_offset_t, vaddr, vm_map_t, map, vm_object_size_t, submap_entry_size, int, submap_entry->wired_count, int, sub_object->copy_strategy); kr = vm_object_copy_strategically( sub_object, submap_entry_offset, submap_entry->vme_end - submap_entry->vme_start, false, /* forking */ &copy_object, &object_copied_offset, &object_copied_needs_copy); if (kr == KERN_MEMORY_RESTART_COPY) { old_start -= start_delta; old_end += end_delta; vm_object_deallocate(copy_object); copy_object = VM_OBJECT_NULL; vm_map_lock_write_to_read(map); vm_map_lookup_and_lock_object_copy_strategically_restart++; goto RetrySubMap; } if (kr != KERN_SUCCESS) { if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { vm_map_unlock(cow_sub_map_parent); } if ((*real_map != map) && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } *real_map = map; vm_object_deallocate(copy_object); copy_object = VM_OBJECT_NULL; ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */); vm_map_lock_write_to_read(map); DTRACE_VM4(submap_copy_error_strategically, vm_object_t, sub_object, vm_object_offset_t, submap_entry_offset, vm_object_size_t, submap_entry_size, int, kr); vm_map_lookup_and_lock_object_copy_strategically_error++; return kr; } assert(copy_object != VM_OBJECT_NULL); assert(copy_object != sub_object); object_copied = TRUE; vm_map_lookup_and_lock_object_copy_strategically_count++; vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size; if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) { vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size; } }
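/*
 * Note: the branches above and below implement the three ways a COW
 * fault on a submap mapping gets resolved here: a wired sub_object
 * is copied eagerly via vm_object_copy_slowly(), a sub_object with a
 * non-symmetric copy strategy via vm_object_copy_strategically(),
 * and the remaining (symmetric, unwired) case below is resolved
 * lazily by marking the entry needs_copy and turning sub_object into
 * a shadowed copy-on-write source.
 */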
else { /* set up shadow object */ object_copied = FALSE; copy_object = sub_object; vm_object_lock(sub_object); vm_object_reference_locked(sub_object); VM_OBJECT_SET_SHADOWED(sub_object, TRUE); vm_object_unlock(sub_object); assert(submap_entry->wired_count == 0); submap_entry->needs_copy = TRUE; prot = submap_entry->protection; if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, map, map->pmap, submap_entry, (uint64_t)submap_entry->vme_start, (uint64_t)submap_entry->vme_end, prot); } prot = prot & ~VM_PROT_WRITE; if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, map, map->pmap, submap_entry, (uint64_t)submap_entry->vme_start, (uint64_t)submap_entry->vme_end, prot); } if (override_nx(old_map, VME_ALIAS(submap_entry)) && prot) { prot |= VM_PROT_EXECUTE; } vm_object_pmap_protect( sub_object, VME_OFFSET(submap_entry), submap_entry->vme_end - submap_entry->vme_start, (submap_entry->is_shared || map->mapped_in_other_pmaps) ? PMAP_NULL : map->pmap, VM_MAP_PAGE_SIZE(map), submap_entry->vme_start, prot); vm_map_lookup_and_lock_object_copy_shadow_count++; vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size; if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) { vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size; } } /* * Adjust the fault offset to the submap entry. */ copy_offset = (local_vaddr - submap_entry->vme_start + VME_OFFSET(submap_entry)); /* This works differently from the */ /* normal submap case. We go back */ /* to the parent of the cow map and*/ /* clip out the target portion of */ /* the sub_map, substituting the */ /* new copy object, */ subentry_protection = submap_entry->protection; subentry_max_protection = submap_entry->max_protection; subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read; subentry_permanent = submap_entry->vme_permanent; subentry_csm_associated = submap_entry->csm_associated; #if __arm64e__ subentry_used_for_tpro = submap_entry->used_for_tpro; #endif // __arm64e__ vm_map_unlock(map); submap_entry = NULL; /* not valid after map unlock */ local_start = old_start; local_end = old_end; map = cow_sub_map_parent; *var_map = cow_sub_map_parent; vaddr = cow_parent_vaddr; cow_sub_map_parent = NULL; if (!vm_map_lookup_entry(map, vaddr, &entry)) { if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { vm_map_unlock(cow_sub_map_parent); } if ((*real_map != map) && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } *real_map = map; vm_object_deallocate( copy_object); copy_object = VM_OBJECT_NULL; vm_map_lock_write_to_read(map); DTRACE_VM4(submap_lookup_post_unlock, uint64_t, (uint64_t)entry->vme_start, uint64_t, (uint64_t)entry->vme_end, vm_map_offset_t, vaddr, int, object_copied); return KERN_INVALID_ADDRESS; } /* clip out the portion of space */ /* mapped by the sub map which */ /* corresponds to the underlying */ /* object */ /* * Clip (and unnest) the smallest nested chunk * possible around the faulting address... */ local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1); local_end = local_start + pmap_shared_region_size_min(map->pmap); /* * ... but don't go beyond the "old_start" to "old_end" * range, to avoid spanning over another VM region * with a possibly different VM object and/or offset.
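* For example (hypothetical numbers, assuming a 32MB nesting
* granule): a fault at 0x10402c000 first yields the granule-aligned
* range [0x104000000, 0x106000000), which is then clamped below to
* the [old_start, old_end) range of the original top-level entry.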
*/ if (local_start < old_start) { local_start = old_start; } if (local_end > old_end) { local_end = old_end; } /* * Adjust copy_offset to the start of the range. */ copy_offset -= (vaddr - local_start); vm_map_clip_start(map, entry, local_start); vm_map_clip_end(map, entry, local_end); if (entry->is_sub_map) { /* unnesting was done when clipping */ assert(!entry->use_pmap); } /* substitute copy object for */ /* shared map entry */ vm_map_deallocate(VME_SUBMAP(entry)); assert(!entry->iokit_acct); entry->use_pmap = TRUE; VME_OBJECT_SET(entry, copy_object, false, 0); /* propagate the submap entry's protections */ if (entry->protection != VM_PROT_READ) { /* * Someone has already altered the top entry's * protections via vm_protect(VM_PROT_COPY). * Respect these new values and ignore the * submap entry's protections. */ } else { /* * Regular copy-on-write: propagate the submap * entry's protections to the top map entry. */ entry->protection |= subentry_protection; } entry->max_protection |= subentry_max_protection; /* propagate some attributes from subentry */ entry->vme_no_copy_on_read = subentry_no_copy_on_read; entry->vme_permanent = subentry_permanent; entry->csm_associated = subentry_csm_associated; #if __arm64e__ /* propagate TPRO iff the destination map has TPRO enabled */ if (subentry_used_for_tpro) { if (vm_map_tpro(map)) { entry->used_for_tpro = subentry_used_for_tpro; } else { /* "permanent" came from being TPRO */ entry->vme_permanent = FALSE; } } #endif /* __arm64e__ */ if ((entry->protection & VM_PROT_WRITE) && (entry->protection & VM_PROT_EXECUTE) && #if XNU_TARGET_OS_OSX map->pmap != kernel_pmap && (vm_map_cs_enforcement(map) #if __arm64__ || !VM_MAP_IS_EXOTIC(map) #endif /* __arm64__ */ ) && #endif /* XNU_TARGET_OS_OSX */ #if CODE_SIGNING_MONITOR (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) && #endif !(entry->used_for_jit) && VM_MAP_POLICY_WX_STRIP_X(map)) { DTRACE_VM3(cs_wx, uint64_t, (uint64_t)entry->vme_start, uint64_t, (uint64_t)entry->vme_end, vm_prot_t, entry->protection); printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, __LINE__, #if DEVELOPMENT || DEBUG (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, #else /* DEVELOPMENT || DEBUG */ (uint64_t)0, (uint64_t)0, #endif /* DEVELOPMENT || DEBUG */ entry->protection); entry->protection &= ~VM_PROT_EXECUTE; } if (object_copied) { VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset); entry->needs_copy = object_copied_needs_copy; entry->is_shared = FALSE; } else { assert(VME_OBJECT(entry) != VM_OBJECT_NULL); assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC); assert(entry->wired_count == 0); VME_OFFSET_SET(entry, copy_offset); entry->needs_copy = TRUE; if (map != old_map) { entry->is_shared = TRUE; } } if (entry->inheritance == VM_INHERIT_SHARE) { entry->inheritance = VM_INHERIT_COPY; } vm_map_lock_write_to_read(map); } else { if ((cow_sub_map_parent) && (cow_sub_map_parent != *real_map) && (cow_sub_map_parent != map)) { vm_map_unlock(cow_sub_map_parent); } entry = submap_entry; vaddr = local_vaddr; } } /* * Check whether this task is allowed to have * this page.
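* In outline: start from entry->protection, widen it where policy
* allows (override_nx(), TPRO write faults), optionally mask
* fault_type down to prot, then require
* (fault_type & prot) == fault_type, modulo the architecture-
* specific execute-fault carve-outs below.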
*/ prot = entry->protection; if (override_nx(old_map, VME_ALIAS(entry)) && prot) { /* * HACK -- if not a stack, then allow execution */ prot |= VM_PROT_EXECUTE; } #if __arm64e__ /* * If the entry we're dealing with is TPRO and we have a write * fault, inject VM_PROT_WRITE into protections. This allows us * to maintain RO permissions when not marked as TPRO. */ if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) { prot |= VM_PROT_WRITE; } #endif /* __arm64e__ */ if (mask_protections) { fault_type &= prot; if (fault_type == VM_PROT_NONE) { goto protection_failure; } } if (((fault_type & prot) != fault_type) #if __arm64__ /* prefetch abort in execute-only page */ && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE)) #elif defined(__x86_64__) /* Consider the UEXEC bit when handling an EXECUTE fault */ && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC)) #endif ) { protection_failure: if (*real_map != map) { vm_map_unlock(*real_map); } *real_map = map; if ((fault_type & VM_PROT_EXECUTE) && prot) { log_stack_execution_failure((addr64_t)vaddr, prot); } DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL); DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr); /* * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now. * * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0); */ return KERN_PROTECTION_FAILURE; } /* * If this page is not pageable, we have to get * it for all possible accesses. */ *wired = (entry->wired_count != 0); if (*wired) { fault_type = prot; } /* * If the entry was copy-on-write, we either ... */ if (entry->needs_copy) { /* * If we want to write the page, we may as well * handle that now since we've got the map locked. * * If we don't need to write the page, we just * demote the permissions allowed. */ if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) { /* * Make a new object, and place it in the * object chain. Note that no new references * have appeared -- one just moved from the * map to the new object. */ if (vm_map_lock_read_to_write(map)) { vm_map_lock_read(map); goto RetryLookup; } if (VME_OBJECT(entry)->shadowed == FALSE) { vm_object_lock(VME_OBJECT(entry)); VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE); vm_object_unlock(VME_OBJECT(entry)); } VME_OBJECT_SHADOW(entry, (vm_map_size_t) (entry->vme_end - entry->vme_start), vm_map_always_shadow(map)); entry->needs_copy = FALSE; vm_map_lock_write_to_read(map); } if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) { /* * We're attempting to read a copy-on-write * page -- don't allow writes. */ prot &= (~VM_PROT_WRITE); } } if (submap_needed_copy && (prot & VM_PROT_WRITE)) { /* * We went through a "needs_copy" submap without triggering * a copy, so granting write access to the page would bypass * that submap's "needs_copy". */ assert(!(fault_type & VM_PROT_WRITE)); assert(!*wired); assert(!force_copy); // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr); prot &= ~VM_PROT_WRITE; } /* * Create an object if necessary. 
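* (That is: an entry with no VM object yet receives a fresh
* zero-fill object covering the whole entry, and the entry's
* offset is reset to 0, as done below.)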
*/ if (VME_OBJECT(entry) == VM_OBJECT_NULL) { if (vm_map_lock_read_to_write(map)) { vm_map_lock_read(map); goto RetryLookup; } VME_OBJECT_SET(entry, vm_object_allocate( (vm_map_size_t)(entry->vme_end - entry->vme_start)), false, 0); VME_OFFSET_SET(entry, 0); assert(entry->use_pmap); vm_map_lock_write_to_read(map); } /* * Return the object/offset from this entry. If the entry * was copy-on-write or empty, it has been fixed up. Also * return the protection. */ *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry); *object = VME_OBJECT(entry); *out_prot = prot; KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0); if (fault_info) { /* ... the caller will change "interruptible" if needed */ fault_info->user_tag = VME_ALIAS(entry); fault_info->pmap_options = 0; if (entry->iokit_acct || (!entry->is_sub_map && !entry->use_pmap)) { fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT; } if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) { fault_info->behavior = entry->behavior; } fault_info->lo_offset = VME_OFFSET(entry); fault_info->hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry); fault_info->no_cache = entry->no_cache; fault_info->stealth = FALSE; fault_info->io_sync = FALSE; if (entry->used_for_jit || #if CODE_SIGNING_MONITOR (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) || #endif entry->vme_resilient_codesign) { fault_info->cs_bypass = TRUE; } else { fault_info->cs_bypass = FALSE; } fault_info->csm_associated = FALSE; #if CODE_SIGNING_MONITOR if (entry->csm_associated) { /* * The pmap layer will validate this page * before allowing it to be executed from. */ fault_info->csm_associated = TRUE; } #endif fault_info->mark_zf_absent = FALSE; fault_info->batch_pmap_op = FALSE; fault_info->resilient_media = entry->vme_resilient_media; fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug; fault_info->no_copy_on_read = entry->vme_no_copy_on_read; #if __arm64e__ fault_info->fi_used_for_tpro = entry->used_for_tpro; #else /* __arm64e__ */ fault_info->fi_used_for_tpro = FALSE; #endif if (entry->translated_allow_execute) { fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE; } } /* * Lock the object to prevent it from disappearing */ if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) { if (contended == NULL) { vm_object_lock(*object); } else { *contended = vm_object_lock_check_contended(*object); } } else { vm_object_lock_shared(*object); } /* * Save the version number */ out_version->main_timestamp = map->timestamp; return KERN_SUCCESS; } /* * vm_map_verify: * * Verifies that the map in question has not changed * since the given version. The map has to be locked * ("shared" mode is fine) before calling this function * and it will be returned locked too. 
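*
* Typical usage (illustrative sketch only; the surrounding
* fault-handling code shown here is hypothetical, not a fixed
* interface):
*
*	vm_map_version_t version;
*	... a lookup, e.g. vm_map_lookup_and_lock_object(), saves
*	    map->timestamp into "version" ...
*	... the map lock is dropped and later retaken ...
*	if (!vm_map_verify(map, &version)) {
*		... the map may have changed: redo the lookup ...
*	}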
*/ boolean_t vm_map_verify( vm_map_t map, vm_map_version_t *version) /* REF */ { boolean_t result; vm_map_lock_assert_held(map); result = (map->timestamp == version->main_timestamp); return result; } /* * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY * Goes away after regular vm_region_recurse function migrates to * 64 bits * vm_region_recurse: A form of vm_region which follows the * submaps in a target map * */ kern_return_t vm_map_region_recurse_64( vm_map_t map, vm_map_offset_ut *address_u, /* IN/OUT */ vm_map_size_ut *size_u, /* OUT */ natural_t *nesting_depth, /* IN/OUT */ vm_region_submap_info_64_t submap_info, /* IN/OUT */ mach_msg_type_number_t *count) /* IN/OUT */ { mach_msg_type_number_t original_count; vm_region_extended_info_data_t extended; vm_map_entry_t tmp_entry; vm_map_offset_t user_address; unsigned int user_max_depth; /* * "curr_entry" is the VM map entry preceding or including the * address we're looking for. * "curr_map" is the map or sub-map containing "curr_entry". * "curr_address" is the equivalent of the top map's "user_address" * in the current map. * "curr_offset" is the cumulated offset of "curr_map" in the * target task's address space. * "curr_depth" is the depth of "curr_map" in the chain of * sub-maps. * * "curr_max_below" and "curr_max_above" limit the range (around * "curr_address") we should take into account in the current (sub)map. * They limit the range to what's visible through the map entries * we've traversed from the top map to the current map. * */ vm_map_entry_t curr_entry; vm_map_address_t curr_address; vm_map_offset_t curr_offset; vm_map_t curr_map; unsigned int curr_depth; vm_map_offset_t curr_max_below, curr_max_above; vm_map_offset_t curr_skip; /* * "next_" is the same as "curr_" but for the VM region immediately * after the address we're looking for. We need to keep track of this * too because we want to return info about that region if the * address we're looking for is not mapped. 
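* (For instance, if "user_address" falls in a hole between two
* mappings, "curr_entry" ends up NULL and the info returned below
* describes that "next_" region instead.)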
*/ vm_map_entry_t next_entry; vm_map_offset_t next_offset; vm_map_offset_t next_address; vm_map_t next_map; unsigned int next_depth; vm_map_offset_t next_max_below, next_max_above; vm_map_offset_t next_skip; boolean_t look_for_pages; vm_region_submap_short_info_64_t short_info; boolean_t do_region_footprint; int effective_page_size, effective_page_shift; boolean_t submap_needed_copy; if (map == VM_MAP_NULL) { /* no address space to work on */ return KERN_INVALID_ARGUMENT; } user_address = vm_sanitize_addr(map, *address_u); effective_page_shift = vm_self_region_page_shift(map); effective_page_size = (1 << effective_page_shift); if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) { /* * "info" structure is not big enough and * would overflow */ return KERN_INVALID_ARGUMENT; } do_region_footprint = task_self_region_footprint(); original_count = *count; if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) { *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; look_for_pages = FALSE; short_info = (vm_region_submap_short_info_64_t) submap_info; submap_info = NULL; } else { look_for_pages = TRUE; *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64; short_info = NULL; if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64; } if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) { *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; } } user_max_depth = *nesting_depth; submap_needed_copy = FALSE; if (not_in_kdp) { vm_map_lock_read(map); } recurse_again: curr_entry = NULL; curr_map = map; curr_address = user_address; curr_offset = 0; curr_skip = 0; curr_depth = 0; curr_max_above = ((vm_map_offset_t) -1) - curr_address; curr_max_below = curr_address; next_entry = NULL; next_map = NULL; next_address = 0; next_offset = 0; next_skip = 0; next_depth = 0; next_max_above = (vm_map_offset_t) -1; next_max_below = (vm_map_offset_t) -1; for (;;) { if (vm_map_lookup_entry(curr_map, curr_address, &tmp_entry)) { /* tmp_entry contains the address we're looking for */ curr_entry = tmp_entry; } else { vm_map_offset_t skip; /* * The address is not mapped. "tmp_entry" is the * map entry preceding the address. We want the next * one, if it exists. */ curr_entry = tmp_entry->vme_next; if (curr_entry == vm_map_to_entry(curr_map) || (curr_entry->vme_start >= curr_address + curr_max_above)) { /* no next entry at this level: stop looking */ if (not_in_kdp) { vm_map_unlock_read(curr_map); } curr_entry = NULL; curr_map = NULL; curr_skip = 0; curr_offset = 0; curr_depth = 0; curr_max_above = 0; curr_max_below = 0; break; } /* adjust current address and offset */ skip = curr_entry->vme_start - curr_address; curr_address = curr_entry->vme_start; curr_skip += skip; curr_offset += skip; curr_max_above -= skip; curr_max_below = 0; } /* * Is the next entry at this level closer to the address (or * deeper in the submap chain) than the one we had * so far ? */ tmp_entry = curr_entry->vme_next; if (tmp_entry == vm_map_to_entry(curr_map)) { /* no next entry at this level */ } else if (tmp_entry->vme_start >= curr_address + curr_max_above) { /* * tmp_entry is beyond the scope of what we mapped of * this submap in the upper level: ignore it. */ } else if ((next_entry == NULL) || (tmp_entry->vme_start + curr_offset <= next_entry->vme_start + next_offset)) { /* * We didn't have a "next_entry" or this one is * closer to the address we're looking for: * use this "tmp_entry" as the new "next_entry". 
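* ("vme_start + offset" sums are compared here so that candidates
* from different submap depths are both expressed in the top map's
* address space before deciding which one is closer.)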
*/ if (next_entry != NULL) { /* unlock the last "next_map" */ if (next_map != curr_map && not_in_kdp) { vm_map_unlock_read(next_map); } } next_entry = tmp_entry; next_map = curr_map; next_depth = curr_depth; next_address = next_entry->vme_start; next_skip = curr_skip; next_skip += (next_address - curr_address); next_offset = curr_offset; next_offset += (next_address - curr_address); next_max_above = MIN(next_max_above, curr_max_above); next_max_above = MIN(next_max_above, next_entry->vme_end - next_address); next_max_below = MIN(next_max_below, curr_max_below); next_max_below = MIN(next_max_below, next_address - next_entry->vme_start); } /* * "curr_max_{above,below}" allow us to keep track of the * portion of the submap that is actually mapped at this level: * the rest of that submap is irrelevant to us, since it's not * mapped here. * The relevant portion of the map starts at * "VME_OFFSET(curr_entry)" up to the size of "curr_entry". */ curr_max_above = MIN(curr_max_above, curr_entry->vme_end - curr_address); curr_max_below = MIN(curr_max_below, curr_address - curr_entry->vme_start); if (!curr_entry->is_sub_map || curr_depth >= user_max_depth) { /* * We hit a leaf map or we reached the maximum depth * we could, so stop looking. Keep the current map * locked. */ break; } /* * Get down to the next submap level. */ if (curr_entry->needs_copy) { /* everything below this is effectively copy-on-write */ submap_needed_copy = TRUE; } /* * Lock the next level and unlock the current level, * unless we need to keep it locked to access the "next_entry" * later. */ if (not_in_kdp) { vm_map_lock_read(VME_SUBMAP(curr_entry)); } if (curr_map == next_map) { /* keep "next_map" locked in case we need it */ } else { /* release this map */ if (not_in_kdp) { vm_map_unlock_read(curr_map); } } /* * Adjust the offset. "curr_entry" maps the submap * at relative address "curr_entry->vme_start" in the * curr_map but skips the first "VME_OFFSET(curr_entry)" * bytes of the submap. * "curr_offset" always represents the offset of a virtual * address in the curr_map relative to the absolute address * space (i.e. the top-level VM map). */ curr_offset += (VME_OFFSET(curr_entry) - curr_entry->vme_start); curr_address = user_address + curr_offset; /* switch to the submap */ curr_map = VME_SUBMAP(curr_entry); curr_depth++; curr_entry = NULL; } // LP64todo: all the current tools are 32bit, obviously never worked for 64b // so probably should be a real 32b ID vs. ptr. // Current users just check for equality if (curr_entry == NULL) { /* no VM region contains the address... */ if (do_region_footprint && /* we want footprint numbers */ next_entry == NULL && /* & there are no more regions */ /* & we haven't already provided our fake region: */ user_address <= vm_map_last_entry(map)->vme_end) { ledger_amount_t ledger_resident, ledger_compressed; /* * Add a fake memory region to account for * purgeable and/or ledger-tagged memory that * counts towards this task's memory footprint, * i.e. the resident/compressed pages of non-volatile * objects owned by that task. 
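* The fake region is reported at the very end of the address
* space, sized from the ledger counts gathered below, so that
* footprint tools iterating with vm_region_recurse() pick it up
* after the last real mapping.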
task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed); if (ledger_resident + ledger_compressed == 0) { /* no purgeable memory usage to report */ return KERN_INVALID_ADDRESS; } /* fake region to show nonvolatile footprint */ if (look_for_pages) { submap_info->protection = VM_PROT_DEFAULT; submap_info->max_protection = VM_PROT_DEFAULT; submap_info->inheritance = VM_INHERIT_DEFAULT; submap_info->offset = 0; submap_info->user_tag = -1; submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size); submap_info->pages_shared_now_private = 0; submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size); submap_info->pages_dirtied = submap_info->pages_resident; submap_info->ref_count = 1; submap_info->shadow_depth = 0; submap_info->external_pager = 0; submap_info->share_mode = SM_PRIVATE; if (submap_needed_copy) { submap_info->share_mode = SM_COW; } submap_info->is_submap = 0; submap_info->behavior = VM_BEHAVIOR_DEFAULT; submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); submap_info->user_wired_count = 0; submap_info->pages_reusable = 0; } else { short_info->user_tag = -1; short_info->offset = 0; short_info->protection = VM_PROT_DEFAULT; short_info->inheritance = VM_INHERIT_DEFAULT; short_info->max_protection = VM_PROT_DEFAULT; short_info->behavior = VM_BEHAVIOR_DEFAULT; short_info->user_wired_count = 0; short_info->is_submap = 0; short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); short_info->external_pager = 0; short_info->shadow_depth = 0; short_info->share_mode = SM_PRIVATE; if (submap_needed_copy) { short_info->share_mode = SM_COW; } short_info->ref_count = 1; } *nesting_depth = 0; *address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end); *size_u = vm_sanitize_wrap_size(ledger_resident + ledger_compressed); return KERN_SUCCESS; } if (next_entry == NULL) { /* ... and no VM region follows it either */ return KERN_INVALID_ADDRESS; } /* ... gather info about the next VM region */ curr_entry = next_entry; curr_map = next_map; /* still locked ... */ curr_address = next_address; curr_skip = next_skip; curr_offset = next_offset; curr_depth = next_depth; curr_max_above = next_max_above; curr_max_below = next_max_below; } else { /* we won't need "next_entry" after all */ if (next_entry != NULL) { /* release "next_map" */ if (next_map != curr_map && not_in_kdp) { vm_map_unlock_read(next_map); } } } next_entry = NULL; next_map = NULL; next_offset = 0; next_skip = 0; next_depth = 0; next_max_below = -1; next_max_above = -1; if (curr_entry->is_sub_map && curr_depth < user_max_depth) { /* * We're not as deep as we could be: we must have * gone back up after not finding anything mapped * below the original top-level map entry's range. * Let's move "curr_address" forward and recurse again.
*/ user_address = curr_address; goto recurse_again; } *nesting_depth = curr_depth; *address_u = vm_sanitize_wrap_addr( user_address + curr_skip - curr_max_below); *size_u = vm_sanitize_wrap_size(curr_max_above + curr_max_below); if (look_for_pages) { submap_info->user_tag = VME_ALIAS(curr_entry); submap_info->offset = VME_OFFSET(curr_entry); submap_info->protection = curr_entry->protection; submap_info->inheritance = curr_entry->inheritance; submap_info->max_protection = curr_entry->max_protection; submap_info->behavior = curr_entry->behavior; submap_info->user_wired_count = curr_entry->user_wired_count; submap_info->is_submap = curr_entry->is_sub_map; if (curr_entry->is_sub_map) { submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry)); } else { submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry)); } } else { short_info->user_tag = VME_ALIAS(curr_entry); short_info->offset = VME_OFFSET(curr_entry); short_info->protection = curr_entry->protection; short_info->inheritance = curr_entry->inheritance; short_info->max_protection = curr_entry->max_protection; short_info->behavior = curr_entry->behavior; short_info->user_wired_count = curr_entry->user_wired_count; short_info->is_submap = curr_entry->is_sub_map; if (curr_entry->is_sub_map) { short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry)); } else { short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry)); } } extended.pages_resident = 0; extended.pages_swapped_out = 0; extended.pages_shared_now_private = 0; extended.pages_dirtied = 0; extended.pages_reusable = 0; extended.external_pager = 0; extended.shadow_depth = 0; extended.share_mode = SM_EMPTY; extended.ref_count = 0; if (not_in_kdp) { if (!curr_entry->is_sub_map) { vm_map_offset_t range_start, range_end; range_start = MAX((curr_address - curr_max_below), curr_entry->vme_start); range_end = MIN((curr_address + curr_max_above), curr_entry->vme_end); vm_map_region_walk(curr_map, range_start, curr_entry, (VME_OFFSET(curr_entry) + (range_start - curr_entry->vme_start)), range_end - range_start, &extended, look_for_pages, VM_REGION_EXTENDED_INFO_COUNT); if (submap_needed_copy) { extended.share_mode = SM_COW; } } else { if (curr_entry->use_pmap) { extended.share_mode = SM_TRUESHARED; } else { extended.share_mode = SM_PRIVATE; } extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt); } } if (look_for_pages) { submap_info->pages_resident = extended.pages_resident; submap_info->pages_swapped_out = extended.pages_swapped_out; submap_info->pages_shared_now_private = extended.pages_shared_now_private; submap_info->pages_dirtied = extended.pages_dirtied; submap_info->external_pager = extended.external_pager; submap_info->shadow_depth = extended.shadow_depth; submap_info->share_mode = extended.share_mode; submap_info->ref_count = extended.ref_count; if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { submap_info->pages_reusable = extended.pages_reusable; } if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) { if (curr_entry->is_sub_map) { submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry)); } else if (VME_OBJECT(curr_entry)) { submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry)); } else { submap_info->object_id_full = 0ull; } } } else { short_info->external_pager = extended.external_pager; short_info->shadow_depth = extended.shadow_depth; short_info->share_mode = extended.share_mode; short_info->ref_count = extended.ref_count; } if (not_in_kdp) { 
vm_map_unlock_read(curr_map); } return KERN_SUCCESS; } /* * vm_region: * * User call to obtain information about a region in * a task's address map. Currently, only one flavor is * supported. * * XXX The reserved and behavior fields cannot be filled * in until the vm merge from the IK is completed, and * vm_reserve is implemented. */ kern_return_t vm_map_region( vm_map_t map, vm_map_offset_ut *address_u, /* IN/OUT */ vm_map_size_ut *size_u, /* OUT */ vm_region_flavor_t flavor, /* IN */ vm_region_info_t info, /* OUT */ mach_msg_type_number_t *count, /* IN/OUT */ mach_port_t *object_name) /* OUT */ { vm_map_entry_t tmp_entry; vm_map_entry_t entry; vm_map_offset_t start; if (map == VM_MAP_NULL) { return KERN_INVALID_ARGUMENT; } start = vm_sanitize_addr(map, *address_u); switch (flavor) { case VM_REGION_BASIC_INFO: /* legacy for old 32-bit objects info */ { vm_region_basic_info_t basic; if (*count < VM_REGION_BASIC_INFO_COUNT) { return KERN_INVALID_ARGUMENT; } basic = (vm_region_basic_info_t) info; *count = VM_REGION_BASIC_INFO_COUNT; vm_map_lock_read(map); if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } } else { entry = tmp_entry; } start = entry->vme_start; basic->offset = (uint32_t)VME_OFFSET(entry); basic->protection = entry->protection; basic->inheritance = entry->inheritance; basic->max_protection = entry->max_protection; basic->behavior = entry->behavior; basic->user_wired_count = entry->user_wired_count; basic->reserved = entry->is_sub_map; *address_u = vm_sanitize_wrap_addr(start); *size_u = vm_sanitize_wrap_size(entry->vme_end - start); if (object_name) { *object_name = IP_NULL; } if (entry->is_sub_map) { basic->shared = FALSE; } else { basic->shared = entry->is_shared; } vm_map_unlock_read(map); return KERN_SUCCESS; } case VM_REGION_BASIC_INFO_64: { vm_region_basic_info_64_t basic; if (*count < VM_REGION_BASIC_INFO_COUNT_64) { return KERN_INVALID_ARGUMENT; } basic = (vm_region_basic_info_64_t) info; *count = VM_REGION_BASIC_INFO_COUNT_64; vm_map_lock_read(map); if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } } else { entry = tmp_entry; } start = entry->vme_start; basic->offset = VME_OFFSET(entry); basic->protection = entry->protection; basic->inheritance = entry->inheritance; basic->max_protection = entry->max_protection; basic->behavior = entry->behavior; basic->user_wired_count = entry->user_wired_count; basic->reserved = entry->is_sub_map; *address_u = vm_sanitize_wrap_addr(start); *size_u = vm_sanitize_wrap_size(entry->vme_end - start); if (object_name) { *object_name = IP_NULL; } if (entry->is_sub_map) { basic->shared = FALSE; } else { basic->shared = entry->is_shared; } vm_map_unlock_read(map); return KERN_SUCCESS; } case VM_REGION_EXTENDED_INFO: if (*count < VM_REGION_EXTENDED_INFO_COUNT) { return KERN_INVALID_ARGUMENT; } OS_FALLTHROUGH; case VM_REGION_EXTENDED_INFO__legacy: { vm_region_extended_info_t extended; mach_msg_type_number_t original_count; int effective_page_size, effective_page_shift; if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) { return KERN_INVALID_ARGUMENT; } extended = (vm_region_extended_info_t) info; effective_page_shift = vm_self_region_page_shift(map); effective_page_size = (1 << effective_page_shift); vm_map_lock_read(map); if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == 
vm_map_to_entry(map)) { vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } } else { entry = tmp_entry; } start = entry->vme_start; extended->protection = entry->protection; extended->user_tag = VME_ALIAS(entry); extended->pages_resident = 0; extended->pages_swapped_out = 0; extended->pages_shared_now_private = 0; extended->pages_dirtied = 0; extended->external_pager = 0; extended->shadow_depth = 0; original_count = *count; if (flavor == VM_REGION_EXTENDED_INFO__legacy) { *count = VM_REGION_EXTENDED_INFO_COUNT__legacy; } else { extended->pages_reusable = 0; *count = VM_REGION_EXTENDED_INFO_COUNT; } vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count); if (object_name) { *object_name = IP_NULL; } *address_u = vm_sanitize_wrap_addr(start); *size_u = vm_sanitize_wrap_size(entry->vme_end - start); vm_map_unlock_read(map); return KERN_SUCCESS; } case VM_REGION_TOP_INFO: { vm_region_top_info_t top; if (*count < VM_REGION_TOP_INFO_COUNT) { return KERN_INVALID_ARGUMENT; } top = (vm_region_top_info_t) info; *count = VM_REGION_TOP_INFO_COUNT; vm_map_lock_read(map); if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } } else { entry = tmp_entry; } start = entry->vme_start; top->private_pages_resident = 0; top->shared_pages_resident = 0; vm_map_region_top_walk(entry, top); if (object_name) { *object_name = IP_NULL; } *address_u = vm_sanitize_wrap_addr(start); *size_u = vm_sanitize_wrap_size(entry->vme_end - start); vm_map_unlock_read(map); return KERN_SUCCESS; } default: return KERN_INVALID_ARGUMENT; } } #define OBJ_RESIDENT_COUNT(obj, entry_size) \ MIN((entry_size), \ ((obj)->all_reusable ? 
\ (obj)->wired_page_count : \ (obj)->resident_page_count - (obj)->reusable_page_count)) void vm_map_region_top_walk( vm_map_entry_t entry, vm_region_top_info_t top) { if (entry->is_sub_map || VME_OBJECT(entry) == 0) { top->share_mode = SM_EMPTY; top->ref_count = 0; top->obj_id = 0; return; } { struct vm_object *obj, *tmp_obj; int ref_count; uint32_t entry_size; entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64); obj = VME_OBJECT(entry); vm_object_lock(obj); if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 && obj->paging_in_progress) { ref_count--; } assert(obj->reusable_page_count <= obj->resident_page_count); if (obj->shadow) { if (ref_count == 1) { top->private_pages_resident = OBJ_RESIDENT_COUNT(obj, entry_size); } else { top->shared_pages_resident = OBJ_RESIDENT_COUNT(obj, entry_size); } top->ref_count = ref_count; top->share_mode = SM_COW; while ((tmp_obj = obj->shadow)) { vm_object_lock(tmp_obj); vm_object_unlock(obj); obj = tmp_obj; if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 && obj->paging_in_progress) { ref_count--; } assert(obj->reusable_page_count <= obj->resident_page_count); top->shared_pages_resident += OBJ_RESIDENT_COUNT(obj, entry_size); top->ref_count += ref_count - 1; } } else { if (entry->superpage_size) { top->share_mode = SM_LARGE_PAGE; top->shared_pages_resident = 0; top->private_pages_resident = entry_size; } else if (entry->needs_copy) { top->share_mode = SM_COW; top->shared_pages_resident = OBJ_RESIDENT_COUNT(obj, entry_size); } else { if (ref_count == 1 || (ref_count == 2 && obj->named)) { top->share_mode = SM_PRIVATE; top->private_pages_resident = OBJ_RESIDENT_COUNT(obj, entry_size); } else { top->share_mode = SM_SHARED; top->shared_pages_resident = OBJ_RESIDENT_COUNT(obj, entry_size); } } top->ref_count = ref_count; } vm_object_unlock(obj); /* XXX K64: obj_id will be truncated */ top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj); } } void vm_map_region_walk( vm_map_t map, vm_map_offset_t va, vm_map_entry_t entry, vm_object_offset_t offset, vm_object_size_t range, vm_region_extended_info_t extended, boolean_t look_for_pages, mach_msg_type_number_t count) { struct vm_object *obj, *tmp_obj; vm_map_offset_t last_offset; int i; int ref_count; struct vm_object *shadow_object; unsigned short shadow_depth; boolean_t do_region_footprint; int effective_page_size, effective_page_shift; vm_map_offset_t effective_page_mask; do_region_footprint = task_self_region_footprint(); if ((entry->is_sub_map) || (VME_OBJECT(entry) == 0) || (VME_OBJECT(entry)->phys_contiguous && !entry->superpage_size)) { extended->share_mode = SM_EMPTY; extended->ref_count = 0; return; } if (entry->superpage_size) { extended->shadow_depth = 0; extended->share_mode = SM_LARGE_PAGE; extended->ref_count = 1; extended->external_pager = 0; /* TODO4K: Superpage in 4k mode? 
*/ extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT); extended->shadow_depth = 0; return; } effective_page_shift = vm_self_region_page_shift(map); effective_page_size = (1 << effective_page_shift); effective_page_mask = effective_page_size - 1; offset = vm_map_trunc_page(offset, effective_page_mask); obj = VME_OBJECT(entry); vm_object_lock(obj); if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 && obj->paging_in_progress) { ref_count--; } if (look_for_pages) { for (last_offset = offset + range; offset < last_offset; offset += effective_page_size, va += effective_page_size) { if (do_region_footprint) { int disp; disp = 0; if (map->has_corpse_footprint) { /* * Query the page info data we saved * while forking the corpse. */ vm_map_corpse_footprint_query_page_info( map, va, &disp); } else { /* * Query the pmap. */ vm_map_footprint_query_page_info( map, entry, va, &disp); } if (disp & VM_PAGE_QUERY_PAGE_PRESENT) { extended->pages_resident++; } if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) { extended->pages_reusable++; } if (disp & VM_PAGE_QUERY_PAGE_DIRTY) { extended->pages_dirtied++; } if (disp & PMAP_QUERY_PAGE_COMPRESSED) { extended->pages_swapped_out++; } continue; } vm_map_region_look_for_page(map, va, obj, vm_object_trunc_page(offset), ref_count, 0, extended, count); } if (do_region_footprint) { goto collect_object_info; } } else { collect_object_info: shadow_object = obj->shadow; shadow_depth = 0; if (!(obj->internal)) { extended->external_pager = 1; } if (shadow_object != VM_OBJECT_NULL) { vm_object_lock(shadow_object); for (; shadow_object != VM_OBJECT_NULL; shadow_depth++) { vm_object_t next_shadow; if (!(shadow_object->internal)) { extended->external_pager = 1; } next_shadow = shadow_object->shadow; if (next_shadow) { vm_object_lock(next_shadow); } vm_object_unlock(shadow_object); shadow_object = next_shadow; } } extended->shadow_depth = shadow_depth; } if (extended->shadow_depth || entry->needs_copy) { extended->share_mode = SM_COW; } else { if (ref_count == 1) { extended->share_mode = SM_PRIVATE; } else { if (obj->true_share) { extended->share_mode = SM_TRUESHARED; } else { extended->share_mode = SM_SHARED; } } } extended->ref_count = ref_count - extended->shadow_depth; for (i = 0; i < extended->shadow_depth; i++) { if ((tmp_obj = obj->shadow) == 0) { break; } vm_object_lock(tmp_obj); vm_object_unlock(obj); if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 && tmp_obj->paging_in_progress) { ref_count--; } extended->ref_count += ref_count; obj = tmp_obj; } vm_object_unlock(obj); if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) { extended->share_mode = SM_PRIVATE; } else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) { vm_map_entry_t cur; vm_map_entry_t last; int my_refs; obj = VME_OBJECT(entry); last = vm_map_to_entry(map); my_refs = 0; if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 && obj->paging_in_progress) { ref_count--; } for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) { if (vm_map_region_has_obj_ref(cur, obj)) { my_refs++; } } if (my_refs == ref_count) { extended->share_mode = SM_PRIVATE_ALIASED; } else if (my_refs > 1) { extended->share_mode = SM_SHARED_ALIASED; } } } /* object is locked on entry and locked on return */ static void vm_map_region_look_for_page( __unused vm_map_t map, __unused vm_map_offset_t va, vm_object_t object, vm_object_offset_t offset, int max_refcnt, unsigned short depth, 
vm_region_extended_info_t extended, mach_msg_type_number_t count) { vm_page_t p; vm_object_t shadow; int ref_count; vm_object_t caller_object; shadow = object->shadow; caller_object = object; while (TRUE) { if (!(object->internal)) { extended->external_pager = 1; } if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { if (shadow && (max_refcnt == 1)) { extended->pages_shared_now_private++; } if (!p->vmp_fictitious && (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { extended->pages_dirtied++; } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) { if (p->vmp_reusable || object->all_reusable) { extended->pages_reusable++; } } extended->pages_resident++; if (object != caller_object) { vm_object_unlock(object); } return; } if (object->internal && object->alive && !object->terminating && object->pager_ready) { if (vm_object_compressor_pager_state_get(object, offset) == VM_EXTERNAL_STATE_EXISTS) { /* the pager has that page */ extended->pages_swapped_out++; if (object != caller_object) { vm_object_unlock(object); } return; } } if (shadow) { vm_object_lock(shadow); if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 && shadow->paging_in_progress) { ref_count--; } if (++depth > extended->shadow_depth) { extended->shadow_depth = depth; } if (ref_count > max_refcnt) { max_refcnt = ref_count; } if (object != caller_object) { vm_object_unlock(object); } offset = offset + object->vo_shadow_offset; object = shadow; shadow = object->shadow; continue; } if (object != caller_object) { vm_object_unlock(object); } break; } } static inline boolean_t vm_map_region_has_obj_ref( vm_map_entry_t entry, vm_object_t object) { vm_object_t cur_obj; vm_object_t shadow_obj; if (entry->is_sub_map) { return FALSE; } cur_obj = VME_OBJECT(entry); if (cur_obj == VM_OBJECT_NULL) { return FALSE; } else if (cur_obj == object) { return TRUE; } /* * Avoid locks for first shadow check, otherwise diagnostic tools will * spend most of their time obtaining locks in this function when analyzing * processes with many VM entries which may commonly have no shadow chain. * * This is acceptable because: * - Shadow's fields are not accessed outside of its lock * - Objects are unlikely to be modified due to: * - Many diagnostic tools suspend the task * - VM map is locked * - The rare incorrect return from this function turns a guess into a * slightly worse guess * - Entire shadow chain is not locked as a whole, so can still change * while traversing, resulting in incorrect guess even with locking */ shadow_obj = cur_obj->shadow; if (shadow_obj == VM_OBJECT_NULL) { return FALSE; } else if (shadow_obj == object) { return TRUE; } vm_object_lock(cur_obj); while ((shadow_obj = cur_obj->shadow)) { /* check if object was found before grabbing a lock */ if (shadow_obj == object) { vm_object_unlock(cur_obj); return TRUE; } vm_object_lock(shadow_obj); vm_object_unlock(cur_obj); cur_obj = shadow_obj; } /* exhausted the shadow chain */ vm_object_unlock(cur_obj); return FALSE; } /* * Routine: vm_map_simplify * * Description: * Attempt to simplify the map representation in * the vicinity of the given starting address. * Note: * This routine is intended primarily to keep the * kernel maps more compact -- they generally don't * benefit from the "expand a map entry" technology * at allocation time because the adjacent entry * is often wired down. 
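*
* A minimal illustration (hypothetical addresses and offsets):
* two adjacent entries backed by the same object,
*	[0x1000, 0x2000) at offset 0 and
*	[0x2000, 0x3000) at offset 0x1000,
* with identical attributes coalesce into the single entry
*	[0x1000, 0x3000) at offset 0.
* vm_map_simplify_entry() performs exactly this merge, but only
* when every attribute it compares below matches.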
*/ void vm_map_simplify_entry( vm_map_t map, vm_map_entry_t this_entry) { vm_map_entry_t prev_entry; prev_entry = this_entry->vme_prev; if ((this_entry != vm_map_to_entry(map)) && (prev_entry != vm_map_to_entry(map)) && (prev_entry->vme_end == this_entry->vme_start) && (prev_entry->is_sub_map == this_entry->is_sub_map) && (prev_entry->vme_object_value == this_entry->vme_object_value) && (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) && ((VME_OFFSET(prev_entry) + (prev_entry->vme_end - prev_entry->vme_start)) == VME_OFFSET(this_entry)) && (prev_entry->behavior == this_entry->behavior) && (prev_entry->needs_copy == this_entry->needs_copy) && (prev_entry->protection == this_entry->protection) && (prev_entry->max_protection == this_entry->max_protection) && (prev_entry->inheritance == this_entry->inheritance) && (prev_entry->use_pmap == this_entry->use_pmap) && (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) && (prev_entry->no_cache == this_entry->no_cache) && (prev_entry->vme_permanent == this_entry->vme_permanent) && (prev_entry->map_aligned == this_entry->map_aligned) && (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) && (prev_entry->used_for_jit == this_entry->used_for_jit) && #if __arm64e__ (prev_entry->used_for_tpro == this_entry->used_for_tpro) && #endif (prev_entry->csm_associated == this_entry->csm_associated) && (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) && (prev_entry->iokit_acct == this_entry->iokit_acct) && (prev_entry->vme_resilient_codesign == this_entry->vme_resilient_codesign) && (prev_entry->vme_resilient_media == this_entry->vme_resilient_media) && (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) && (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) && (prev_entry->wired_count == this_entry->wired_count) && (prev_entry->user_wired_count == this_entry->user_wired_count) && ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) && (prev_entry->in_transition == FALSE) && (this_entry->in_transition == FALSE) && (prev_entry->needs_wakeup == FALSE) && (this_entry->needs_wakeup == FALSE) && (prev_entry->is_shared == this_entry->is_shared) && (prev_entry->superpage_size == FALSE) && (this_entry->superpage_size == FALSE) ) { if (prev_entry->vme_permanent) { assert(this_entry->vme_permanent); prev_entry->vme_permanent = false; } vm_map_store_entry_unlink(map, prev_entry, true); assert(prev_entry->vme_start < this_entry->vme_end); if (prev_entry->map_aligned) { assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start, VM_MAP_PAGE_MASK(map))); } this_entry->vme_start = prev_entry->vme_start; VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry)); if (map->holelistenabled) { vm_map_store_update_first_free(map, this_entry, TRUE); } if (prev_entry->is_sub_map) { vm_map_deallocate(VME_SUBMAP(prev_entry)); } else { vm_object_deallocate(VME_OBJECT(prev_entry)); } vm_map_entry_dispose(prev_entry); SAVE_HINT_MAP_WRITE(map, this_entry); } } void vm_map_simplify( vm_map_t map, vm_map_offset_t start) { vm_map_entry_t this_entry; vm_map_lock(map); if (vm_map_lookup_entry(map, start, &this_entry)) { vm_map_simplify_entry(map, this_entry); vm_map_simplify_entry(map, this_entry->vme_next); } vm_map_unlock(map); } static void vm_map_simplify_range( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { vm_map_entry_t entry; /* * The map should be locked (for "write") by the caller. 
*/ if (start >= end) { /* invalid address range */ return; } start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)); end = vm_map_round_page(end, VM_MAP_PAGE_MASK(map)); if (!vm_map_lookup_entry(map, start, &entry)) { /* "start" is not mapped and "entry" ends before "start" */ if (entry == vm_map_to_entry(map)) { /* start with first entry in the map */ entry = vm_map_first_entry(map); } else { /* start with next entry */ entry = entry->vme_next; } } while (entry != vm_map_to_entry(map) && entry->vme_start <= end) { /* try and coalesce "entry" with its previous entry */ vm_map_simplify_entry(map, entry); entry = entry->vme_next; } } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_machine_attribute_sanitize( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, mach_vm_offset_t *start, mach_vm_offset_t *end, vm_map_size_t *size) { return vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, size); } /* * Routine: vm_map_machine_attribute * Purpose: * Provide machine-specific attributes to mappings, * such as cachability etc. for machines that provide * them. NUMA architectures and machines with big/strange * caches will use this. * Note: * Responsibilities for locking and checking are handled here, * everything else in the pmap module. If any non-volatile * information must be kept, the pmap module should handle * it itself. [This assumes that attributes do not * need to be inherited, which seems ok to me] */ kern_return_t vm_map_machine_attribute( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_machine_attribute_t attribute, vm_machine_attribute_val_t *value) /* IN/OUT */ { mach_vm_offset_t start, end; vm_map_size_t sync_size; kern_return_t ret; vm_map_entry_t entry; ret = vm_map_machine_attribute_sanitize(map, start_u, end_u, &start, &end, &sync_size); if (__improbable(ret != KERN_SUCCESS)) { return vm_sanitize_get_kr(ret); } if (start < vm_map_min(map) || end > vm_map_max(map)) { return KERN_INVALID_ADDRESS; } vm_map_lock(map); if (attribute != MATTR_CACHE) { /* If we don't have to find physical addresses, we */ /* don't have to do an explicit traversal here. 
*/ ret = pmap_attribute(map->pmap, start, end - start, attribute, value); vm_map_unlock(map); return ret; } ret = KERN_SUCCESS; /* Assume it all worked */ while (sync_size) { if (vm_map_lookup_entry(map, start, &entry)) { vm_map_size_t sub_size; if ((entry->vme_end - start) > sync_size) { sub_size = sync_size; sync_size = 0; } else { sub_size = entry->vme_end - start; sync_size -= sub_size; } if (entry->is_sub_map) { vm_map_offset_t sub_start; vm_map_offset_t sub_end; sub_start = (start - entry->vme_start) + VME_OFFSET(entry); sub_end = sub_start + sub_size; vm_map_machine_attribute( VME_SUBMAP(entry), sub_start, sub_end, attribute, value); } else if (VME_OBJECT(entry)) { vm_page_t m; vm_object_t object; vm_object_t base_object; vm_object_t last_object; vm_object_offset_t offset; vm_object_offset_t base_offset; vm_map_size_t range; range = sub_size; offset = (start - entry->vme_start) + VME_OFFSET(entry); offset = vm_object_trunc_page(offset); base_offset = offset; object = VME_OBJECT(entry); base_object = object; last_object = NULL; vm_object_lock(object); while (range) { m = vm_page_lookup( object, offset); if (m && !m->vmp_fictitious) { ret = pmap_attribute_cache_sync( VM_PAGE_GET_PHYS_PAGE(m), PAGE_SIZE, attribute, value); } else if (object->shadow) { offset = offset + object->vo_shadow_offset; last_object = object; object = object->shadow; vm_object_lock(last_object->shadow); vm_object_unlock(last_object); continue; } if (range < PAGE_SIZE) { range = 0; } else { range -= PAGE_SIZE; } if (base_object != object) { vm_object_unlock(object); vm_object_lock(base_object); object = base_object; } /* Bump to the next page */ base_offset += PAGE_SIZE; offset = base_offset; } vm_object_unlock(object); } start += sub_size; } else { vm_map_unlock(map); return KERN_FAILURE; } } vm_map_unlock(map); return ret; } /* * vm_map_behavior_set: * * Sets the paging reference behavior of the specified address * range in the target map. Paging reference behavior affects * how pagein operations resulting from faults on the map will be * clustered. */ kern_return_t vm_map_behavior_set( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end, vm_behavior_t new_behavior) { vm_map_entry_t entry; vm_map_entry_t temp_entry; if (start > end || start < vm_map_min(map) || end > vm_map_max(map)) { return KERN_NO_SPACE; } if (__improbable(vm_map_range_overflows(map, start, end - start))) { return KERN_INVALID_ADDRESS; } switch (new_behavior) { /* * This first block of behaviors all set a persistent state on the specified * memory range. All we have to do here is to record the desired behavior * in the vm_map_entry_t's. */ case VM_BEHAVIOR_DEFAULT: case VM_BEHAVIOR_RANDOM: case VM_BEHAVIOR_SEQUENTIAL: case VM_BEHAVIOR_RSEQNTL: case VM_BEHAVIOR_ZERO_WIRED_PAGES: vm_map_lock(map); /* * The entire address range must be valid for the map. * Note that vm_map_range_check() does a * vm_map_lookup_entry() internally and returns the * entry containing the start of the address range if * the entire range is valid. 
*/ if (vm_map_range_check(map, start, end, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else { vm_map_unlock(map); return KERN_INVALID_ADDRESS; } while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { vm_map_clip_end(map, entry, end); if (entry->is_sub_map) { assert(!entry->use_pmap); } if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) { entry->zero_wired_pages = TRUE; } else { entry->behavior = new_behavior; } entry = entry->vme_next; } vm_map_unlock(map); break; /* * The rest of these are different from the above in that they cause * an immediate action to take place as opposed to setting a behavior that * affects future actions. */ case VM_BEHAVIOR_WILLNEED: return vm_map_willneed(map, start, end); case VM_BEHAVIOR_DONTNEED: return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS); case VM_BEHAVIOR_FREE: return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS); case VM_BEHAVIOR_REUSABLE: return vm_map_reusable_pages(map, start, end); case VM_BEHAVIOR_REUSE: return vm_map_reuse_pages(map, start, end); case VM_BEHAVIOR_CAN_REUSE: return vm_map_can_reuse(map, start, end); #if MACH_ASSERT case VM_BEHAVIOR_PAGEOUT: return vm_map_pageout(map, start, end); #endif /* MACH_ASSERT */ case VM_BEHAVIOR_ZERO: return vm_map_zero(map, start, end); default: return KERN_INVALID_ARGUMENT; } return KERN_SUCCESS; } /* * Internals for madvise(MADV_WILLNEED) system call. * * The implementation does one of the following: * a) read-ahead if the mapping corresponds to a mapped regular file, or * b) fault in the pages (zero-fill, decompress etc.) if it's an anonymous mapping */ static kern_return_t vm_map_willneed( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end ) { vm_map_entry_t entry; vm_object_t object; memory_object_t pager; struct vm_object_fault_info fault_info = {}; kern_return_t kr; vm_object_size_t len; vm_object_offset_t offset; KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START, task_pid(current_task()), start, end); fault_info.interruptible = THREAD_UNINT; /* ignored value */ fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; fault_info.stealth = TRUE; /* * The MADV_WILLNEED operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. */ vm_map_lock_read(map); /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return * an error. */ if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END, task_pid(current_task()), start, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } /* * Examine each vm_map_entry_t in the range. */ for (; entry != vm_map_to_entry(map) && start < end;) { /* * The first time through, the start address could be anywhere * within the vm_map_entry we found. So adjust the offset to * correspond. After that, the offset will always be zero to * correspond to the beginning of the current vm_map_entry. */ offset = (start - entry->vme_start) + VME_OFFSET(entry); /* * Set the length so we don't go beyond the end of the * map_entry or beyond the end of the range we were given. * This range could also span multiple map entries all of which * map different files, so make sure we only do the right amount * of I/O for each object. Note that it's possible for there * to be multiple map entries all referring to the same object * but with different page permissions, but it's not worth * trying to optimize that case.
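* For example (illustrative numbers): with an entry spanning
* [0x100000, 0x600000), start == 0x180000 and end == 0x300000 give
* len = MIN(0x600000 - 0x180000, 0x300000 - 0x180000) = 0x180000.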
*/ len = MIN(entry->vme_end - start, end - start); if ((vm_size_t) len != len) { /* 32-bit overflow */ len = (vm_size_t) (0 - PAGE_SIZE); } fault_info.cluster_size = (vm_size_t) len; fault_info.lo_offset = offset; fault_info.hi_offset = offset + len; fault_info.user_tag = VME_ALIAS(entry); fault_info.pmap_options = 0; if (entry->iokit_acct || (!entry->is_sub_map && !entry->use_pmap)) { fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; } fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug; /* * If the entry is a submap OR there's no read permission * to this mapping, then just skip it. */ if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) { entry = entry->vme_next; start = entry->vme_start; continue; } object = VME_OBJECT(entry); if (object == NULL || (object && object->internal)) { /* * Memory range backed by anonymous memory. */ vm_size_t region_size = 0, effective_page_size = 0; vm_map_offset_t addr = 0, effective_page_mask = 0; region_size = len; addr = start; effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK); effective_page_size = effective_page_mask + 1; vm_map_unlock_read(map); while (region_size) { vm_pre_fault( vm_map_trunc_page(addr, effective_page_mask), VM_PROT_READ | VM_PROT_WRITE); region_size -= effective_page_size; addr += effective_page_size; } } else { /* * Find the file object backing this map entry. If there is * none, then we simply ignore the "will need" advice for this * entry and go on to the next one. */ if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) { entry = entry->vme_next; start = entry->vme_start; continue; } vm_object_paging_begin(object); pager = object->pager; vm_object_unlock(object); /* * The data_request() could take a long time, so let's * release the map lock to avoid blocking other threads. */ vm_map_unlock_read(map); /* * Get the data from the object asynchronously. * * Note that memory_object_data_request() places limits on the * amount of I/O it will do. Regardless of the len we * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it * silently truncates the len to that size. This isn't * necessarily bad since madvise shouldn't really be used to * page in unlimited amounts of data. Other Unix variants * limit the willneed case as well. If this turns out to be an * issue for developers, then we can always adjust the policy * here and still be backwards compatible since this is all * just "advice". */ kr = memory_object_data_request( pager, vm_object_trunc_page(offset) + object->paging_offset, 0, /* ignored */ VM_PROT_READ, (memory_object_fault_info_t)&fault_info); vm_object_lock(object); vm_object_paging_end(object); vm_object_unlock(object); /* * If we couldn't do the I/O for some reason, just give up on * the madvise. We still return success to the user since * madvise isn't supposed to fail when the advice can't be * taken. */ if (kr != KERN_SUCCESS) { KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END, task_pid(current_task()), start, kr); return KERN_SUCCESS; } } start += len; if (start >= end) { /* done */ KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END, task_pid(current_task()), start, KERN_SUCCESS); return KERN_SUCCESS; } /* look up next entry */ vm_map_lock_read(map); if (!vm_map_lookup_entry(map, start, &entry)) { /* * There's a new hole in the address range. 
*/ vm_map_unlock_read(map); KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END, task_pid(current_task()), start, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } vm_map_unlock_read(map); KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END, task_pid(current_task()), start, KERN_SUCCESS); return KERN_SUCCESS; } static boolean_t vm_map_entry_is_reusable( vm_map_entry_t entry) { /* Only user map entries */ vm_object_t object; if (entry->is_sub_map) { return FALSE; } switch (VME_ALIAS(entry)) { case VM_MEMORY_MALLOC: case VM_MEMORY_MALLOC_SMALL: case VM_MEMORY_MALLOC_LARGE: case VM_MEMORY_REALLOC: case VM_MEMORY_MALLOC_TINY: case VM_MEMORY_MALLOC_LARGE_REUSABLE: case VM_MEMORY_MALLOC_LARGE_REUSED: /* * This is a malloc() memory region: check if it's still * in its original state and can be re-used for more * malloc() allocations. */ break; default: /* * Not a malloc() memory region: let the caller decide if * it's re-usable. */ return TRUE; } if (/*entry->is_shared ||*/ entry->is_sub_map || entry->in_transition || entry->protection != VM_PROT_DEFAULT || entry->max_protection != VM_PROT_ALL || entry->inheritance != VM_INHERIT_DEFAULT || entry->no_cache || entry->vme_permanent || entry->superpage_size != FALSE || entry->zero_wired_pages || entry->wired_count != 0 || entry->user_wired_count != 0) { return FALSE; } object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { return TRUE; } if ( #if 0 /* * Let's proceed even if the VM object is potentially * shared. * We check for this later when processing the actual * VM pages, so the contents will be safe if shared. * * But we can still mark this memory region as "reusable" to * acknowledge that the caller did let us know that the memory * could be re-used and should not be penalized for holding * on to it. This allows its "resident size" to not include * the reusable range. */ object->ref_count == 1 && #endif object->vo_copy == VM_OBJECT_NULL && object->shadow == VM_OBJECT_NULL && object->internal && object->purgable == VM_PURGABLE_DENY && object->wimg_bits == VM_WIMG_USE_DEFAULT && !object->code_signed) { return TRUE; } return FALSE; } static kern_return_t vm_map_reuse_pages( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { vm_map_entry_t entry; vm_object_t object; vm_object_offset_t start_offset, end_offset; /* * The MADV_REUSE operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. */ if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { /* * XXX TODO4K * need to figure out what reusable means for a * portion of a native page. */ return KERN_SUCCESS; } vm_map_lock_read(map); assert(map->pmap != kernel_pmap); /* protect alias access */ /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return * an error. */ if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reuse_pages_failure++; return KERN_INVALID_ADDRESS; } /* * Examine each vm_map_entry_t in the range. */ for (; entry != vm_map_to_entry(map) && entry->vme_start < end; entry = entry->vme_next) { /* * Sanity check on the VM map entry. */ if (!vm_map_entry_is_reusable(entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reuse_pages_failure++; return KERN_INVALID_ADDRESS; } /* * The first time through, the start address could be anywhere * within the vm_map_entry we found. So adjust the offset to * correspond. 
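 *
 * Illustrative numbers: with entry->vme_start = 0x200000,
 * VME_OFFSET(entry) = 0x8000 and start = 0x203000, the code below
 * computes start_offset = (0x203000 - 0x200000) + 0x8000 = 0xb000,
 * i.e. the offset of "start" within the backing VM object.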
*/ if (entry->vme_start < start) { start_offset = start - entry->vme_start; } else { start_offset = 0; } end_offset = MIN(end, entry->vme_end) - entry->vme_start; start_offset += VME_OFFSET(entry); end_offset += VME_OFFSET(entry); object = VME_OBJECT(entry); if (object != VM_OBJECT_NULL) { vm_object_lock(object); vm_object_reuse_pages(object, start_offset, end_offset, TRUE); vm_object_unlock(object); } if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) { /* * XXX * We do not hold the VM map exclusively here. * The "alias" field is not that critical, so it's * safe to update it here, as long as it is the only * one that can be modified while holding the VM map * "shared". */ VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED); } } vm_map_unlock_read(map); vm_page_stats_reusable.reuse_pages_success++; return KERN_SUCCESS; } static kern_return_t vm_map_reusable_pages( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { vm_map_entry_t entry; vm_object_t object; vm_object_offset_t start_offset, end_offset; vm_map_offset_t pmap_offset; if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { /* * XXX TODO4K * need to figure out what reusable means for a portion * of a native page. */ return KERN_SUCCESS; } /* * The MADV_REUSABLE operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. */ vm_map_lock_read(map); assert(map->pmap != kernel_pmap); /* protect alias access */ /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return * an error. */ if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reusable_pages_failure++; return KERN_INVALID_ADDRESS; } /* * Examine each vm_map_entry_t in the range. */ for (; entry != vm_map_to_entry(map) && entry->vme_start < end; entry = entry->vme_next) { int kill_pages = 0; boolean_t reusable_no_write = FALSE; /* * Sanity check on the VM map entry. */ if (!vm_map_entry_is_reusable(entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reusable_pages_failure++; return KERN_INVALID_ADDRESS; } if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit #if __arm64e__ && !entry->used_for_tpro #endif ) { /* not writable: can't discard contents */ vm_map_unlock_read(map); vm_page_stats_reusable.reusable_nonwritable++; vm_page_stats_reusable.reusable_pages_failure++; return KERN_PROTECTION_FAILURE; } /* * The first time through, the start address could be anywhere * within the vm_map_entry we found. So adjust the offset to * correspond. */ if (entry->vme_start < start) { start_offset = start - entry->vme_start; pmap_offset = start; } else { start_offset = 0; pmap_offset = entry->vme_start; } end_offset = MIN(end, entry->vme_end) - entry->vme_start; start_offset += VME_OFFSET(entry); end_offset += VME_OFFSET(entry); object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { continue; } if (entry->protection & VM_PROT_EXECUTE) { /* * Executable mappings might be write-protected by * hardware, so do not attempt to write to these pages. */ reusable_no_write = TRUE; } if (entry->vme_xnu_user_debug) { /* * User debug pages might be write-protected by hardware, * so do not attempt to write to these pages. 
*/ reusable_no_write = TRUE; } vm_object_lock(object); if (((os_ref_get_count_raw(&object->ref_count) == 1) || (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC && object->vo_copy == VM_OBJECT_NULL)) && object->shadow == VM_OBJECT_NULL && /* * "iokit_acct" entries are billed for their virtual size * (rather than for their resident pages only), so they * wouldn't benefit from making pages reusable, and it * would be hard to keep track of pages that are both * "iokit_acct" and "reusable" in the pmap stats and * ledgers. */ !(entry->iokit_acct || (!entry->is_sub_map && !entry->use_pmap))) { if (os_ref_get_count_raw(&object->ref_count) != 1) { vm_page_stats_reusable.reusable_shared++; } kill_pages = 1; } else { kill_pages = -1; } if (kill_pages != -1) { vm_object_deactivate_pages(object, start_offset, end_offset - start_offset, kill_pages, TRUE /*reusable_pages*/, reusable_no_write, map->pmap, pmap_offset); } else { vm_page_stats_reusable.reusable_pages_shared++; DTRACE_VM4(vm_map_reusable_pages_shared, unsigned int, VME_ALIAS(entry), vm_map_t, map, vm_map_entry_t, entry, vm_object_t, object); } vm_object_unlock(object); if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE || VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) { /* * XXX * We do not hold the VM map exclusively here. * The "alias" field is not that critical, so it's * safe to update it here, as long as it is the only * one that can be modified while holding the VM map * "shared". */ VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE); } } vm_map_unlock_read(map); vm_page_stats_reusable.reusable_pages_success++; return KERN_SUCCESS; } static kern_return_t vm_map_can_reuse( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { vm_map_entry_t entry; /* * The MADV_REUSABLE operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. */ vm_map_lock_read(map); assert(map->pmap != kernel_pmap); /* protect alias access */ /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return * an error. */ if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.can_reuse_failure++; return KERN_INVALID_ADDRESS; } /* * Examine each vm_map_entry_t in the range. */ for (; entry != vm_map_to_entry(map) && entry->vme_start < end; entry = entry->vme_next) { /* * Sanity check on the VM map entry. */ if (!vm_map_entry_is_reusable(entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.can_reuse_failure++; return KERN_INVALID_ADDRESS; } } vm_map_unlock_read(map); vm_page_stats_reusable.can_reuse_success++; return KERN_SUCCESS; } #if MACH_ASSERT static kern_return_t vm_map_pageout( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { vm_map_entry_t entry; /* * The MADV_PAGEOUT operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. */ vm_map_lock_read(map); /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return * an error. */ if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } /* * Examine each vm_map_entry_t in the range. */ for (; entry != vm_map_to_entry(map) && entry->vme_start < end; entry = entry->vme_next) { vm_object_t object; /* * Sanity check on the VM map entry. 
*/ if (entry->is_sub_map) { vm_map_t submap; vm_map_offset_t submap_start; vm_map_offset_t submap_end; vm_map_entry_t submap_entry; submap = VME_SUBMAP(entry); submap_start = VME_OFFSET(entry); submap_end = submap_start + (entry->vme_end - entry->vme_start); vm_map_lock_read(submap); if (!vm_map_range_check(submap, submap_start, submap_end, &submap_entry)) { vm_map_unlock_read(submap); vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } if (submap_entry->is_sub_map) { vm_map_unlock_read(submap); continue; } object = VME_OBJECT(submap_entry); if (object == VM_OBJECT_NULL || !object->internal) { vm_map_unlock_read(submap); continue; } vm_object_pageout(object); vm_map_unlock_read(submap); submap = VM_MAP_NULL; submap_entry = VM_MAP_ENTRY_NULL; continue; } object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL || !object->internal) { continue; } vm_object_pageout(object); } vm_map_unlock_read(map); return KERN_SUCCESS; } #endif /* MACH_ASSERT */ /* * This function determines if the zero operation can be run on the * respective entry. Additional checks on the object are in * vm_object_zero_preflight. */ static kern_return_t vm_map_zero_entry_preflight(vm_map_entry_t entry) { /* * Zeroing is restricted to writable non-executable entries and non-JIT * regions. */ if (!(entry->protection & VM_PROT_WRITE) || (entry->protection & VM_PROT_EXECUTE) || entry->used_for_jit || entry->vme_xnu_user_debug) { return KERN_PROTECTION_FAILURE; } /* * Zeroing for copy on write isn't yet supported. Zeroing is also not * allowed for submaps. */ if (entry->needs_copy || entry->is_sub_map) { return KERN_NO_ACCESS; } return KERN_SUCCESS; } /* * This function translates entry's start and end to offsets in the object */ static void vm_map_get_bounds_in_object( vm_map_entry_t entry, vm_map_offset_t start, vm_map_offset_t end, vm_map_offset_t *start_offset, vm_map_offset_t *end_offset) { if (entry->vme_start < start) { *start_offset = start - entry->vme_start; } else { *start_offset = 0; } *end_offset = MIN(end, entry->vme_end) - entry->vme_start; *start_offset += VME_OFFSET(entry); *end_offset += VME_OFFSET(entry); } /* * This function iterates through the entries in the requested range * and zeroes any resident pages in the corresponding objects. Compressed * pages are dropped instead of being faulted in and zeroed. */ static kern_return_t vm_map_zero( vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { vm_map_entry_t entry; vm_map_offset_t cur = start; kern_return_t ret; /* * This operation isn't supported where the map page size is less than * the hardware page size. Caller will need to handle error and * explicitly zero memory if needed. */ if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { return KERN_NO_ACCESS; } /* * The MADV_ZERO operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. */ vm_map_lock_read(map); assert(map->pmap != kernel_pmap); /* protect alias access */ /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return * an error. This check needs to be redone if the map has changed. */ if (!vm_map_range_check(map, cur, end, &entry)) { vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } /* * Examine each vm_map_entry_t in the range. 
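 *
 * The loop below drops the map lock around the actual zeroing and
 * uses the map timestamp to detect concurrent changes; an informal
 * sketch of the pattern used by the code that follows:
 *
 *     last_timestamp = map->timestamp;
 *     ... vm_object_zero() with only an object reference held ...
 *     vm_map_lock_read(map);
 *     if (last_timestamp != map->timestamp)
 *             re-validate [cur, end) with vm_map_range_check()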
*/ while (entry != vm_map_to_entry(map) && entry->vme_start < end) { vm_map_offset_t cur_offset; vm_map_offset_t end_offset; unsigned int last_timestamp = map->timestamp; vm_object_t object = VME_OBJECT(entry); ret = vm_map_zero_entry_preflight(entry); if (ret != KERN_SUCCESS) { vm_map_unlock_read(map); return ret; } if (object == VM_OBJECT_NULL) { entry = entry->vme_next; continue; } vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset); vm_object_lock(object); /* * Take a reference on the object as vm_object_zero will drop the object * lock when it encounters a busy page. */ vm_object_reference_locked(object); vm_map_unlock_read(map); ret = vm_object_zero(object, cur_offset, end_offset); vm_object_unlock(object); vm_object_deallocate(object); if (ret != KERN_SUCCESS) { return ret; } /* * Update cur as vm_object_zero has succeeded. */ cur += (end_offset - cur_offset); if (cur == end) { return KERN_SUCCESS; } /* * If the map timestamp has changed, restart by relooking up cur in the * map */ vm_map_lock_read(map); if (last_timestamp != map->timestamp) { /* * Relookup cur in the map */ if (!vm_map_range_check(map, cur, end, &entry)) { vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } continue; } /* * If the map hasn't changed proceed with the next entry */ entry = entry->vme_next; } vm_map_unlock_read(map); return KERN_SUCCESS; } /* * Routine: vm_map_entry_insert * * Description: This routine inserts a new vm_entry in a locked map. */ static vm_map_entry_t vm_map_entry_insert( vm_map_t map, vm_map_entry_t insp_entry, vm_map_offset_t start, vm_map_offset_t end, vm_object_t object, vm_object_offset_t offset, vm_map_kernel_flags_t vmk_flags, boolean_t needs_copy, vm_prot_t cur_protection, vm_prot_t max_protection, vm_inherit_t inheritance, boolean_t clear_map_aligned) { vm_map_entry_t new_entry; boolean_t map_aligned = FALSE; assert(insp_entry != (vm_map_entry_t)0); vm_map_lock_assert_exclusive(map); __assert_only vm_object_offset_t end_offset = 0; assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset); if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) { map_aligned = TRUE; } if (clear_map_aligned && (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) || !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) { map_aligned = FALSE; } if (map_aligned) { assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); } else { assert(page_aligned(start)); assert(page_aligned(end)); } assert(start < end); new_entry = vm_map_entry_create(map); new_entry->vme_start = start; new_entry->vme_end = end; if (vmk_flags.vmkf_submap) { new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic; VME_SUBMAP_SET(new_entry, (vm_map_t)object); } else { VME_OBJECT_SET(new_entry, object, false, 0); } VME_OFFSET_SET(new_entry, offset); VME_ALIAS_SET(new_entry, vmk_flags.vm_tag); new_entry->map_aligned = map_aligned; new_entry->needs_copy = needs_copy; new_entry->inheritance = inheritance; new_entry->protection = cur_protection; new_entry->max_protection = max_protection; /* * submap: "use_pmap" means "nested". * default: false. * * object: "use_pmap" means "use pmap accounting" for footprint. * default: true. 
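 *
 * Summarized (illustrative), matching the assignment below:
 *     vmkf_submap set   -> use_pmap = FALSE (submap not nested)
 *     vmkf_submap clear -> use_pmap = TRUE  (pmap-accounted footprint)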
*/ new_entry->use_pmap = !vmk_flags.vmkf_submap; new_entry->no_cache = vmk_flags.vmf_no_cache; new_entry->vme_permanent = vmk_flags.vmf_permanent; new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute; new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read; new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0); if (vmk_flags.vmkf_map_jit) { if (!(map->jit_entry_exists) || VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) { new_entry->used_for_jit = TRUE; map->jit_entry_exists = TRUE; } } /* * Insert the new entry into the list. */ vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags); map->size += end - start; /* * Update the free space hint and the lookup hint. */ SAVE_HINT_MAP_WRITE(map, new_entry); return new_entry; } /* * Routine: vm_map_remap_extract * * Description: This routine returns a vm_entry list from a map. */ static kern_return_t vm_map_remap_extract( vm_map_t map, vm_map_offset_t addr, vm_map_size_t size, boolean_t copy, vm_map_copy_t map_copy, vm_prot_t *cur_protection, /* IN/OUT */ vm_prot_t *max_protection, /* IN/OUT */ /* What, no behavior? */ vm_inherit_t inheritance, vm_map_kernel_flags_t vmk_flags) { struct vm_map_header *map_header = &map_copy->cpy_hdr; kern_return_t result; vm_map_size_t mapped_size; vm_map_size_t tmp_size; vm_map_entry_t src_entry; /* result of last map lookup */ vm_map_entry_t new_entry; vm_object_offset_t offset; vm_map_offset_t map_address; vm_map_offset_t src_start; /* start of entry to map */ vm_map_offset_t src_end; /* end of region to be mapped */ vm_object_t object; vm_map_version_t version; boolean_t src_needs_copy; boolean_t new_entry_needs_copy; vm_map_entry_t saved_src_entry; boolean_t src_entry_was_wired; vm_prot_t max_prot_for_prot_copy; vm_map_offset_t effective_page_mask; bool pageable, same_map; boolean_t vm_remap_legacy; vm_prot_t required_cur_prot, required_max_prot; vm_object_t new_copy_object; /* vm_object_copy_* result */ boolean_t saved_used_for_jit; /* Saved used_for_jit. */ pageable = vmk_flags.vmkf_copy_pageable; same_map = vmk_flags.vmkf_copy_same_map; effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map)); assert(map != VM_MAP_NULL); assert(size != 0); assert(size == vm_map_round_page(size, effective_page_mask)); assert(inheritance == VM_INHERIT_NONE || inheritance == VM_INHERIT_COPY || inheritance == VM_INHERIT_SHARE); assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC))); assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC))); assert((*cur_protection & *max_protection) == *cur_protection); /* * Compute start and end of region. */ src_start = vm_map_trunc_page(addr, effective_page_mask); src_end = vm_map_round_page(src_start + size, effective_page_mask); /* * Initialize map_header. */ map_header->nentries = 0; map_header->entries_pageable = pageable; // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT); map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map); map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE; vm_map_store_init(map_header); if (copy && vmk_flags.vmkf_remap_prot_copy) { /* * Special case for vm_map_protect(VM_PROT_COPY): * we want to set the new mappings' max protection to the * specified *max_protection... */ max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC); /* ... 
but we want to use the vm_remap() legacy mode */ vmk_flags.vmkf_remap_legacy_mode = true; *max_protection = VM_PROT_NONE; *cur_protection = VM_PROT_NONE; } else { max_prot_for_prot_copy = VM_PROT_NONE; } if (vmk_flags.vmkf_remap_legacy_mode) { /* * vm_remap() legacy mode: * Extract all memory regions in the specified range and * collect the strictest set of protections allowed on the * entire range, so the caller knows what they can do with * the remapped range. * We start with VM_PROT_ALL and we'll remove the protections * missing from each memory region. */ vm_remap_legacy = TRUE; *cur_protection = VM_PROT_ALL; *max_protection = VM_PROT_ALL; required_cur_prot = VM_PROT_NONE; required_max_prot = VM_PROT_NONE; } else { /* * vm_remap_new() mode: * Extract all memory regions in the specified range and * ensure that they have at least the protections specified * by the caller via *cur_protection and *max_protection. * The resulting mapping should have these protections. */ vm_remap_legacy = FALSE; if (copy) { required_cur_prot = VM_PROT_NONE; required_max_prot = VM_PROT_READ; } else { required_cur_prot = *cur_protection; required_max_prot = *max_protection; } } map_address = 0; mapped_size = 0; result = KERN_SUCCESS; /* * The specified source virtual space might correspond to * multiple map entries, need to loop on them. */ vm_map_lock(map); if (map->pmap == kernel_pmap) { map_copy->is_kernel_range = true; map_copy->orig_range = kmem_addr_get_range(addr, size); #if CONFIG_MAP_RANGES } else if (map->uses_user_ranges) { map_copy->is_user_range = true; map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL); #endif /* CONFIG_MAP_RANGES */ } if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { /* * This address space uses sub-pages so the range might * not be re-mappable in an address space with larger * pages. Re-assemble any broken-up VM map entries to * improve our chances of making it work. */ vm_map_simplify_range(map, src_start, src_end); } while (mapped_size != size) { vm_map_size_t entry_size; /* * Find the beginning of the region. */ if (!vm_map_lookup_entry(map, src_start, &src_entry)) { result = KERN_INVALID_ADDRESS; break; } if (src_start < src_entry->vme_start || (mapped_size && src_start != src_entry->vme_start)) { result = KERN_INVALID_ADDRESS; break; } tmp_size = size - mapped_size; if (src_end > src_entry->vme_end) { tmp_size -= (src_end - src_entry->vme_end); } entry_size = (vm_map_size_t)(src_entry->vme_end - src_entry->vme_start); if (src_entry->is_sub_map && vmk_flags.vmkf_copy_single_object) { vm_map_t submap; vm_map_offset_t submap_start; vm_map_size_t submap_size; boolean_t submap_needs_copy; /* * No check for "required protection" on "src_entry" * because the protections that matter are the ones * on the submap's VM map entry, which will be checked * during the call to vm_map_remap_extract() below. */ object = VM_OBJECT_NULL; submap_size = src_entry->vme_end - src_start; if (submap_size > size) { submap_size = size; } submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start; submap = VME_SUBMAP(src_entry); if (copy) { /* * The caller wants a copy-on-write re-mapping, * so let's extract from the submap accordingly. */ submap_needs_copy = TRUE; } else if (src_entry->needs_copy) { /* * The caller wants a shared re-mapping but the * submap is mapped with "needs_copy", so its * contents can't be shared as is. Extract the * contents of the submap as "copy-on-write". 
* The re-mapping won't be shared with the * original mapping but this is equivalent to * what happened with the original "remap from * submap" code. * The shared region is mapped "needs_copy", for * example. */ submap_needs_copy = TRUE; } else { /* * The caller wants a shared re-mapping and * this mapping can be shared (no "needs_copy"), * so let's extract from the submap accordingly. * Kernel submaps are mapped without * "needs_copy", for example. */ submap_needs_copy = FALSE; } vm_map_reference(submap); vm_map_unlock(map); src_entry = NULL; if (vm_remap_legacy) { *cur_protection = VM_PROT_NONE; *max_protection = VM_PROT_NONE; } DTRACE_VM7(remap_submap_recurse, vm_map_t, map, vm_map_offset_t, addr, vm_map_size_t, size, boolean_t, copy, vm_map_offset_t, submap_start, vm_map_size_t, submap_size, boolean_t, submap_needs_copy); result = vm_map_remap_extract(submap, submap_start, submap_size, submap_needs_copy, map_copy, cur_protection, max_protection, inheritance, vmk_flags); vm_map_deallocate(submap); if (result == KERN_SUCCESS && submap_needs_copy && !copy) { /* * We were asked for a "shared" * re-mapping but had to ask for a * "copy-on-write" remapping of the * submap's mapping to honor the * submap's "needs_copy". * We now need to resolve that * pending "copy-on-write" to * get something we can share. */ vm_map_entry_t copy_entry; vm_object_offset_t copy_offset; vm_map_size_t copy_size; vm_object_t copy_object; copy_entry = vm_map_copy_first_entry(map_copy); copy_size = copy_entry->vme_end - copy_entry->vme_start; copy_object = VME_OBJECT(copy_entry); copy_offset = VME_OFFSET(copy_entry); if (copy_object == VM_OBJECT_NULL) { assert(copy_offset == 0); assert(!copy_entry->needs_copy); if (copy_entry->max_protection == VM_PROT_NONE) { assert(copy_entry->protection == VM_PROT_NONE); /* nothing to share */ } else { assert(copy_offset == 0); copy_object = vm_object_allocate(copy_size); VME_OFFSET_SET(copy_entry, 0); VME_OBJECT_SET(copy_entry, copy_object, false, 0); assert(copy_entry->use_pmap); } } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { /* already shareable */ assert(!copy_entry->needs_copy); } else if (copy_entry->needs_copy || copy_object->shadowed || (copy_object->internal && !copy_object->true_share && !copy_entry->is_shared && copy_object->vo_size > copy_size)) { VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE); assert(copy_entry->use_pmap); if (copy_entry->needs_copy) { /* already write-protected */ } else { vm_prot_t prot; prot = copy_entry->protection & ~VM_PROT_WRITE; vm_object_pmap_protect(copy_object, copy_offset, copy_size, PMAP_NULL, PAGE_SIZE, 0, prot); } copy_entry->needs_copy = FALSE; } copy_object = VME_OBJECT(copy_entry); copy_offset = VME_OFFSET(copy_entry); if (copy_object && copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; copy_object->true_share = TRUE; } } return result; } if (src_entry->is_sub_map) { /* protections for submap mapping are irrelevant here */ } else if (((src_entry->protection & required_cur_prot) != required_cur_prot) || ((src_entry->max_protection & required_max_prot) != required_max_prot)) { if (vmk_flags.vmkf_copy_single_object && mapped_size != 0) { /* * Single object extraction. * We can't extract more with the required * protection but we've extracted some, so * stop there and declare success. * The caller should check the size of * the copy entry we've extracted. */ result = KERN_SUCCESS; } else { /* * VM range extraction. 
* Required protection is not available * for this part of the range: fail. */ result = KERN_PROTECTION_FAILURE; } break; } if (src_entry->is_sub_map) { vm_map_t submap; vm_map_offset_t submap_start; vm_map_size_t submap_size; vm_map_copy_t submap_copy; vm_prot_t submap_curprot, submap_maxprot; boolean_t submap_needs_copy; /* * No check for "required protection" on "src_entry" * because the protections that matter are the ones * on the submap's VM map entry, which will be checked * during the call to vm_map_copy_extract() below. */ object = VM_OBJECT_NULL; submap_copy = VM_MAP_COPY_NULL; /* find equivalent range in the submap */ submap = VME_SUBMAP(src_entry); submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start; submap_size = tmp_size; if (copy) { /* * The caller wants a copy-on-write re-mapping, * so let's extract from the submap accordingly. */ submap_needs_copy = TRUE; } else if (src_entry->needs_copy) { /* * The caller wants a shared re-mapping but the * submap is mapped with "needs_copy", so its * contents can't be shared as is. Extract the * contents of the submap as "copy-on-write". * The re-mapping won't be shared with the * original mapping but this is equivalent to * what happened with the original "remap from * submap" code. * The shared region is mapped "needs_copy", for * example. */ submap_needs_copy = TRUE; } else { /* * The caller wants a shared re-mapping and * this mapping can be shared (no "needs_copy"), * so let's extract from the submap accordingly. * Kernel submaps are mapped without * "needs_copy", for example. */ submap_needs_copy = FALSE; } /* extra ref to keep submap alive */ vm_map_reference(submap); DTRACE_VM7(remap_submap_recurse, vm_map_t, map, vm_map_offset_t, addr, vm_map_size_t, size, boolean_t, copy, vm_map_offset_t, submap_start, vm_map_size_t, submap_size, boolean_t, submap_needs_copy); /* * The map can be safely unlocked since we * already hold a reference on the submap. * * No timestamp since we don't care if the map * gets modified while we're down in the submap. * We'll resume the extraction at src_start + tmp_size * anyway. */ vm_map_unlock(map); src_entry = NULL; /* not valid once map is unlocked */ if (vm_remap_legacy) { submap_curprot = VM_PROT_NONE; submap_maxprot = VM_PROT_NONE; if (max_prot_for_prot_copy) { submap_maxprot = max_prot_for_prot_copy; } } else { assert(!max_prot_for_prot_copy); submap_curprot = *cur_protection; submap_maxprot = *max_protection; } result = vm_map_copy_extract(submap, submap_start, submap_size, submap_needs_copy, &submap_copy, &submap_curprot, &submap_maxprot, inheritance, vmk_flags); /* release extra ref on submap */ vm_map_deallocate(submap); submap = VM_MAP_NULL; if (result != KERN_SUCCESS) { vm_map_lock(map); break; } /* transfer submap_copy entries to map_header */ while (vm_map_copy_first_entry(submap_copy) != vm_map_copy_to_entry(submap_copy)) { vm_map_entry_t copy_entry; vm_map_size_t copy_entry_size; copy_entry = vm_map_copy_first_entry(submap_copy); /* * Prevent kernel_object from being exposed to * user space. */ if (__improbable(copy_entry->vme_kernel_object)) { printf("%d[%s]: rejecting attempt to extract from kernel_object\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? 
proc_name_address(get_bsdtask_info(current_task())) : "?")); DTRACE_VM(extract_kernel_only); result = KERN_INVALID_RIGHT; vm_map_copy_discard(submap_copy); submap_copy = VM_MAP_COPY_NULL; vm_map_lock(map); break; } vm_map_copy_entry_unlink(submap_copy, copy_entry); copy_entry_size = copy_entry->vme_end - copy_entry->vme_start; copy_entry->vme_start = map_address; copy_entry->vme_end = map_address + copy_entry_size; map_address += copy_entry_size; mapped_size += copy_entry_size; src_start += copy_entry_size; assert(src_start <= src_end); _vm_map_store_entry_link(map_header, map_header->links.prev, copy_entry); } /* done with submap_copy */ vm_map_copy_discard(submap_copy); if (vm_remap_legacy) { *cur_protection &= submap_curprot; *max_protection &= submap_maxprot; } /* re-acquire the map lock and continue to next entry */ vm_map_lock(map); continue; } else { object = VME_OBJECT(src_entry); /* * Prevent kernel_object from being exposed to * user space. */ if (__improbable(is_kernel_object(object))) { printf("%d[%s]: rejecting attempt to extract from kernel_object\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?")); DTRACE_VM(extract_kernel_only); result = KERN_INVALID_RIGHT; break; } if (src_entry->iokit_acct) { /* * This entry uses "IOKit accounting". */ } else if (object != VM_OBJECT_NULL && object->internal && (object->purgable != VM_PURGABLE_DENY || object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) { /* * Purgeable objects have their own accounting: * no pmap accounting for them. */ assertf(!src_entry->use_pmap, "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d", map, src_entry, (uint64_t)src_entry->vme_start, (uint64_t)src_entry->vme_end, src_entry->protection, src_entry->max_protection, VME_ALIAS(src_entry)); } else { /* * Not IOKit or purgeable: * must be accounted by pmap stats. */ assertf(src_entry->use_pmap, "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d", map, src_entry, (uint64_t)src_entry->vme_start, (uint64_t)src_entry->vme_end, src_entry->protection, src_entry->max_protection, VME_ALIAS(src_entry)); } if (object == VM_OBJECT_NULL) { assert(!src_entry->needs_copy); if (src_entry->max_protection == VM_PROT_NONE) { assert(src_entry->protection == VM_PROT_NONE); /* * No VM object and no permissions: * this must be a reserved range with * nothing to share or copy. * There could also be all sorts of * pmap shenanigans within that reserved * range, so let's just copy the map * entry as is to remap a similar * reserved range. */ offset = 0; /* no object => no offset */ goto copy_src_entry; } object = vm_object_allocate(entry_size); VME_OFFSET_SET(src_entry, 0); VME_OBJECT_SET(src_entry, object, false, 0); assert(src_entry->use_pmap); assert(!map->mapped_in_other_pmaps); } else if (src_entry->wired_count || object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { /* * A wired memory region should not have * any pending copy-on-write and needs to * keep pointing at the VM object that * contains the wired pages. * If we're sharing this memory (copy=false), * we'll share this VM object. * If we're copying this memory (copy=true), * we'll call vm_object_copy_slowly() below * and use the new VM object for the remapping. * * Or, we are already using an asymmetric * copy, and therefore we already have * the right object. 
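 *
 * For orientation, an informal recap of the surrounding ladder:
 * a NULL VM object gets a freshly allocated one (or the reserved
 * range is copied as-is); a wired entry or an object already using
 * an asymmetric copy strategy keeps its object unchanged; an entry
 * with pending copy-on-write state (needs_copy, a shadowed object,
 * or an oversized internal object) gets a shadow object pushed
 * first.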
*/ assert(!src_entry->needs_copy); } else if (src_entry->needs_copy || object->shadowed || (object->internal && !object->true_share && !src_entry->is_shared && object->vo_size > entry_size)) { bool is_writable; VME_OBJECT_SHADOW(src_entry, entry_size, vm_map_always_shadow(map)); assert(src_entry->use_pmap); is_writable = false; if (src_entry->protection & VM_PROT_WRITE) { is_writable = true; #if __arm64e__ } else if (src_entry->used_for_tpro) { is_writable = true; #endif /* __arm64e__ */ } if (!src_entry->needs_copy && is_writable) { vm_prot_t prot; if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, map, map->pmap, src_entry, (uint64_t)src_entry->vme_start, (uint64_t)src_entry->vme_end, src_entry->protection); } prot = src_entry->protection & ~VM_PROT_WRITE; if (override_nx(map, VME_ALIAS(src_entry)) && prot) { prot |= VM_PROT_EXECUTE; } if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, map, map->pmap, src_entry, (uint64_t)src_entry->vme_start, (uint64_t)src_entry->vme_end, prot); } if (map->mapped_in_other_pmaps) { vm_object_pmap_protect( VME_OBJECT(src_entry), VME_OFFSET(src_entry), entry_size, PMAP_NULL, PAGE_SIZE, src_entry->vme_start, prot); #if MACH_ASSERT } else if (__improbable(map->pmap == PMAP_NULL)) { /* * Some VM tests (in vm_tests.c) * sometimes want to use a VM * map without a pmap. * Otherwise, this should never * happen. */ if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) { panic("null pmap"); } #endif /* MACH_ASSERT */ } else { pmap_protect(vm_map_pmap(map), src_entry->vme_start, src_entry->vme_end, prot); } } object = VME_OBJECT(src_entry); src_entry->needs_copy = FALSE; } vm_object_lock(object); vm_object_reference_locked(object); /* object ref. for new entry */ assert(!src_entry->needs_copy); if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { /* * If we want to share this object (copy==0), * it needs to be COPY_DELAY. * If we want to copy this object (copy==1), * we can't just set "needs_copy" on our side * and expect the other side to do the same * (symmetrically), so we can't let the object * stay COPY_SYMMETRIC. * So we always switch from COPY_SYMMETRIC to * COPY_DELAY. */ object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; VM_OBJECT_SET_TRUE_SHARE(object, TRUE); } vm_object_unlock(object); } offset = (VME_OFFSET(src_entry) + (src_start - src_entry->vme_start)); copy_src_entry: new_entry = _vm_map_entry_create(map_header); vm_map_entry_copy(map, new_entry, src_entry); if (new_entry->is_sub_map) { /* clr address space specifics */ new_entry->use_pmap = FALSE; } else if (copy) { /* * We're dealing with a copy-on-write operation, * so the resulting mapping should not inherit the * original mapping's accounting settings. * "use_pmap" should be reset to its default (TRUE) * so that the new mapping gets accounted for in * the task's memory footprint. 
*/ new_entry->use_pmap = TRUE; } /* "iokit_acct" was cleared in vm_map_entry_copy() */ assert(!new_entry->iokit_acct); new_entry->map_aligned = FALSE; new_entry->vme_start = map_address; new_entry->vme_end = map_address + tmp_size; assert(new_entry->vme_start < new_entry->vme_end); if (copy && vmk_flags.vmkf_remap_prot_copy) { /* security: keep "permanent" and "csm_associated" */ new_entry->vme_permanent = src_entry->vme_permanent; new_entry->csm_associated = src_entry->csm_associated; /* * Remapping for vm_map_protect(VM_PROT_COPY) * to convert a read-only mapping into a * copy-on-write version of itself but * with write access: * keep the original inheritance but let's not * add VM_PROT_WRITE to the max protection yet * since we want to do more security checks against * the target map. */ new_entry->inheritance = src_entry->inheritance; new_entry->protection &= max_prot_for_prot_copy; #ifdef __arm64e__ /* * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO * region to be explicitly writable without TPRO is only permitted * if TPRO enforcement has been overridden. * * In this case we ensure any entries reset the TPRO state * and we permit the region to be downgraded from permanent. */ if (new_entry->used_for_tpro) { if (vmk_flags.vmkf_tpro_enforcement_override) { new_entry->used_for_tpro = FALSE; new_entry->vme_permanent = FALSE; } else { result = KERN_PROTECTION_FAILURE; vm_object_deallocate(object); vm_map_entry_dispose(new_entry); new_entry = VM_MAP_ENTRY_NULL; break; } } #endif } else { new_entry->inheritance = inheritance; if (!vm_remap_legacy) { new_entry->protection = *cur_protection; new_entry->max_protection = *max_protection; } } VME_OFFSET_SET(new_entry, offset); /* * The new region has to be copied now if required. */ RestartCopy: if (!copy) { if (src_entry->used_for_jit == TRUE) { if (same_map) { } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) { /* * Cannot allow an entry describing a JIT * region to be shared across address spaces. */ result = KERN_INVALID_ARGUMENT; vm_object_deallocate(object); vm_map_entry_dispose(new_entry); new_entry = VM_MAP_ENTRY_NULL; break; } } if (!src_entry->is_sub_map && VME_OBJECT(src_entry) == VM_OBJECT_NULL) { /* no accessible memory; nothing to share */ assert(src_entry->protection == VM_PROT_NONE); assert(src_entry->max_protection == VM_PROT_NONE); src_entry->is_shared = FALSE; } else { src_entry->is_shared = TRUE; } if (!new_entry->is_sub_map && VME_OBJECT(new_entry) == VM_OBJECT_NULL) { /* no accessible memory; nothing to share */ assert(new_entry->protection == VM_PROT_NONE); assert(new_entry->max_protection == VM_PROT_NONE); new_entry->is_shared = FALSE; } else { new_entry->is_shared = TRUE; } if (!(new_entry->is_sub_map)) { new_entry->needs_copy = FALSE; } } else if (src_entry->is_sub_map) { /* make this a COW sub_map if not already */ assert(new_entry->wired_count == 0); new_entry->needs_copy = TRUE; object = VM_OBJECT_NULL; } else if (src_entry->wired_count == 0 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) && vm_object_copy_quickly(VME_OBJECT(new_entry), VME_OFFSET(new_entry), (new_entry->vme_end - new_entry->vme_start), &src_needs_copy, &new_entry_needs_copy)) { new_entry->needs_copy = new_entry_needs_copy; new_entry->is_shared = FALSE; assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry); /* * Handle copy_on_write semantics. 
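 *
 * Informally: vm_object_copy_quickly() above set up a symmetric
 * copy, so if the source side doesn't have "needs_copy" yet, its
 * pages must be write-protected (vm_object_pmap_protect() below)
 * so that the first write on either side faults and gets a private
 * copy of the page.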
*/ if (src_needs_copy && !src_entry->needs_copy) { vm_prot_t prot; if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, map, map->pmap, src_entry, (uint64_t)src_entry->vme_start, (uint64_t)src_entry->vme_end, src_entry->protection); } prot = src_entry->protection & ~VM_PROT_WRITE; if (override_nx(map, VME_ALIAS(src_entry)) && prot) { prot |= VM_PROT_EXECUTE; } if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) { panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", __FUNCTION__, map, map->pmap, src_entry, (uint64_t)src_entry->vme_start, (uint64_t)src_entry->vme_end, prot); } vm_object_pmap_protect(object, offset, entry_size, ((src_entry->is_shared || map->mapped_in_other_pmaps) ? PMAP_NULL : map->pmap), VM_MAP_PAGE_SIZE(map), src_entry->vme_start, prot); assert(src_entry->wired_count == 0); src_entry->needs_copy = TRUE; } /* * Throw away the old object reference of the new entry. */ vm_object_deallocate(object); } else { new_entry->is_shared = FALSE; assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry); src_entry_was_wired = (src_entry->wired_count > 0); saved_src_entry = src_entry; src_entry = VM_MAP_ENTRY_NULL; /* * The map can be safely unlocked since we * already hold a reference on the object. * * Record the timestamp of the map for later * verification, and unlock the map. */ version.main_timestamp = map->timestamp; vm_map_unlock(map); /* Increments timestamp once! */ /* * Perform the copy. */ if (src_entry_was_wired > 0 || (debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) { vm_object_lock(object); result = vm_object_copy_slowly( object, offset, (new_entry->vme_end - new_entry->vme_start), THREAD_UNINT, &new_copy_object); /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */ saved_used_for_jit = new_entry->used_for_jit; VME_OBJECT_SET(new_entry, new_copy_object, false, 0); new_entry->used_for_jit = saved_used_for_jit; VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset)); new_entry->needs_copy = FALSE; } else { vm_object_offset_t new_offset; new_offset = VME_OFFSET(new_entry); result = vm_object_copy_strategically( object, offset, (new_entry->vme_end - new_entry->vme_start), false, /* forking */ &new_copy_object, &new_offset, &new_entry_needs_copy); /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */ saved_used_for_jit = new_entry->used_for_jit; VME_OBJECT_SET(new_entry, new_copy_object, false, 0); new_entry->used_for_jit = saved_used_for_jit; if (new_offset != VME_OFFSET(new_entry)) { VME_OFFSET_SET(new_entry, new_offset); } new_entry->needs_copy = new_entry_needs_copy; } /* * Throw away the old object reference of the new entry. */ vm_object_deallocate(object); if (result != KERN_SUCCESS && result != KERN_MEMORY_RESTART_COPY) { vm_map_entry_dispose(new_entry); vm_map_lock(map); break; } /* * Verify that the map has not substantially * changed while the copy was being made. */ vm_map_lock(map); if (version.main_timestamp + 1 != map->timestamp) { /* * Simple version comparison failed. * * Retry the lookup and verify that the * same object/offset are still present. 
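 *
 * Note that vm_map_unlock() itself bumped the timestamp once,
 * hence the "+ 1" in the comparison above; an informal sketch of
 * the protocol:
 *
 *     version.main_timestamp = map->timestamp;
 *     vm_map_unlock(map);               (timestamp += 1)
 *     ... perform the copy unlocked ...
 *     vm_map_lock(map);
 *     if (version.main_timestamp + 1 != map->timestamp)
 *             discard and retry from a fresh lookup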
*/ saved_src_entry = VM_MAP_ENTRY_NULL; vm_object_deallocate(VME_OBJECT(new_entry)); vm_map_entry_dispose(new_entry); if (result == KERN_MEMORY_RESTART_COPY) { result = KERN_SUCCESS; } continue; } /* map hasn't changed: src_entry is still valid */ src_entry = saved_src_entry; saved_src_entry = VM_MAP_ENTRY_NULL; if (result == KERN_MEMORY_RESTART_COPY) { vm_object_reference(object); goto RestartCopy; } } _vm_map_store_entry_link(map_header, map_header->links.prev, new_entry); /* protections for submap mapping are irrelevant here */ if (vm_remap_legacy && !src_entry->is_sub_map) { *cur_protection &= src_entry->protection; *max_protection &= src_entry->max_protection; } map_address += tmp_size; mapped_size += tmp_size; src_start += tmp_size; if (vmk_flags.vmkf_copy_single_object) { if (mapped_size != size) { DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n", map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size); if (src_entry->vme_next != vm_map_to_entry(map) && src_entry->vme_next->vme_object_value == src_entry->vme_object_value) { /* XXX TODO4K */ DEBUG4K_ERROR("could have extended copy to next entry...\n"); } } break; } } /* end while */ vm_map_unlock(map); if (result != KERN_SUCCESS) { /* * Free all allocated elements. */ for (src_entry = map_header->links.next; src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links); src_entry = new_entry) { new_entry = src_entry->vme_next; _vm_map_store_entry_unlink(map_header, src_entry, false); if (src_entry->is_sub_map) { vm_map_deallocate(VME_SUBMAP(src_entry)); } else { vm_object_deallocate(VME_OBJECT(src_entry)); } vm_map_entry_dispose(src_entry); } } return result; } bool vm_map_is_exotic( vm_map_t map) { return VM_MAP_IS_EXOTIC(map); } bool vm_map_is_alien( vm_map_t map) { return VM_MAP_IS_ALIEN(map); } #if XNU_TARGET_OS_OSX void vm_map_mark_alien( vm_map_t map) { vm_map_lock(map); map->is_alien = true; vm_map_unlock(map); } void vm_map_single_jit( vm_map_t map) { vm_map_lock(map); map->single_jit = true; vm_map_unlock(map); } #endif /* XNU_TARGET_OS_OSX */ /* * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged. */ static kern_return_t vm_map_copy_to_physcopy( vm_map_copy_t copy_map, vm_map_t target_map) { vm_map_size_t size; vm_map_entry_t entry; vm_map_entry_t new_entry; vm_object_t new_object; unsigned int pmap_flags; pmap_t new_pmap; vm_map_t new_map; vm_map_address_t src_start, src_end, src_cur; vm_map_address_t dst_start, dst_end, dst_cur; kern_return_t kr; void *kbuf; /* * Perform the equivalent of vm_allocate() and memcpy(). * Replace the mappings in "copy_map" with the newly allocated mapping. 
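 *
 * Rough outline of the steps below:
 *   1. create a temporary pmap and pageable VM map using
 *      copy_map's (smaller) page size
 *   2. map "copy_map" and a freshly allocated VM object side by
 *      side in that temporary map
 *   3. copy page by page through a kernel buffer with
 *      copyinmap() and copyoutmap()
 *   4. replace copy_map's entries with the single new entry and
 *      adopt target_map's page size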
*/ DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size); assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map)); /* create a new pmap to map "copy_map" */ pmap_flags = 0; assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT); #if PMAP_CREATE_FORCE_4K_PAGES pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES; #endif /* PMAP_CREATE_FORCE_4K_PAGES */ pmap_flags |= PMAP_CREATE_64BIT; new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags); if (new_pmap == NULL) { return KERN_RESOURCE_SHORTAGE; } /* allocate new VM object */ size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK); new_object = vm_object_allocate(size); assert(new_object); /* allocate new VM map entry */ new_entry = vm_map_copy_entry_create(copy_map); assert(new_entry); /* finish initializing new VM map entry */ new_entry->protection = VM_PROT_DEFAULT; new_entry->max_protection = VM_PROT_DEFAULT; new_entry->use_pmap = TRUE; /* make new VM map entry point to new VM object */ new_entry->vme_start = 0; new_entry->vme_end = size; VME_OBJECT_SET(new_entry, new_object, false, 0); VME_OFFSET_SET(new_entry, 0); /* create a new pageable VM map to map "copy_map" */ new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS, VM_MAP_CREATE_PAGEABLE); assert(new_map); vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift); /* map "copy_map" in the new VM map */ src_start = 0; kr = vm_map_copyout_internal( new_map, &src_start, copy_map, copy_map->size, FALSE, /* consume_on_success */ VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); src_end = src_start + copy_map->size; /* map "new_object" in the new VM map */ vm_object_reference(new_object); dst_start = 0; kr = vm_map_enter(new_map, &dst_start, size, 0, /* mask */ VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK), new_object, 0, /* offset */ FALSE, /* needs copy */ VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); dst_end = dst_start + size; /* get a kernel buffer */ kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL); /* physically copy "copy_map" mappings to new VM object */ for (src_cur = src_start, dst_cur = dst_start; src_cur < src_end; src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) { vm_size_t bytes; bytes = PAGE_SIZE; if (src_cur + PAGE_SIZE > src_end) { /* partial copy for last page */ bytes = src_end - src_cur; assert(bytes > 0 && bytes < PAGE_SIZE); /* rest of dst page should be zero-filled */ } /* get bytes from src mapping */ kr = copyinmap(new_map, src_cur, kbuf, bytes); if (kr != KERN_SUCCESS) { DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr); } /* put bytes in dst mapping */ assert(dst_cur < dst_end); assert(dst_cur + bytes <= dst_end); kr = copyoutmap(new_map, kbuf, dst_cur, bytes); if (kr != KERN_SUCCESS) { DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr); } } /* free kernel buffer */ kfree_data(kbuf, PAGE_SIZE); /* destroy new map */ vm_map_destroy(new_map); new_map = VM_MAP_NULL; /* dispose of the old map entries in "copy_map" */ while (vm_map_copy_first_entry(copy_map) != vm_map_copy_to_entry(copy_map)) { entry = vm_map_copy_first_entry(copy_map); vm_map_copy_entry_unlink(copy_map, entry); if (entry->is_sub_map) { vm_map_deallocate(VME_SUBMAP(entry)); } else { vm_object_deallocate(VME_OBJECT(entry)); } 
vm_map_copy_entry_dispose(entry); } /* change "copy_map"'s page_size to match "target_map" */ copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map); copy_map->offset = 0; copy_map->size = size; /* insert new map entry in "copy_map" */ assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map)); vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry); DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size); return KERN_SUCCESS; } void vm_map_copy_adjust_get_target_copy_map( vm_map_copy_t copy_map, vm_map_copy_t *target_copy_map_p); void vm_map_copy_adjust_get_target_copy_map( vm_map_copy_t copy_map, vm_map_copy_t *target_copy_map_p) { vm_map_copy_t target_copy_map; vm_map_entry_t entry, target_entry; if (*target_copy_map_p != VM_MAP_COPY_NULL) { /* the caller already has a "target_copy_map": use it */ return; } /* the caller wants us to create a new copy of "copy_map" */ assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST); target_copy_map = vm_map_copy_allocate(copy_map->type); target_copy_map->offset = copy_map->offset; target_copy_map->size = copy_map->size; target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift; for (entry = vm_map_copy_first_entry(copy_map); entry != vm_map_copy_to_entry(copy_map); entry = entry->vme_next) { target_entry = vm_map_copy_entry_create(target_copy_map); vm_map_entry_copy_full(target_entry, entry); if (target_entry->is_sub_map) { vm_map_reference(VME_SUBMAP(target_entry)); } else { vm_object_reference(VME_OBJECT(target_entry)); } vm_map_copy_entry_link( target_copy_map, vm_map_copy_last_entry(target_copy_map), target_entry); } entry = VM_MAP_ENTRY_NULL; *target_copy_map_p = target_copy_map; } /* * Callers of this function must call vm_map_copy_require on * previously created vm_map_copy_t or pass a newly created * one to ensure that it hasn't been forged. */ static void vm_map_copy_trim( vm_map_copy_t copy_map, uint16_t new_page_shift, vm_map_offset_t trim_start, vm_map_offset_t trim_end) { uint16_t copy_page_shift; vm_map_entry_t entry, next_entry; assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST); assert(copy_map->cpy_hdr.nentries > 0); trim_start += vm_map_copy_first_entry(copy_map)->vme_start; trim_end += vm_map_copy_first_entry(copy_map)->vme_start; /* use the new page_shift to do the clipping */ copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map); copy_map->cpy_hdr.page_shift = new_page_shift; for (entry = vm_map_copy_first_entry(copy_map); entry != vm_map_copy_to_entry(copy_map); entry = next_entry) { next_entry = entry->vme_next; if (entry->vme_end <= trim_start) { /* entry fully before trim range: skip */ continue; } if (entry->vme_start >= trim_end) { /* entry fully after trim range: done */ break; } /* clip entry if needed */ vm_map_copy_clip_start(copy_map, entry, trim_start); vm_map_copy_clip_end(copy_map, entry, trim_end); /* dispose of entry */ copy_map->size -= entry->vme_end - entry->vme_start; vm_map_copy_entry_unlink(copy_map, entry); if (entry->is_sub_map) { vm_map_deallocate(VME_SUBMAP(entry)); } else { vm_object_deallocate(VME_OBJECT(entry)); } vm_map_copy_entry_dispose(entry); entry = VM_MAP_ENTRY_NULL; } /* restore copy_map's original page_shift */ copy_map->cpy_hdr.page_shift = copy_page_shift; } /* * Make any necessary adjustments to "copy_map" to allow it to be * mapped into "target_map". 
* If no changes were necessary, "target_copy_map" points to the * untouched "copy_map". * If changes are necessary, changes will be made to "target_copy_map". * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and * copy the original "copy_map" to it before applying the changes. * The caller should discard "target_copy_map" if it's not the same as * the original "copy_map". */ /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */ kern_return_t vm_map_copy_adjust_to_target( vm_map_copy_t src_copy_map, vm_map_offset_ut offset_u, vm_map_size_ut size_u, vm_map_t target_map, boolean_t copy, vm_map_copy_t *target_copy_map_p, vm_map_offset_t *overmap_start_p, vm_map_offset_t *overmap_end_p, vm_map_offset_t *trimmed_start_p) { vm_map_copy_t copy_map, target_copy_map; vm_map_size_t target_size; vm_map_size_t src_copy_map_size; vm_map_size_t overmap_start, overmap_end; int misalignments; vm_map_entry_t entry, target_entry; vm_map_offset_t addr_adjustment; vm_map_offset_t new_start, new_end; int copy_page_mask, target_page_mask; uint16_t copy_page_shift, target_page_shift; vm_map_offset_t trimmed_end; vm_map_size_t map_size; kern_return_t kr; /* * Sanitize any input parameters that are addr/size/prot/inherit */ kr = vm_map_copy_addr_size_sanitize( target_map, offset_u, size_u, VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE, &new_start, &new_end, &map_size); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(src_copy_map); assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST); /* * Start working with "src_copy_map" but we'll switch * to "target_copy_map" as soon as we start making adjustments. */ copy_map = src_copy_map; src_copy_map_size = src_copy_map->size; copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map); copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map); target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map); target_page_mask = VM_MAP_PAGE_MASK(target_map); DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p); target_copy_map = *target_copy_map_p; if (target_copy_map != VM_MAP_COPY_NULL) { vm_map_copy_require(target_copy_map); } if (new_end > copy_map->size) { DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u)); return KERN_INVALID_ARGUMENT; } /* trim the end */ trimmed_end = 0; new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask); if (new_end < copy_map->size) { trimmed_end = src_copy_map_size - new_end; DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... 
trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size); /* get "target_copy_map" if needed and adjust it */ vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map); copy_map = target_copy_map; vm_map_copy_trim(target_copy_map, target_page_shift, new_end, copy_map->size); } /* trim the start */ new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask); if (new_start != 0) { DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start); /* get "target_copy_map" if needed and adjust it */ vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map); copy_map = target_copy_map; vm_map_copy_trim(target_copy_map, target_page_shift, 0, new_start); } *trimmed_start_p = new_start; /* target_size starts with what's left after trimming */ target_size = copy_map->size; assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end, "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n", (uint64_t)target_size, (uint64_t)src_copy_map_size, (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end); /* check for misalignments but don't adjust yet */ misalignments = 0; overmap_start = 0; overmap_end = 0; if (copy_page_shift < target_page_shift) { /* * Remapping from 4K to 16K: check the VM object alignments * throughout the range. * If the start and end of the range are mis-aligned, we can * over-map to re-align, and adjust the "overmap" start/end * and "target_size" of the range accordingly. * If there is any mis-alignment within the range: * if "copy": * we can do immediate-copy instead of copy-on-write, * else: * no way to remap and share; fail. */ for (entry = vm_map_copy_first_entry(copy_map); entry != vm_map_copy_to_entry(copy_map); entry = entry->vme_next) { vm_object_offset_t object_offset_start, object_offset_end; object_offset_start = VME_OFFSET(entry); object_offset_end = object_offset_start; object_offset_end += entry->vme_end - entry->vme_start; if (object_offset_start & target_page_mask) { if (entry == vm_map_copy_first_entry(copy_map) && !copy) { overmap_start++; } else { misalignments++; } } if (object_offset_end & target_page_mask) { if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) { overmap_end++; } else { misalignments++; } } } } entry = VM_MAP_ENTRY_NULL; /* decide how to deal with misalignments */ assert(overmap_start <= 1); assert(overmap_end <= 1); if (!overmap_start && !overmap_end && !misalignments) { /* copy_map is properly aligned for target_map ... */ if (*trimmed_start_p) { /* ... but we trimmed it, so still need to adjust */ } else { /* ... 
and we didn't trim anything: we're done */ if (target_copy_map == VM_MAP_COPY_NULL) { target_copy_map = copy_map; } *target_copy_map_p = target_copy_map; *overmap_start_p = 0; *overmap_end_p = 0; DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p); return KERN_SUCCESS; } } else if (misalignments && !copy) { /* can't "share" if misaligned */ DEBUG4K_ADJUST("unsupported sharing\n"); #if MACH_ASSERT if (debug4k_panic_on_misaligned_sharing) { panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__); } #endif /* MACH_ASSERT */ DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p); return KERN_NOT_SUPPORTED; } else { /* can't virtual-copy if misaligned (but can physical-copy) */ DEBUG4K_ADJUST("mis-aligned copying\n"); } /* get a "target_copy_map" if needed and switch to it */ vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map); copy_map = target_copy_map; if (misalignments && copy) { vm_map_size_t target_copy_map_size; /* * Can't do copy-on-write with misaligned mappings. * Replace the mappings with a physical copy of the original * mappings' contents. */ target_copy_map_size = target_copy_map->size; kr = vm_map_copy_to_physcopy(target_copy_map, target_map); if (kr != KERN_SUCCESS) { return kr; } *target_copy_map_p = target_copy_map; *overmap_start_p = 0; *overmap_end_p = target_copy_map->size - target_copy_map_size; DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p); return KERN_SUCCESS; } /* apply the adjustments */ misalignments = 0; overmap_start = 0; overmap_end = 0; /* remove copy_map->offset, so that everything starts at offset 0 */ addr_adjustment = copy_map->offset; /* also remove whatever we trimmed from the start */ addr_adjustment += *trimmed_start_p; for (target_entry = vm_map_copy_first_entry(target_copy_map); target_entry != vm_map_copy_to_entry(target_copy_map); target_entry = target_entry->vme_next) { vm_object_offset_t object_offset_start, object_offset_end; DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); object_offset_start = VME_OFFSET(target_entry); if (object_offset_start & target_page_mask) { DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 
0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); if (target_entry == vm_map_copy_first_entry(target_copy_map)) { /* * start of 1st entry is mis-aligned: * re-adjust by over-mapping. */ overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask); DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start); VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start); } else { misalignments++; DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments); assert(copy); } } if (target_entry == vm_map_copy_first_entry(target_copy_map)) { target_size += overmap_start; } else { target_entry->vme_start += overmap_start; } target_entry->vme_end += overmap_start; object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start; if (object_offset_end & target_page_mask) { DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) { /* * end of last entry is mis-aligned: re-adjust by over-mapping. */ overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end; DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end); target_entry->vme_end += overmap_end; target_size += overmap_end; } else { misalignments++; DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments); assert(copy); } } target_entry->vme_start -= addr_adjustment; target_entry->vme_end -= addr_adjustment; DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); } target_copy_map->size = target_size; target_copy_map->offset += overmap_start; target_copy_map->offset -= addr_adjustment; target_copy_map->cpy_hdr.page_shift = target_page_shift; // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask)); // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK)); assert(overmap_start < VM_MAP_PAGE_SIZE(target_map)); assert(overmap_end < VM_MAP_PAGE_SIZE(target_map)); *target_copy_map_p = target_copy_map; *overmap_start_p = overmap_start; *overmap_end_p = overmap_end; DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, 
*target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p); return KERN_SUCCESS; } kern_return_t vm_map_range_physical_size( vm_map_t map, vm_map_address_t start, mach_vm_size_t size, mach_vm_size_t * phys_size) { kern_return_t kr; vm_map_copy_t copy_map, target_copy_map; vm_map_offset_t adjusted_start, adjusted_end; vm_map_size_t adjusted_size; vm_prot_t cur_prot, max_prot; vm_map_offset_t overmap_start, overmap_end, trimmed_start, end; vm_map_kernel_flags_t vmk_flags; if (size == 0) { DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size); *phys_size = 0; return KERN_SUCCESS; } adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)); adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map)); if (__improbable(os_add_overflow(start, size, &end) || adjusted_end <= adjusted_start)) { /* wraparound */ printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map)); *phys_size = 0; return KERN_INVALID_ARGUMENT; } if (__improbable(vm_map_range_overflows(map, start, size))) { *phys_size = 0; return KERN_INVALID_ADDRESS; } assert(adjusted_end > adjusted_start); adjusted_size = adjusted_end - adjusted_start; *phys_size = adjusted_size; if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) { return KERN_SUCCESS; } if (start == 0) { adjusted_start = vm_map_trunc_page(start, PAGE_MASK); adjusted_end = vm_map_round_page(start + size, PAGE_MASK); if (__improbable(adjusted_end <= adjusted_start)) { /* wraparound */ printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK); *phys_size = 0; return KERN_INVALID_ARGUMENT; } assert(adjusted_end > adjusted_start); adjusted_size = adjusted_end - adjusted_start; *phys_size = adjusted_size; return KERN_SUCCESS; } vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; vmk_flags.vmkf_copy_pageable = TRUE; vmk_flags.vmkf_copy_same_map = TRUE; assert(adjusted_size != 0); cur_prot = VM_PROT_NONE; /* legacy mode */ max_prot = VM_PROT_NONE; /* legacy mode */ vmk_flags.vmkf_remap_legacy_mode = true; kr = vm_map_copy_extract(map, adjusted_start, adjusted_size, FALSE /* copy */, &copy_map, &cur_prot, &max_prot, VM_INHERIT_DEFAULT, vmk_flags); if (kr != KERN_SUCCESS) { DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr); //assert(0); *phys_size = 0; return kr; } assert(copy_map != VM_MAP_COPY_NULL); target_copy_map = copy_map; DEBUG4K_ADJUST("adjusting...\n"); kr = vm_map_copy_adjust_to_target( copy_map, start - adjusted_start, /* offset */ size, /* size */ kernel_map, FALSE, /* copy */ &target_copy_map, &overmap_start, &overmap_end, &trimmed_start); if (kr == KERN_SUCCESS) { if (target_copy_map->size != *phys_size) { DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size); } *phys_size = target_copy_map->size; } else { DEBUG4K_ERROR("map %p start
0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr); //assert(0); *phys_size = 0; } vm_map_copy_discard(copy_map); copy_map = VM_MAP_COPY_NULL; return kr; } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_remap_sanitize( vm_map_t src_map, vm_map_t target_map, vm_map_address_ut address_u, vm_map_size_ut size_u, vm_map_offset_ut mask_u, vm_map_offset_ut memory_address_u, vm_prot_ut cur_protection_u, vm_prot_ut max_protection_u, vm_inherit_ut inheritance_u, vm_map_kernel_flags_t vmk_flags, vm_map_address_t *target_addr, vm_map_address_t *mask, vm_map_offset_t *memory_address, vm_map_offset_t *memory_end, vm_map_size_t *memory_size, vm_prot_t *cur_protection, vm_prot_t *max_protection, vm_inherit_t *inheritance) { kern_return_t result; vm_sanitize_flags_t vm_sanitize_flags; result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, inheritance); if (__improbable(result != KERN_SUCCESS)) { return result; } result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map, cur_protection, max_protection); if (__improbable(result != KERN_SUCCESS)) { return result; } result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask); if (__improbable(result != KERN_SUCCESS)) { return result; } /* * If the user is requesting that we return the address of the * first byte of the data (rather than the base of the page), * then we use different rounding semantics: specifically, * we assume that (memory_address, size) describes a region * all of whose pages we must cover, rather than a base to be truncated * down and a size to be added to that base. So we figure out * the highest page that the requested region includes and make * sure that the size will cover it. * * The key example we're worried about is of the form: * * memory_address = 0x1ff0, size = 0x20 * * With the old semantics, we round down the memory_address to 0x1000 * and round up the size to 0x1000, resulting in our covering *only* * page 0x1000. With the new semantics, we'd realize that the region covers * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page * 0x1000 and page 0x2000 in the region we remap. * * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics. */ vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS; if (!vmk_flags.vmf_return_data_addr) { vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START; } result = vm_sanitize_addr_size(memory_address_u, size_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map, vm_sanitize_flags, memory_address, memory_end, memory_size); if (__improbable(result != KERN_SUCCESS)) { return result; } *target_addr = vm_sanitize_addr(target_map, address_u); return KERN_SUCCESS; } /* * Routine: vm_remap * * Map a portion of a task's address space. * Mapped region must not overlap more than * one vm memory object. Protections and * inheritance attributes remain the same * as in the original task and are out parameters.
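 * For example, in the legacy mode, remapping a read-only region
 * reports VM_PROT_READ back through "cur_protection" rather than
 * failing; the caller is expected to check what was granted.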
* Source and target tasks can be identical. * Other attributes are identical as for vm_map(). */ kern_return_t vm_map_remap( vm_map_t target_map, vm_map_address_ut *address_u, vm_map_size_ut size_u, vm_map_offset_ut mask_u, vm_map_kernel_flags_t vmk_flags, vm_map_t src_map, vm_map_offset_ut memory_address_u, boolean_t copy, vm_prot_ut *cur_protection_u, /* IN/OUT */ vm_prot_ut *max_protection_u, /* IN/OUT */ vm_inherit_ut inheritance_u) { vm_map_address_t target_addr, mask; vm_map_size_t target_size; vm_map_offset_t memory_address, memory_end; vm_map_size_t memory_size; vm_prot_t cur_protection, max_protection; vm_inherit_t inheritance; kern_return_t result; vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL; vm_map_copy_t copy_map; vm_map_offset_t offset_in_mapping; vm_map_size_t src_page_mask, target_page_mask; vm_map_size_t initial_size; VM_MAP_ZAP_DECLARE(zap_list); if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) { return KERN_INVALID_ARGUMENT; } src_page_mask = VM_MAP_PAGE_MASK(src_map); target_page_mask = VM_MAP_PAGE_MASK(target_map); if (src_page_mask != target_page_mask) { if (copy) { DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map)); } else { DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map)); } } /* * Sanitize any input parameters that are addr/size/prot/inherit */ result = vm_map_remap_sanitize(src_map, target_map, *address_u, size_u, mask_u, memory_address_u, *cur_protection_u, *max_protection_u, inheritance_u, vmk_flags, &target_addr, &mask, &memory_address, &memory_end, &memory_size, &cur_protection, &max_protection, &inheritance); if (__improbable(result != KERN_SUCCESS)) { return vm_sanitize_get_kr(result); } if (vmk_flags.vmf_return_data_addr) { /* * This is safe to unwrap now that the quantities * have been validated and rounded up normally. */ offset_in_mapping = vm_sanitize_offset_in_page(src_map, memory_address_u); initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u); } else { /* * IMPORTANT: * This legacy code path is broken: for the range mentioned * above [ memory_address = 0x1ff0, size = 0x20 ], which spans * two 4k pages, it yields [ memory_address = 0x1000, * size = 0x1000 ], which covers only the first 4k page. * BUT some code unfortunately depends on this bug, so we * can't fix it without breaking something. * New code should get automatically opted into the new * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flag.
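 *
 * For example, with VM_FLAGS_RETURN_DATA_ADDR and 4k pages,
 * [ memory_address = 0x1ff0, size = 0x20 ] covers both pages,
 * "offset_in_mapping" is 0xff0, and the address returned to the
 * caller is the new mapping's base plus 0xff0, i.e. the first
 * byte of the caller's data.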
*/ offset_in_mapping = 0; initial_size = memory_size; } if (vmk_flags.vmf_resilient_media) { /* must be copy-on-write to be "media resilient" */ if (!copy) { return KERN_INVALID_ARGUMENT; } } vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable; vmk_flags.vmkf_copy_same_map = (src_map == target_map); assert(memory_size != 0); result = vm_map_copy_extract(src_map, memory_address, memory_size, copy, &copy_map, &cur_protection, /* IN/OUT */ &max_protection, /* IN/OUT */ inheritance, vmk_flags); if (result != KERN_SUCCESS) { return result; } assert(copy_map != VM_MAP_COPY_NULL); /* * Handle the policy for vm map ranges * * If the maps differ, the target_map policy applies like for vm_map() * For same mapping remaps, we preserve the range. */ if (vmk_flags.vmkf_copy_same_map) { vmk_flags.vmkf_range_id = copy_map->orig_range; } else { vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size); } target_size = memory_size; if (src_page_mask != target_page_mask) { vm_map_copy_t target_copy_map; vm_map_offset_t overmap_start = 0; vm_map_offset_t overmap_end = 0; vm_map_offset_t trimmed_start = 0; target_copy_map = copy_map; /* can modify "copy_map" itself */ DEBUG4K_ADJUST("adjusting...\n"); result = vm_map_copy_adjust_to_target( copy_map, offset_in_mapping, /* offset */ initial_size, target_map, copy, &target_copy_map, &overmap_start, &overmap_end, &trimmed_start); if (result != KERN_SUCCESS) { DEBUG4K_COPY("failed to adjust 0x%x\n", result); vm_map_copy_discard(copy_map); return result; } if (trimmed_start == 0) { /* nothing trimmed: no adjustment needed */ } else if (trimmed_start >= offset_in_mapping) { /* trimmed more than offset_in_mapping: nothing left */ assert(overmap_start == 0); assert(overmap_end == 0); offset_in_mapping = 0; } else { /* trimmed some of offset_in_mapping: adjust */ assert(overmap_start == 0); assert(overmap_end == 0); offset_in_mapping -= trimmed_start; } offset_in_mapping += overmap_start; target_size = target_copy_map->size; } /* * Allocate/check a range of free virtual address * space for the target */ target_size = vm_map_round_page(target_size, target_page_mask); if (target_size == 0) { vm_map_copy_discard(copy_map); return KERN_INVALID_ARGUMENT; } vm_map_lock(target_map); if (!vmk_flags.vmf_fixed) { result = vm_map_locate_space_anywhere(target_map, target_size, mask, vmk_flags, &target_addr, &insp_entry); } else { /* * vm_map_locate_space_fixed will reject overflowing * target_addr + target_size values */ result = vm_map_locate_space_fixed(target_map, target_addr, target_size, mask, vmk_flags, &insp_entry, &zap_list); if (result == KERN_MEMORY_PRESENT) { assert(!vmk_flags.vmkf_already); insp_entry = VM_MAP_ENTRY_NULL; result = KERN_NO_SPACE; } } if (result == KERN_SUCCESS) { while (vm_map_copy_first_entry(copy_map) != vm_map_copy_to_entry(copy_map)) { vm_map_entry_t entry = vm_map_copy_first_entry(copy_map); vm_map_copy_entry_unlink(copy_map, entry); if (vmk_flags.vmkf_remap_prot_copy) { /* * This vm_map_remap() is for a * vm_protect(VM_PROT_COPY), so the caller * expects to be allowed to add write access * to this new mapping. This is done by * adding VM_PROT_WRITE to each entry's * max_protection... unless some security * settings disallow it. */ bool allow_write = false; if (entry->vme_permanent) { /* immutable mapping... */ if ((entry->max_protection & VM_PROT_EXECUTE) && developer_mode_state()) { /* * ...
but executable and * possibly being debugged, * so let's allow it to become * writable, for breakpoints * and dtrace probes, for * example. */ allow_write = true; } else { printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"), (uint64_t)memory_address, (uint64_t)memory_size, entry->protection, entry->max_protection, developer_mode_state()); DTRACE_VM6(vm_map_delete_permanent_deny_protcopy, vm_map_entry_t, entry, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, vm_prot_t, entry->protection, vm_prot_t, entry->max_protection, int, VME_ALIAS(entry)); } } else { allow_write = true; } /* * VM_PROT_COPY: allow this mapping to become * writable, unless it was "permanent". */ if (allow_write) { entry->max_protection |= VM_PROT_WRITE; } } if (vmk_flags.vmf_resilient_codesign) { /* no codesigning -> read-only access */ entry->max_protection = VM_PROT_READ; entry->protection = VM_PROT_READ; entry->vme_resilient_codesign = TRUE; } entry->vme_start += target_addr; entry->vme_end += target_addr; assert(!entry->map_aligned); if (vmk_flags.vmf_resilient_media && !entry->is_sub_map && (VME_OBJECT(entry) == VM_OBJECT_NULL || VME_OBJECT(entry)->internal)) { entry->vme_resilient_media = TRUE; } assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK))); assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK))); assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK))); vm_map_store_entry_link(target_map, insp_entry, entry, vmk_flags); insp_entry = entry; } } if (vmk_flags.vmf_resilient_codesign) { cur_protection = VM_PROT_READ; max_protection = VM_PROT_READ; } if (result == KERN_SUCCESS) { target_map->size += target_size; SAVE_HINT_MAP_WRITE(target_map, insp_entry); } vm_map_unlock(target_map); vm_map_zap_dispose(&zap_list); if (result == KERN_SUCCESS && target_map->wiring_required) { result = vm_map_wire_nested(target_map, target_addr, target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK, TRUE, PMAP_NULL, 0, NULL); } if (result == KERN_SUCCESS) { #if KASAN if (target_map->pmap == kernel_pmap) { kasan_notify_address(target_addr, target_size); } #endif /* * If requested, return the address of the data pointed to by the * request, rather than the base of the resulting page. */ if (vmk_flags.vmf_return_data_addr) { target_addr += offset_in_mapping; } /* * Update OUT parameters. 
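 * "*address_u" is the final mapped address (already adjusted to
 * point at the data itself when vmf_return_data_addr was set),
 * and the protection out-parameters reflect what
 * vm_map_copy_extract actually granted for the source range.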
*/ *address_u = vm_sanitize_wrap_addr(target_addr); *cur_protection_u = vm_sanitize_wrap_prot(cur_protection); *max_protection_u = vm_sanitize_wrap_prot(max_protection); } if (src_page_mask != target_page_mask) { DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result); } vm_map_copy_discard(copy_map); copy_map = VM_MAP_COPY_NULL; return result; } /* * vm_map_switch: * * Set the address map for the current thread to the specified map */ vm_map_t vm_map_switch( vm_map_t map) { thread_t thread = current_thread(); vm_map_t oldmap = thread->map; /* * Deactivate the current map and activate the requested map */ mp_disable_preemption(); PMAP_SWITCH_USER(thread, map, cpu_number()); mp_enable_preemption(); return oldmap; } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_rw_user_sanitize( vm_map_t map, vm_map_address_ut addr_u, vm_size_ut size_u, vm_sanitize_caller_t vm_sanitize_caller, vm_map_address_t *addr, vm_map_address_t *end, vm_map_size_t *size) { vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES; return vm_sanitize_addr_size(addr_u, size_u, vm_sanitize_caller, map, flags, addr, end, size); } /* * Routine: vm_map_write_user * * Description: * Copy out data from a kernel space into space in the * destination map. The space must already exist in the * destination map. * NOTE: This routine should only be called by threads * which can block on a page fault. i.e. kernel mode user * threads. * */ kern_return_t vm_map_write_user( vm_map_t map, void *src_p, vm_map_address_ut dst_addr_u, vm_size_ut size_u) { kern_return_t kr; vm_map_address_t dst_addr, dst_end; vm_map_size_t size; /* * src_p isn't validated: [src_p, src_p + size_u) * is trusted kernel input. * * dst_addr_u and size_u are untrusted and need to be sanitized. */ kr = vm_map_rw_user_sanitize(map, dst_addr_u, size_u, VM_SANITIZE_CALLER_VM_MAP_WRITE_USER, &dst_addr, &dst_end, &size); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } if (current_map() == map) { if (copyout(src_p, dst_addr, size)) { kr = KERN_INVALID_ADDRESS; } } else { vm_map_t oldmap; /* take on the identity of the target map while doing */ /* the transfer */ vm_map_reference(map); oldmap = vm_map_switch(map); if (copyout(src_p, dst_addr, size)) { kr = KERN_INVALID_ADDRESS; } vm_map_switch(oldmap); vm_map_deallocate(map); } return kr; } /* * Routine: vm_map_read_user * * Description: * Copy in data from a user space source map into the * kernel map. The space must already exist in the * kernel map. * NOTE: This routine should only be called by threads * which can block on a page fault. i.e. kernel mode user * threads. * */ kern_return_t vm_map_read_user( vm_map_t map, vm_map_address_ut src_addr_u, void *dst_p, vm_size_ut size_u) { kern_return_t kr; vm_map_address_t src_addr, src_end; vm_map_size_t size; /* * dst_p isn't validated: [dst_p, dst_p + size_u) * is trusted kernel input. * * src_addr_u and size_u are untrusted and need to be sanitized. 
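 *
 * A typical (hypothetical) use, copying a user buffer from another
 * task's map into a kernel scratch buffer:
 *
 *	kr = vm_map_read_user(task_map, uaddr_u, kbuf, len_u);
 *
 * where "task_map", "uaddr_u", "kbuf" and "len_u" are the caller's.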
*/ kr = vm_map_rw_user_sanitize(map, src_addr_u, size_u, VM_SANITIZE_CALLER_VM_MAP_READ_USER, &src_addr, &src_end, &size); if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } if (current_map() == map) { if (copyin(src_addr, dst_p, size)) { kr = KERN_INVALID_ADDRESS; } } else { vm_map_t oldmap; /* take on the identity of the target map while doing */ /* the transfer */ vm_map_reference(map); oldmap = vm_map_switch(map); if (copyin(src_addr, dst_p, size)) { kr = KERN_INVALID_ADDRESS; } vm_map_switch(oldmap); vm_map_deallocate(map); } return kr; } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_check_protection_sanitize( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut protection_u, vm_sanitize_caller_t vm_sanitize_caller, vm_map_offset_t *start, vm_map_offset_t *end, vm_prot_t *protection) { kern_return_t kr; vm_map_size_t size; kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map, VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end, &size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } /* * Given that the protection is used only for comparisons below, * no sanitization is applied to it. */ *protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u); return KERN_SUCCESS; } /* * vm_map_check_protection: * * Assert that the target map allows the specified * privilege on the entire address region given. * The entire region must be allocated. */ boolean_t vm_map_check_protection( vm_map_t map, vm_map_offset_ut start_u, vm_map_offset_ut end_u, vm_prot_ut protection_u, vm_sanitize_caller_t vm_sanitize_caller) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; vm_map_offset_t start; vm_map_offset_t end; vm_prot_t protection; kern_return_t kr; kr = vm_map_check_protection_sanitize(map, start_u, end_u, protection_u, vm_sanitize_caller, &start, &end, &protection); if (__improbable(kr != KERN_SUCCESS)) { kr = vm_sanitize_get_kr(kr); if (kr == KERN_SUCCESS) { return true; } return false; } vm_map_lock(map); if (start < vm_map_min(map) || end > vm_map_max(map)) { vm_map_unlock(map); return false; } if (!vm_map_lookup_entry(map, start, &tmp_entry)) { vm_map_unlock(map); return false; } entry = tmp_entry; while (start < end) { if (entry == vm_map_to_entry(map)) { vm_map_unlock(map); return false; } /* * No holes allowed! */ if (start < entry->vme_start) { vm_map_unlock(map); return false; } /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) { vm_map_unlock(map); return false; } /* go to next entry */ start = entry->vme_end; entry = entry->vme_next; } vm_map_unlock(map); return true; } kern_return_t vm_map_purgable_control( vm_map_t map, vm_map_offset_ut address_u, vm_purgable_t control, int *state) { vm_map_offset_t address; vm_map_entry_t entry; vm_object_t object; kern_return_t kr; boolean_t was_nonvolatile; /* * Vet all the input parameters and current type and state of the * underlying object. Return with an error if anything is amiss.
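 *
 * For example, a caller marking a region volatile would pass
 * control = VM_PURGABLE_SET_STATE with *state = VM_PURGABLE_VOLATILE,
 * and must have write access to the mapping for this to succeed.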
*/ if (map == VM_MAP_NULL) { return KERN_INVALID_ARGUMENT; } if (control != VM_PURGABLE_SET_STATE && control != VM_PURGABLE_GET_STATE && control != VM_PURGABLE_PURGE_ALL && control != VM_PURGABLE_SET_STATE_FROM_KERNEL) { return KERN_INVALID_ARGUMENT; } if (control == VM_PURGABLE_PURGE_ALL) { vm_purgeable_object_purge_all(); return KERN_SUCCESS; } if ((control == VM_PURGABLE_SET_STATE || control == VM_PURGABLE_SET_STATE_FROM_KERNEL) && (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) || ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) { return KERN_INVALID_ARGUMENT; } address = vm_sanitize_addr(map, address_u); vm_map_lock_read(map); if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) { /* * Must pass a valid non-submap address. */ vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } if ((entry->protection & VM_PROT_WRITE) == 0 && control != VM_PURGABLE_GET_STATE) { /* * Can't apply purgable controls to something you can't write. */ vm_map_unlock_read(map); return KERN_PROTECTION_FAILURE; } object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL || object->purgable == VM_PURGABLE_DENY) { /* * Object must already be present and be purgeable. */ vm_map_unlock_read(map); return KERN_INVALID_ARGUMENT; } vm_object_lock(object); #if 00 if (VME_OFFSET(entry) != 0 || entry->vme_end - entry->vme_start != object->vo_size) { /* * Can only apply purgable controls to the whole (existing) * object at once. */ vm_map_unlock_read(map); vm_object_unlock(object); return KERN_INVALID_ARGUMENT; } #endif assert(!entry->is_sub_map); assert(!entry->use_pmap); /* purgeable has its own accounting */ vm_map_unlock_read(map); was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE); kr = vm_object_purgable_control(object, control, state); if (was_nonvolatile && object->purgable != VM_PURGABLE_NONVOLATILE && map->pmap == kernel_pmap) { #if DEBUG object->vo_purgeable_volatilizer = kernel_task; #endif /* DEBUG */ } vm_object_unlock(object); return kr; } void vm_map_footprint_query_page_info( vm_map_t map, vm_map_entry_t map_entry, vm_map_offset_t curr_s_offset, int *disposition_p) { int pmap_disp; vm_object_t object = VM_OBJECT_NULL; int disposition; int effective_page_size; vm_map_lock_assert_held(map); assert(!map->has_corpse_footprint); assert(curr_s_offset >= map_entry->vme_start); assert(curr_s_offset < map_entry->vme_end); if (map_entry->is_sub_map) { if (!map_entry->use_pmap) { /* nested pmap: no footprint */ *disposition_p = 0; return; } } else { object = VME_OBJECT(map_entry); if (object == VM_OBJECT_NULL) { /* nothing mapped here: no need to ask */ *disposition_p = 0; return; } } effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map)); pmap_disp = 0; /* * Query the pmap. */ pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp); /* * Compute this page's disposition. 
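 * The result is a combination of VM_PAGE_QUERY_PAGE_* bits; for
 * example, a resident, referenced, dirty anonymous page reports
 * PRESENT | REF | DIRTY.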
*/ disposition = 0; /* deal with "alternate accounting" first */ if (!map_entry->is_sub_map && object->vo_no_footprint) { /* does not count in footprint */ // assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); } else if (!map_entry->is_sub_map && !object->internal && object->vo_ledger_tag && VM_OBJECT_OWNER(object) != NULL && VM_OBJECT_OWNER(object)->map == map) { /* owned external object: wired pages count in footprint */ assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); if ((((curr_s_offset - map_entry->vme_start + VME_OFFSET(map_entry)) / effective_page_size) < object->wired_page_count)) { /* * External object owned by this task: report the first * "#wired" pages as "resident" (to show that they * contribute to the footprint) but not "dirty" * (to avoid double-counting with the fake "owned" * region we'll report at the end of the address space * to account for all (mapped or not) owned memory * owned by this task. */ disposition |= VM_PAGE_QUERY_PAGE_PRESENT; } } else if (!map_entry->is_sub_map && object->internal && (object->purgable == VM_PURGABLE_NONVOLATILE || (object->purgable == VM_PURGABLE_DENY && object->vo_ledger_tag)) && VM_OBJECT_OWNER(object) != NULL && VM_OBJECT_OWNER(object)->map == map) { assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); if ((((curr_s_offset - map_entry->vme_start + VME_OFFSET(map_entry)) / effective_page_size) < (object->resident_page_count + vm_compressor_pager_get_count(object->pager)))) { /* * Non-volatile purgeable object owned * by this task: report the first * "#resident + #compressed" pages as * "resident" (to show that they * contribute to the footprint) but not * "dirty" (to avoid double-counting * with the fake "non-volatile" region * we'll report at the end of the * address space to account for all * (mapped or not) non-volatile memory * owned by this task. */ disposition |= VM_PAGE_QUERY_PAGE_PRESENT; } } else if (!map_entry->is_sub_map && object->internal && (object->purgable == VM_PURGABLE_VOLATILE || object->purgable == VM_PURGABLE_EMPTY) && VM_OBJECT_OWNER(object) != NULL && VM_OBJECT_OWNER(object)->map == map) { if (object->internal) { assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); } if ((((curr_s_offset - map_entry->vme_start + VME_OFFSET(map_entry)) / effective_page_size) < object->wired_page_count)) { /* * Volatile|empty purgeable object owned * by this task: report the first * "#wired" pages as "resident" (to * show that they contribute to the * footprint) but not "dirty" (to avoid * double-counting with the fake * "non-volatile" region we'll report * at the end of the address space to * account for all (mapped or not) * non-volatile memory owned by this * task. */ disposition |= VM_PAGE_QUERY_PAGE_PRESENT; } } else if (!map_entry->is_sub_map && map_entry->iokit_acct && object->internal && object->purgable == VM_PURGABLE_DENY) { /* * Non-purgeable IOKit memory: phys_footprint * includes the entire virtual mapping. 
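 * The pmap query is ignored here: every page of the mapping is
 * reported as PRESENT and DIRTY wholesale.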
*/ assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); disposition |= VM_PAGE_QUERY_PAGE_PRESENT; disposition |= VM_PAGE_QUERY_PAGE_DIRTY; } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT | PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) { /* alternate accounting */ #if __arm64__ && (DEVELOPMENT || DEBUG) if (map->pmap->footprint_was_suspended) { /* * The assertion below can fail if dyld * suspended footprint accounting * while doing some adjustments to * this page; the mapping would say * "use pmap accounting" but the page * would be marked "alternate * accounting". */ } else #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */ { assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); } disposition = 0; } else { if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) { assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); disposition |= VM_PAGE_QUERY_PAGE_PRESENT; disposition |= VM_PAGE_QUERY_PAGE_REF; if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) { disposition |= VM_PAGE_QUERY_PAGE_DIRTY; } else { disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL; } if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) { disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; } } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) { assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; } } *disposition_p = disposition; } kern_return_t vm_map_page_info( vm_map_t map, vm_map_offset_ut offset_u, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count) { return vm_map_page_range_info_internal(map, offset_u, /* start of range */ vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */ (int)-1, /* effective_page_shift: unspecified */ flavor, info, count); } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_page_range_info_sanitize( vm_map_t map, vm_map_offset_ut start_offset_u, vm_map_offset_ut end_offset_u, vm_map_offset_t effective_page_mask, vm_map_offset_t *start, vm_map_offset_t *end, vm_map_offset_t *offset_in_page) { kern_return_t retval; vm_map_size_t size; /* * Perform validation against map's mask but don't align start/end, * as we need for those to be aligned wrt effective_page_mask */ retval = vm_sanitize_addr_end(start_offset_u, end_offset_u, VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map, VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start, end, &size); if (retval != KERN_SUCCESS) { return retval; } retval = vm_sanitize_addr_end(start_offset_u, end_offset_u, VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask, VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end, &size); if (retval != KERN_SUCCESS) { return retval; } *offset_in_page = vm_sanitize_offset_in_page(effective_page_mask, start_offset_u); return KERN_SUCCESS; } kern_return_t vm_map_page_range_info_internal( vm_map_t map, vm_map_offset_ut start_offset_u, vm_map_offset_ut end_offset_u, int effective_page_shift, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count) { vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL; vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL; vm_page_t m = VM_PAGE_NULL; kern_return_t retval = KERN_SUCCESS; int disposition = 0; int ref_count = 0; int depth = 0, info_idx = 0; vm_page_info_basic_t basic_info = 0; vm_map_offset_t offset_in_page = 0, offset_in_object = 0, 
curr_offset_in_object = 0; vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0; boolean_t do_region_footprint; ledger_amount_t ledger_resident, ledger_compressed; int effective_page_size; vm_map_offset_t effective_page_mask; switch (flavor) { case VM_PAGE_INFO_BASIC: if (*count != VM_PAGE_INFO_BASIC_COUNT) { /* * The "vm_page_info_basic_data" structure was not * properly padded, so allow the size to be off by * one to maintain backwards binary compatibility... */ if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) { return KERN_INVALID_ARGUMENT; } } break; default: return KERN_INVALID_ARGUMENT; } if (effective_page_shift == -1) { effective_page_shift = vm_self_region_page_shift_safely(map); if (effective_page_shift == -1) { return KERN_INVALID_ARGUMENT; } } effective_page_size = (1 << effective_page_shift); effective_page_mask = effective_page_size - 1; retval = vm_map_page_range_info_sanitize(map, start_offset_u, end_offset_u, effective_page_mask, &start, &end, &offset_in_page); if (retval != KERN_SUCCESS) { return vm_sanitize_get_kr(retval); } assert((end - start) <= MAX_PAGE_RANGE_QUERY); do_region_footprint = task_self_region_footprint(); disposition = 0; ref_count = 0; depth = 0; info_idx = 0; /* Tracks the next index within the info structure to be filled.*/ vm_map_lock_read(map); task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed); for (curr_s_offset = start; curr_s_offset < end;) { /* * New lookup needs reset of these variables. */ curr_object = object = VM_OBJECT_NULL; offset_in_object = 0; ref_count = 0; depth = 0; if (do_region_footprint && curr_s_offset >= vm_map_last_entry(map)->vme_end) { /* * Request for "footprint" info about a page beyond * the end of address space: this must be for * the fake region vm_map_region_recurse_64() * reported to account for non-volatile purgeable * memory owned by this task. */ disposition = 0; if (curr_s_offset - vm_map_last_entry(map)->vme_end <= (unsigned) ledger_compressed) { /* * We haven't reported all the "non-volatile * compressed" pages yet, so report this fake * page as "compressed". */ disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; } else { /* * We've reported all the non-volatile * compressed pages but not all the non-volatile * pages, so report this fake page as * "resident dirty". */ disposition |= VM_PAGE_QUERY_PAGE_PRESENT; disposition |= VM_PAGE_QUERY_PAGE_DIRTY; disposition |= VM_PAGE_QUERY_PAGE_REF; } switch (flavor) { case VM_PAGE_INFO_BASIC: basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); basic_info->disposition = disposition; basic_info->ref_count = 1; basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); basic_info->offset = 0; basic_info->depth = 0; info_idx++; break; } curr_s_offset += effective_page_size; continue; } /* * First, find the map entry covering "curr_s_offset", going down * submaps if necessary. */ if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) { /* no entry -> no object -> no page */ if (curr_s_offset < vm_map_min(map)) { /* * Illegal address that falls below map min. */ curr_e_offset = MIN(end, vm_map_min(map)); } else if (curr_s_offset >= vm_map_max(map)) { /* * Illegal address that falls on/after map max. */ curr_e_offset = end; } else if (map_entry == vm_map_to_entry(map)) { /* * Hit a hole. */ if (map_entry->vme_next == vm_map_to_entry(map)) { /* * Empty map. */ curr_e_offset = MIN(map->max_offset, end); } else { /* * Hole at start of the map.
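 * Advance to the first real entry's start, capped at "end".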
*/ curr_e_offset = MIN(map_entry->vme_next->vme_start, end); } } else { if (map_entry->vme_next == vm_map_to_entry(map)) { /* * Hole at the end of the map. */ curr_e_offset = MIN(map->max_offset, end); } else { curr_e_offset = MIN(map_entry->vme_next->vme_start, end); } } assert(curr_e_offset >= curr_s_offset); uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift; void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic)); curr_s_offset = curr_e_offset; info_idx += num_pages; continue; } /* compute offset from this map entry's start */ offset_in_object = curr_s_offset - map_entry->vme_start; /* compute offset into this map entry's object (or submap) */ offset_in_object += VME_OFFSET(map_entry); if (map_entry->is_sub_map) { vm_map_t sub_map = VM_MAP_NULL; vm_page_info_t submap_info = 0; vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0; range_len = MIN(map_entry->vme_end, end) - curr_s_offset; submap_s_offset = offset_in_object; submap_e_offset = submap_s_offset + range_len; sub_map = VME_SUBMAP(map_entry); vm_map_reference(sub_map); vm_map_unlock_read(map); submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map), "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map)); retval = vm_map_page_range_info_internal(sub_map, submap_s_offset, submap_e_offset, effective_page_shift, VM_PAGE_INFO_BASIC, (vm_page_info_t) submap_info, count); assert(retval == KERN_SUCCESS); vm_map_lock_read(map); vm_map_deallocate(sub_map); /* Move the "info" index by the number of pages we inspected.*/ info_idx += range_len >> effective_page_shift; /* Move our current offset by the size of the range we inspected.*/ curr_s_offset += range_len; continue; } object = VME_OBJECT(map_entry); if (object == VM_OBJECT_NULL) { /* * We don't have an object here and, hence, * no pages to inspect. We'll fill up the * info structure appropriately. */ curr_e_offset = MIN(map_entry->vme_end, end); uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift; void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic)); curr_s_offset = curr_e_offset; info_idx += num_pages; continue; } if (do_region_footprint) { disposition = 0; if (map->has_corpse_footprint) { /* * Query the page info data we saved * while forking the corpse. */ vm_map_corpse_footprint_query_page_info( map, curr_s_offset, &disposition); } else { /* * Query the live pmap for footprint info * about this page. */ vm_map_footprint_query_page_info( map, map_entry, curr_s_offset, &disposition); } switch (flavor) { case VM_PAGE_INFO_BASIC: basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); basic_info->disposition = disposition; basic_info->ref_count = 1; basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); basic_info->offset = 0; basic_info->depth = 0; info_idx++; break; } curr_s_offset += effective_page_size; continue; } vm_object_reference(object); /* * Shared mode -- so we can allow other readers * to grab the lock too. 
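 * We only read object state and page flags below, so a shared
 * lock is sufficient.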
*/ vm_object_lock_shared(object); curr_e_offset = MIN(map_entry->vme_end, end); vm_map_unlock_read(map); map_entry = NULL; /* map is unlocked, the entry is no longer valid. */ curr_object = object; for (; curr_s_offset < curr_e_offset;) { if (object == curr_object) { /* account for our object reference above. */ ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1; } else { ref_count = os_ref_get_count_raw(&curr_object->ref_count); } curr_offset_in_object = offset_in_object; for (;;) { m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object)); if (m != VM_PAGE_NULL) { disposition |= VM_PAGE_QUERY_PAGE_PRESENT; break; } else { if (curr_object->internal && curr_object->alive && !curr_object->terminating && curr_object->pager_ready) { if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object)) == VM_EXTERNAL_STATE_EXISTS) { /* the pager has that page */ disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; break; } } /* * Go down the VM object shadow chain until we find the page * we're looking for. */ if (curr_object->shadow != VM_OBJECT_NULL) { vm_object_t shadow = VM_OBJECT_NULL; curr_offset_in_object += curr_object->vo_shadow_offset; shadow = curr_object->shadow; vm_object_lock_shared(shadow); vm_object_unlock(curr_object); curr_object = shadow; depth++; continue; } else { break; } } } /* * The ref_count is not strictly accurate: it measures the number * of entities holding a ref on the object; they may not be mapping * the object, or may not be mapping the section holding the target * page, but it's still a ballpark number and, though an over-count, * it picks up the copy-on-write cases. * We could also get a picture of page sharing from pmap_attributes, * but this would under-count, as only faulted-in mappings would * show up. */ if ((curr_object == object) && curr_object->shadow) { disposition |= VM_PAGE_QUERY_PAGE_COPIED; } if (!curr_object->internal) { disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL; } if (m != VM_PAGE_NULL) { if (m->vmp_fictitious) { disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS; } else { if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) { disposition |= VM_PAGE_QUERY_PAGE_DIRTY; } if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) { disposition |= VM_PAGE_QUERY_PAGE_REF; } if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) { disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE; } /* * XXX TODO4K: * when this routine deals with 4k * pages, check the appropriate CS bit * here. */ if (m->vmp_cs_validated) { disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED; } if (m->vmp_cs_tainted) { disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED; } if (m->vmp_cs_nx) { disposition |= VM_PAGE_QUERY_PAGE_CS_NX; } if (m->vmp_reusable || curr_object->all_reusable) { disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; } } } switch (flavor) { case VM_PAGE_INFO_BASIC: basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); basic_info->disposition = disposition; basic_info->ref_count = ref_count; basic_info->object_id = (vm_object_id_t) (uintptr_t) VM_KERNEL_ADDRHASH(curr_object); basic_info->offset = (memory_object_offset_t) curr_offset_in_object + offset_in_page; basic_info->depth = depth; info_idx++; break; } disposition = 0; offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset. /* * Move to next offset in the range and in our object.
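 * If we walked down a shadow chain for this page, drop back to
 * the top object before looking at the next page.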
*/ curr_s_offset += effective_page_size; offset_in_object += effective_page_size; curr_offset_in_object = offset_in_object; if (curr_object != object) { vm_object_unlock(curr_object); curr_object = object; vm_object_lock_shared(curr_object); } else { vm_object_lock_yield_shared(curr_object); } } vm_object_unlock(curr_object); vm_object_deallocate(curr_object); vm_map_lock_read(map); } vm_map_unlock_read(map); return retval; } static __attribute__((always_inline, warn_unused_result)) kern_return_t vm_map_msync_sanitize( vm_map_t map, vm_map_address_ut address_u, vm_map_size_ut size_u, vm_object_offset_t *address, vm_map_size_t *size) { vm_object_offset_t end; return vm_sanitize_addr_size(address_u, size_u, VM_SANITIZE_CALLER_VM_MAP_MSYNC, map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, address, &end, size); } /* * vm_map_msync * * Synchronizes the specified memory range with its backing store * image by either flushing or cleaning the contents to the appropriate * memory manager, engaging in a memory object synchronize dialog with * the manager. The client doesn't return until the manager issues * an m_o_s_completed message. MIG magically converts the user task * parameter to the task's address map. * * interpretation of sync_flags * VM_SYNC_INVALIDATE - discard pages, only return precious * pages to manager. * * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS) * - discard pages, write dirty or precious * pages back to memory manager. * * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS * - write dirty or precious pages back to * the memory manager. * * VM_SYNC_CONTIGUOUS - does everything normally, but if there * is a hole in the region, and we would * have returned KERN_SUCCESS, return * KERN_INVALID_ADDRESS instead. * * NOTE * The memory object attributes have not yet been implemented; this * function will eventually have to deal with the invalidate attribute. * * RETURNS * KERN_INVALID_TASK Bad task parameter * KERN_INVALID_ARGUMENT both sync and async were specified. * KERN_SUCCESS The usual. * KERN_INVALID_ADDRESS There was a hole in the region. */ kern_return_t vm_map_msync( vm_map_t map, vm_map_address_ut address_u, vm_map_size_ut size_u, vm_sync_t sync_flags) { vm_map_entry_t entry; vm_map_size_t size, amount_left; vm_object_offset_t address, offset; vm_object_offset_t start_offset, end_offset; boolean_t do_sync_req; boolean_t had_hole = FALSE; vm_map_offset_t pmap_offset; kern_return_t kr; if ((sync_flags & VM_SYNC_ASYNCHRONOUS) && (sync_flags & VM_SYNC_SYNCHRONOUS)) { return KERN_INVALID_ARGUMENT; } if (map == VM_MAP_NULL) { return KERN_INVALID_TASK; } kr = vm_map_msync_sanitize(map, address_u, size_u, &address, &size); if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags); } if (__improbable(kr != KERN_SUCCESS)) { return vm_sanitize_get_kr(kr); } amount_left = size; while (amount_left > 0) { vm_object_size_t flush_size; vm_object_t object; vm_map_lock(map); if (!vm_map_lookup_entry(map, address, &entry)) { vm_map_size_t skip; /* * hole in the address map. */ had_hole = TRUE; if (sync_flags & VM_SYNC_KILLPAGES) { /* * For VM_SYNC_KILLPAGES, there should be * no holes in the range, since we couldn't * prevent someone else from allocating in * that hole and we wouldn't want to "kill" * their pages. */ vm_map_unlock(map); break; } /* * Check for empty map.
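 * (vm_map_lookup_entry() returned the map header, and the
 * header's next pointer is the header itself: no entries at all.)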
*/ if (entry == vm_map_to_entry(map) && entry->vme_next == entry) { vm_map_unlock(map); break; } /* * Check that we don't wrap and that * we have at least one real map entry. */ if ((map->hdr.nentries == 0) || (entry->vme_next->vme_start < address)) { vm_map_unlock(map); break; } /* * Move up to the next entry if needed */ skip = (entry->vme_next->vme_start - address); if (skip >= amount_left) { amount_left = 0; } else { amount_left -= skip; } address = entry->vme_next->vme_start; vm_map_unlock(map); continue; } offset = address - entry->vme_start; pmap_offset = address; /* * do we have more to flush than is contained in this * entry ? */ if (amount_left + entry->vme_start + offset > entry->vme_end) { flush_size = entry->vme_end - (entry->vme_start + offset); } else { flush_size = amount_left; } amount_left -= flush_size; address += flush_size; if (entry->is_sub_map == TRUE) { vm_map_t local_map; vm_map_offset_t local_offset; local_map = VME_SUBMAP(entry); local_offset = VME_OFFSET(entry); vm_map_reference(local_map); vm_map_unlock(map); if (vm_map_msync( local_map, local_offset, flush_size, sync_flags) == KERN_INVALID_ADDRESS) { had_hole = TRUE; } vm_map_deallocate(local_map); continue; } object = VME_OBJECT(entry); /* * We can't sync this object if the object has not been * created yet */ if (object == VM_OBJECT_NULL) { vm_map_unlock(map); continue; } offset += VME_OFFSET(entry); vm_object_lock(object); if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) { int kill_pages = 0; if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { /* * This is a destructive operation and so we * err on the side of limiting the range of * the operation. */ start_offset = vm_object_round_page(offset); end_offset = vm_object_trunc_page(offset + flush_size); if (end_offset <= start_offset) { vm_object_unlock(object); vm_map_unlock(map); continue; } pmap_offset += start_offset - offset; } else { start_offset = offset; end_offset = offset + flush_size; } if (sync_flags & VM_SYNC_KILLPAGES) { if (((os_ref_get_count_raw(&object->ref_count) == 1) || ((object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) && (object->vo_copy == VM_OBJECT_NULL))) && (object->shadow == VM_OBJECT_NULL)) { if (os_ref_get_count_raw(&object->ref_count) != 1) { vm_page_stats_reusable.free_shared++; } kill_pages = 1; } else { kill_pages = -1; } } if (kill_pages != -1) { vm_object_deactivate_pages( object, start_offset, (vm_object_size_t) (end_offset - start_offset), kill_pages, FALSE, /* reusable_pages */ FALSE, /* reusable_no_write */ map->pmap, pmap_offset); } vm_object_unlock(object); vm_map_unlock(map); continue; } /* * We can't sync this object if there isn't a pager. * Don't bother to sync internal objects, since there can't * be any "permanent" storage for these objects anyway. 
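 * (Internal memory pages to the compressor or to swap, not to a
 * file, so there is nothing for msync to flush here.)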
*/ if ((object->pager == MEMORY_OBJECT_NULL) || (object->internal) || (object->private)) { vm_object_unlock(object); vm_map_unlock(map); continue; } /* * keep reference on the object until syncing is done */ vm_object_reference_locked(object); vm_object_unlock(object); vm_map_unlock(map); if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { start_offset = vm_object_trunc_page(offset); end_offset = vm_object_round_page(offset + flush_size); } else { start_offset = offset; end_offset = offset + flush_size; } do_sync_req = vm_object_sync(object, start_offset, (end_offset - start_offset), sync_flags & VM_SYNC_INVALIDATE, ((sync_flags & VM_SYNC_SYNCHRONOUS) || (sync_flags & VM_SYNC_ASYNCHRONOUS)), sync_flags & VM_SYNC_SYNCHRONOUS); if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) { /* * clear out the clustering and read-ahead hints */ vm_object_lock(object); object->pages_created = 0; object->pages_used = 0; object->sequential = 0; object->last_alloc = 0; vm_object_unlock(object); } vm_object_deallocate(object); } /* while */ /* for proper msync() behaviour */ if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) { return KERN_INVALID_ADDRESS; } return KERN_SUCCESS; }/* vm_msync */ void vm_named_entry_associate_vm_object( vm_named_entry_t named_entry, vm_object_t object, vm_object_offset_t offset, vm_object_size_t size, vm_prot_t prot) { vm_map_copy_t copy; vm_map_entry_t copy_entry; assert(!named_entry->is_sub_map); assert(!named_entry->is_copy); assert(!named_entry->is_object); assert(!named_entry->internal); assert(named_entry->backing.copy == VM_MAP_COPY_NULL); copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST); copy->offset = offset; copy->size = size; copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT; copy_entry = vm_map_copy_entry_create(copy); copy_entry->protection = prot; copy_entry->max_protection = prot; copy_entry->use_pmap = TRUE; copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK); copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK); VME_OBJECT_SET(copy_entry, object, false, 0); VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset)); vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry); named_entry->backing.copy = copy; named_entry->is_object = TRUE; if (object->internal) { named_entry->internal = TRUE; } DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, object, offset, size, prot); } vm_object_t vm_named_entry_to_vm_object( vm_named_entry_t named_entry) { vm_map_copy_t copy; vm_map_entry_t copy_entry; vm_object_t object; assert(!named_entry->is_sub_map); assert(!named_entry->is_copy); assert(named_entry->is_object); copy = named_entry->backing.copy; assert(copy != VM_MAP_COPY_NULL); /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged */ vm_map_copy_require(copy); assert(copy->cpy_hdr.nentries == 1); copy_entry = vm_map_copy_first_entry(copy); object = VME_OBJECT(copy_entry); DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection); return object; } /* * Routine: convert_port_entry_to_map * Purpose: * Convert from a port specifying an entry or a task * to a map. Doesn't consume the port ref; produces a map ref, * which may be null. 
Unlike convert_port_to_map, the * port may be backed by a task or a named entry. * Conditions: * Nothing locked. */ vm_map_t convert_port_entry_to_map( ipc_port_t port) { vm_map_t map = VM_MAP_NULL; vm_named_entry_t named_entry; if (!IP_VALID(port)) { return VM_MAP_NULL; } if (ip_kotype(port) != IKOT_NAMED_ENTRY) { return convert_port_to_map(port); } named_entry = mach_memory_entry_from_port(port); if ((named_entry->is_sub_map) && (named_entry->protection & VM_PROT_WRITE)) { map = named_entry->backing.map; if (map->pmap != PMAP_NULL) { if (map->pmap == kernel_pmap) { panic("userspace has access " "to a kernel map %p", map); } pmap_require(map->pmap); } vm_map_reference(map); } return map; } /* * Export routines to other components for the things we access locally through * macros. */ #undef current_map vm_map_t current_map(void) { return current_map_fast(); } /* * vm_map_reference: * * Takes a reference on the specified map. */ void vm_map_reference( vm_map_t map) { if (__probable(map != VM_MAP_NULL)) { vm_map_require(map); os_ref_retain_raw(&map->map_refcnt, &map_refgrp); } } /* * vm_map_deallocate: * * Removes a reference from the specified map, * destroying it if no references remain. * The map should not be locked. */ void vm_map_deallocate( vm_map_t map) { if (__probable(map != VM_MAP_NULL)) { vm_map_require(map); if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) { vm_map_destroy(map); } } } void vm_map_inspect_deallocate( vm_map_inspect_t map) { vm_map_deallocate((vm_map_t)map); } void vm_map_read_deallocate( vm_map_read_t map) { vm_map_deallocate((vm_map_t)map); } void vm_map_disable_NX(vm_map_t map) { if (map == NULL) { return; } if (map->pmap == NULL) { return; } pmap_disable_NX(map->pmap); } void vm_map_disallow_data_exec(vm_map_t map) { if (map == NULL) { return; } map->map_disallow_data_exec = TRUE; } /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS) * more descriptive. */ void vm_map_set_32bit(vm_map_t map) { #if defined(__arm64__) map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE); #else map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS; #endif } void vm_map_set_64bit(vm_map_t map) { #if defined(__arm64__) map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE); #else map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS; #endif } /* * Expand the maximum size of an existing map to 64GB. */ void vm_map_set_jumbo(vm_map_t map) { #if defined (__arm64__) && !XNU_TARGET_OS_OSX vm_map_set_max_addr(map, ~0, false); #else /* arm64 */ (void) map; #endif } #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT /* * Expand the maximum size of an existing map to the maximum supported.
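 * (i.e. vm_map_set_max_addr() with "extra_jumbo" set, which selects the
 * ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO limit from the pmap layer instead of
 * the regular ARM_PMAP_MAX_OFFSET_JUMBO one)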
*/ void vm_map_set_extra_jumbo(vm_map_t map) { #if defined (__arm64__) && !XNU_TARGET_OS_OSX vm_map_set_max_addr(map, ~0, true); #else /* arm64 */ (void) map; #endif } #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */ /* * This map has a JIT entitlement */ void vm_map_set_jit_entitled(vm_map_t map) { #if defined (__arm64__) pmap_set_jit_entitled(map->pmap); #else /* arm64 */ (void) map; #endif } /* * Get the status of this map's TPRO flag */ boolean_t vm_map_tpro(vm_map_t map) { #if defined (__arm64e__) return pmap_get_tpro(map->pmap); #else /* arm64e */ (void) map; return FALSE; #endif } /* * This map has TPRO enabled */ void vm_map_set_tpro(vm_map_t map) { #if defined (__arm64e__) pmap_set_tpro(map->pmap); #else /* arm64e */ (void) map; #endif } /* * Does this map have TPRO enforcement enabled? */ boolean_t vm_map_tpro_enforcement(vm_map_t map) { return map->tpro_enforcement; } /* * Set TPRO enforcement for this map */ void vm_map_set_tpro_enforcement(vm_map_t map) { if (vm_map_tpro(map)) { vm_map_lock(map); map->tpro_enforcement = TRUE; vm_map_unlock(map); } } /* * Enable TPRO on the requested region * * Note: * This routine is primarily intended to be called during/soon after map * creation before the associated task has been released to run. It is only * currently safe when we have no resident pages. */ boolean_t vm_map_set_tpro_range( __unused vm_map_t map, __unused vm_map_address_t start, __unused vm_map_address_t end) { return TRUE; } /* * Expand the maximum size of an existing map. */ void vm_map_set_max_addr( vm_map_t map, vm_map_offset_t new_max_offset, __unused bool extra_jumbo) { #if defined(__arm64__) vm_map_offset_t max_supported_offset; vm_map_offset_t old_max_offset; unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO; vm_map_lock(map); old_max_offset = map->max_offset; #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT if (extra_jumbo) { option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO; } #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */ max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option); new_max_offset = trunc_page(new_max_offset); /* The address space cannot be shrunk using this routine. */ if (old_max_offset >= new_max_offset) { vm_map_unlock(map); return; } if (max_supported_offset < new_max_offset) { new_max_offset = max_supported_offset; } map->max_offset = new_max_offset; /* * Disable the following chunk of code that extends the "holes" list * to accommodate a larger VM map. * In `vm_map_create_options()`, we now set the end of the "holes" list to * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms. * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS. * The "holes" list does not need to be adjusted. */ #if 0 if (map->holelistenabled) { if (map->holes_list->prev->vme_end == old_max_offset) { /* * There is already a hole at the end of the map; simply make it bigger. */ map->holes_list->prev->vme_end = map->max_offset; } else { /* * There is no hole at the end, so we need to create a new hole * for the new empty space we're creating.
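 * The new hole would span [old_max_offset, map->max_offset) and get
 * linked in at the tail of the circular "holes" list, as the disabled
 * code below illustrates.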
*/ struct vm_map_links *new_hole; new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL); new_hole->start = old_max_offset; new_hole->end = map->max_offset; new_hole->prev = map->holes_list->prev; new_hole->next = (struct vm_map_entry *)map->holes_list; map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole; map->holes_list->prev = (struct vm_map_entry *)new_hole; } } #endif vm_map_unlock(map); #else (void)map; (void)new_max_offset; #endif } vm_map_offset_t vm_compute_max_offset(boolean_t is64) { #if defined(__arm64__) return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE); #else return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS; #endif } void vm_map_get_max_aslr_slide_section( vm_map_t map __unused, int64_t *max_sections, int64_t *section_size) { #if defined(__arm64__) *max_sections = 3; *section_size = ARM_TT_TWIG_SIZE; #else *max_sections = 1; *section_size = 0; #endif } uint64_t vm_map_get_max_aslr_slide_pages(vm_map_t map) { #if defined(__arm64__) /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more * limited embedded address space; this is also meant to minimize pmap * memory usage on 16KB page systems. */ return 1 << (24 - VM_MAP_PAGE_SHIFT(map)); #else return 1 << (vm_map_is_64bit(map) ? 16 : 8); #endif } uint64_t vm_map_get_max_loader_aslr_slide_pages(vm_map_t map) { #if defined(__arm64__) /* We limit the loader slide to 4MB, in order to ensure at least 8 bits * of independent entropy on 16KB page systems. */ return 1 << (22 - VM_MAP_PAGE_SHIFT(map)); #else return 1 << (vm_map_is_64bit(map) ? 16 : 8); #endif } boolean_t vm_map_is_64bit( vm_map_t map) { return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS); } boolean_t vm_map_has_hard_pagezero( vm_map_t map, vm_map_offset_t pagezero_size) { /* * XXX FBDP * We should lock the VM map (for read) here but we can get away * with it for now because there can't really be any race condition: * the VM map's min_offset is changed only when the VM map is created * and when the zero page is established (when the binary gets loaded), * and this routine gets called only when the task terminates and the * VM map is being torn down, and when a new map is created via * load_machfile()/execve(). */ return map->min_offset >= pagezero_size; } /* * Raise a VM map's maximum offset. */ kern_return_t vm_map_raise_max_offset( vm_map_t map, vm_map_offset_t new_max_offset) { kern_return_t ret; vm_map_lock(map); ret = KERN_INVALID_ADDRESS; if (new_max_offset >= map->max_offset) { if (!vm_map_is_64bit(map)) { if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) { map->max_offset = new_max_offset; ret = KERN_SUCCESS; } } else { if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) { map->max_offset = new_max_offset; ret = KERN_SUCCESS; } } } vm_map_unlock(map); return ret; } /* * Raise a VM map's minimum offset. * To strictly enforce "page zero" reservation. */ kern_return_t vm_map_raise_min_offset( vm_map_t map, vm_map_offset_t new_min_offset) { vm_map_entry_t first_entry; new_min_offset = vm_map_round_page(new_min_offset, VM_MAP_PAGE_MASK(map)); vm_map_lock(map); if (new_min_offset < map->min_offset) { /* * Can't move min_offset backwards, as that would expose * a part of the address space that was previously, and for * possibly good reasons, inaccessible.
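 * For example, once a 64-bit process's hard page zero has been
 * established, lowering min_offset again would let new mappings appear
 * inside the NULL-pointer protection region. A hypothetical caller
 * enforcing a page-zero reservation (sketch only; "pagezero_end" is an
 * assumed variable, not part of this file):
 *
 *	kr = vm_map_raise_min_offset(map, pagezero_end);
 *	-> KERN_NO_SPACE if something was already mapped below
 *	   "pagezero_end", KERN_SUCCESS otherwise.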
*/ vm_map_unlock(map); return KERN_INVALID_ADDRESS; } if (new_min_offset >= map->max_offset) { /* can't go beyond the end of the address space */ vm_map_unlock(map); return KERN_INVALID_ADDRESS; } first_entry = vm_map_first_entry(map); if (first_entry != vm_map_to_entry(map) && first_entry->vme_start < new_min_offset) { /* * Some memory was already allocated below the new * minimum offset. It's too late to change it now... */ vm_map_unlock(map); return KERN_NO_SPACE; } map->min_offset = new_min_offset; if (map->holelistenabled) { assert(map->holes_list); map->holes_list->start = new_min_offset; assert(new_min_offset < map->holes_list->end); } vm_map_unlock(map); return KERN_SUCCESS; } /* * Set the limit on the maximum amount of address space and user wired memory allowed for this map. * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't * have to reach over to the BSD data structures. */ uint64_t vm_map_set_size_limit_count = 0; kern_return_t vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit) { kern_return_t kr; vm_map_lock(map); if (new_size_limit < map->size) { /* new limit should not be lower than its current size */ DTRACE_VM2(vm_map_set_size_limit_fail, vm_map_size_t, map->size, uint64_t, new_size_limit); kr = KERN_FAILURE; } else if (new_size_limit == map->size_limit) { /* no change */ kr = KERN_SUCCESS; } else { /* set new limit */ DTRACE_VM2(vm_map_set_size_limit, vm_map_size_t, map->size, uint64_t, new_size_limit); if (new_size_limit != RLIM_INFINITY) { vm_map_set_size_limit_count++; } map->size_limit = new_size_limit; kr = KERN_SUCCESS; } vm_map_unlock(map); return kr; } uint64_t vm_map_set_data_limit_count = 0; kern_return_t vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit) { kern_return_t kr; vm_map_lock(map); if (new_data_limit < map->size) { /* new limit should not be lower than its current size */ DTRACE_VM2(vm_map_set_data_limit_fail, vm_map_size_t, map->size, uint64_t, new_data_limit); kr = KERN_FAILURE; } else if (new_data_limit == map->data_limit) { /* no change */ kr = KERN_SUCCESS; } else { /* set new limit */ DTRACE_VM2(vm_map_set_data_limit, vm_map_size_t, map->size, uint64_t, new_data_limit); if (new_data_limit != RLIM_INFINITY) { vm_map_set_data_limit_count++; } map->data_limit = new_data_limit; kr = KERN_SUCCESS; } vm_map_unlock(map); return kr; } void vm_map_set_user_wire_limit(vm_map_t map, vm_size_t limit) { vm_map_lock(map); map->user_wire_limit = limit; vm_map_unlock(map); } void vm_map_switch_protect(vm_map_t map, boolean_t val) { vm_map_lock(map); map->switch_protect = val; vm_map_unlock(map); } extern int cs_process_enforcement_enable; boolean_t vm_map_cs_enforcement( vm_map_t map) { if (cs_process_enforcement_enable) { return TRUE; } return map->cs_enforcement; } kern_return_t vm_map_cs_wx_enable( __unused vm_map_t map) { #if CODE_SIGNING_MONITOR kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map)); if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) { return KERN_SUCCESS; } return ret; #else /* The VM manages WX memory entirely on its own */ return KERN_SUCCESS; #endif } kern_return_t vm_map_csm_allow_jit( __unused vm_map_t map) { #if CODE_SIGNING_MONITOR return csm_allow_jit_region(vm_map_pmap(map)); #else /* No code signing monitor to enforce JIT policy */ return KERN_SUCCESS; #endif } void vm_map_cs_debugged_set( vm_map_t map, boolean_t val) { vm_map_lock(map); map->cs_debugged
= val; vm_map_unlock(map); } void vm_map_cs_enforcement_set( vm_map_t map, boolean_t val) { vm_map_lock(map); map->cs_enforcement = val; pmap_set_vm_map_cs_enforced(map->pmap, val); vm_map_unlock(map); } /* * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately. * phys_footprint is a composite limit consisting of iokit + physmem, so we need to * bump both counters. */ void vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes) { pmap_t pmap = vm_map_pmap(map); ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes); ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes); } void vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes) { pmap_t pmap = vm_map_pmap(map); ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes); ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes); } /* Add (generate) code signature for memory range */ #if CONFIG_DYNAMIC_CODE_SIGNING kern_return_t vm_map_sign(vm_map_t map, vm_map_offset_t start, vm_map_offset_t end) { vm_map_entry_t entry; vm_page_t m; vm_object_t object; /* * Vet all the input parameters and current type and state of the * underlying object. Return with an error if anything is amiss. */ if (map == VM_MAP_NULL) { return KERN_INVALID_ARGUMENT; } if (__improbable(vm_map_range_overflows(map, start, end - start))) { return KERN_INVALID_ADDRESS; } vm_map_lock_read(map); if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) { /* * Must pass a valid non-submap address. */ vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } if ((entry->vme_start > start) || (entry->vme_end < end)) { /* * Map entry doesn't cover the requested range. Not handling * this situation currently. */ vm_map_unlock_read(map); return KERN_INVALID_ARGUMENT; } object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { /* * Object must already be present or we can't sign. */ vm_map_unlock_read(map); return KERN_INVALID_ARGUMENT; } vm_object_lock(object); vm_map_unlock_read(map); while (start < end) { uint32_t refmod; m = vm_page_lookup(object, start - entry->vme_start + VME_OFFSET(entry)); if (m == VM_PAGE_NULL) { /* should we try to fault a page here? We can probably * demand it exists and is locked for this request */ vm_object_unlock(object); return KERN_FAILURE; } /* deal with special page status */ if (m->vmp_busy || (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) { vm_object_unlock(object); return KERN_FAILURE; } /* Page is OK... now "validate" it */ /* This is the place where we'll call out to create a code * directory, later */ /* XXX TODO4K: deal with 4k subpages individually? */ m->vmp_cs_validated = VMP_CS_ALL_TRUE; /* The page is now "clean" for codesigning purposes. That means * we don't consider it as modified (wpmapped) anymore. But * we'll disconnect the page so we note any future modification * attempts.
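 * (pmap_disconnect() removes every pmap mapping of the page and returns
 * the referenced/modified bits that had accumulated in the pmap; the
 * modified bit is folded back into vmp_dirty right below, so no
 * modification is lost by the disconnect.)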
*/ m->vmp_wpmapped = FALSE; refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); /* Pull the dirty status from the pmap, since we cleared the * wpmapped bit */ if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) { SET_PAGE_DIRTY(m, FALSE); } /* On to the next page */ start += PAGE_SIZE; } vm_object_unlock(object); return KERN_SUCCESS; } #endif kern_return_t vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed) { vm_map_entry_t entry = VM_MAP_ENTRY_NULL; vm_map_entry_t next_entry; kern_return_t kr = KERN_SUCCESS; VM_MAP_ZAP_DECLARE(zap_list); vm_map_lock(map); for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = next_entry) { next_entry = entry->vme_next; if (!entry->is_sub_map && VME_OBJECT(entry) && (VME_OBJECT(entry)->internal == TRUE) && (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) { *reclaimed_resident += VME_OBJECT(entry)->resident_page_count; *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager); (void)vm_map_delete(map, entry->vme_start, entry->vme_end, VM_MAP_REMOVE_NO_YIELD, KMEM_GUARD_NONE, &zap_list); } } vm_map_unlock(map); vm_map_zap_dispose(&zap_list); return kr; } #if DEVELOPMENT || DEBUG int vm_map_disconnect_page_mappings( vm_map_t map, boolean_t do_unnest) { vm_map_entry_t entry; ledger_amount_t byte_count = 0; if (do_unnest == TRUE) { #ifndef NO_NESTED_PMAP vm_map_lock(map); for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = entry->vme_next) { if (entry->is_sub_map && entry->use_pmap) { /* * Make sure the range between the start of this entry and * the end of this entry is no longer nested, so that * we will only remove mappings from the pmap in use by * this task */ vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end); } } vm_map_unlock(map); #endif } vm_map_lock_read(map); ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count); for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = entry->vme_next) { if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) || (VME_OBJECT(entry)->phys_contiguous))) { continue; } if (entry->is_sub_map) { assert(!entry->use_pmap); } pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0); } vm_map_unlock_read(map); return (int) (byte_count / VM_MAP_PAGE_SIZE(map)); } kern_return_t vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr) { vm_object_t object = NULL; vm_object_offset_t offset; vm_prot_t prot; boolean_t wired; vm_map_version_t version; vm_map_t real_map; int result = KERN_FAILURE; vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); vm_map_lock(map); result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ, OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired, NULL, &real_map, NULL); if (object == NULL) { result = KERN_MEMORY_ERROR; } else if (object->pager) { result = vm_compressor_pager_inject_error(object->pager, offset); } else { result = KERN_MEMORY_PRESENT; } if (object != NULL) { vm_object_unlock(object); } if (real_map != map) { vm_map_unlock(real_map); } vm_map_unlock(map); return result; } /* iterate over map entries.
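 * (A usage sketch, illustrative only; "handle_entry" is a hypothetical
 * callback, not a routine in this file:
 *
 *	kr = vm_map_entries_foreach(map,
 *	    ^(int nentries) {
 *	        return nentries > 0 ? KERN_SUCCESS : KERN_FAILURE;
 *	    },
 *	    ^(void *entry) {
 *	        return handle_entry((vm_map_entry_t)entry);
 *	    });
 * )
 *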
Call the first argument block for the number of entries and the second for every entry * returns: KERN_SUCCESS if iteration completed OK, * error code if callback returned an error * KERN_FAILURE if there was a race of adding/removing entries during the iteration and the number of entries * iterated is different from the number in the first call */ static kern_return_t vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries), kern_return_t (^entry_handler)(void* entry)) { vm_map_lock_assert_held(map); int nentries = map->hdr.nentries; kern_return_t error = count_handler(nentries); if (error) { return error; } /* iterate until we loop back to the map, see get_vmmap_entries() */ vm_map_entry_t entry = vm_map_first_entry(map); int count = 0; while (entry != vm_map_to_entry(map)) { error = entry_handler(entry); if (error != KERN_SUCCESS) { return error; } entry = entry->vme_next; ++count; if (count > nentries) { /* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */ return KERN_FAILURE; } } if (count < nentries) { return KERN_FAILURE; } return KERN_SUCCESS; } kern_return_t vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries), kern_return_t (^entry_handler)(void* entry)) { vm_map_lock_read(map); kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler); vm_map_unlock_read(map); return error; } /* * Dump info about the entry into the given buffer. * returns KERN_SUCCESS on success, KERN_NO_SPACE if there was not enough space in the given buffer * argument size in: bytes free in the given buffer, out: bytes written */ kern_return_t vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size) { size_t insize = *size; kern_return_t kr; size_t offset = 0; *size = 0; if (sizeof(struct vm_map_entry_info) > insize) { return KERN_NO_SPACE; } vm_map_entry_t entry = (vm_map_entry_t)pentry; struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf; out_entry->vmei_start = entry->vme_start; out_entry->vmei_end = entry->vme_end; out_entry->vmei_alias = VME_ALIAS(entry); out_entry->vmei_offset = VME_OFFSET(entry); out_entry->vmei_is_sub_map = entry->is_sub_map; out_entry->vmei_protection = entry->protection; offset += sizeof(struct vm_map_entry_info); out_entry->vmei_slot_mapping_count = 0; out_entry->vmei_is_compressor_pager = false; *size = offset; if (out_entry->vmei_is_sub_map) { return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet } /* have a vm_object? */ vm_object_t object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL || !object->internal) { return KERN_SUCCESS; } /* object has a pager?
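 * (If it does, vm_compressor_pager_dump() below appends the pager's info
 * at buf + offset and reports the number of bytes written back through
 * pager_info_size, which then feeds the caller-visible *size.)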
*/ memory_object_t pager = object->pager; if (pager == MEMORY_OBJECT_NULL) { return KERN_SUCCESS; } bool is_compressor = false; unsigned int slot_mapping_count = 0; size_t pager_info_size = insize - offset; kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count); if (kr != KERN_SUCCESS) { /* didn't have enough space for everything we want to write, caller needs to retry */ return kr; } offset += pager_info_size; /* if we got here, is_compressor should be true due to the object->internal check above, so this assignment * is just for sanity's sake */ out_entry->vmei_is_compressor_pager = is_compressor; out_entry->vmei_slot_mapping_count = slot_mapping_count; *size = offset; return KERN_SUCCESS; } #endif #if CONFIG_FREEZE extern struct freezer_context freezer_context_global; AbsoluteTime c_freezer_last_yield_ts = 0; extern unsigned int memorystatus_freeze_private_shared_pages_ratio; extern unsigned int memorystatus_freeze_shared_mb_per_process_max; kern_return_t vm_map_freeze( task_t task, unsigned int *purgeable_count, unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, unsigned int dirty_budget, unsigned int *shared_count, int *freezer_error_code, boolean_t eval_only) { vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL; kern_return_t kr = KERN_SUCCESS; boolean_t evaluation_phase = TRUE; vm_object_t cur_shared_object = NULL; int cur_shared_obj_ref_cnt = 0; unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0; *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0; /* * We need the exclusive lock here so that we can * block any page faults or lookups while we are * in the middle of freezing this vm map. */ vm_map_t map = task->map; vm_map_lock(map); assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { if (vm_compressor_low_on_space()) { *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE; } if (vm_swap_low_on_space()) { *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE; } kr = KERN_NO_SPACE; goto done; } if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) { /* * In-memory compressor backing the freezer. No disk. * So no need to do the evaluation phase. */ evaluation_phase = FALSE; if (eval_only == TRUE) { /* * We don't support 'eval_only' mode * in this non-swap config. */ *freezer_error_code = FREEZER_ERROR_GENERIC; kr = KERN_INVALID_ARGUMENT; goto done; } freezer_context_global.freezer_ctx_uncompressed_pages = 0; clock_get_uptime(&c_freezer_last_yield_ts); } again: for (entry2 = vm_map_first_entry(map); entry2 != vm_map_to_entry(map); entry2 = entry2->vme_next) { vm_object_t src_object; if (entry2->is_sub_map) { continue; } src_object = VME_OBJECT(entry2); if (!src_object || src_object->phys_contiguous || !src_object->internal) { continue; } /* If eligible, scan the entry, moving eligible pages over to our parent object */ if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { /* * We skip purgeable objects during evaluation phase only. * If we decide to freeze this process, we'll explicitly * purge these objects before we go around again with * 'evaluation_phase' set to FALSE. */ if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) { /* * We want to purge objects that may not belong to this task but are mapped * in this task alone.
Since we already purged this task's purgeable memory * at the end of a successful evaluation phase, we want to avoid doing no-op calls * on this task's purgeable objects. Hence the check for only volatile objects. */ if (evaluation_phase || src_object->purgable != VM_PURGABLE_VOLATILE || os_ref_get_count_raw(&src_object->ref_count) != 1) { continue; } vm_object_lock(src_object); if (src_object->purgable == VM_PURGABLE_VOLATILE && os_ref_get_count_raw(&src_object->ref_count) == 1) { purgeable_q_t old_queue; /* object should be on a purgeable queue */ assert(src_object->objq.next != NULL && src_object->objq.prev != NULL); /* move object from its volatile queue to the nonvolatile queue */ old_queue = vm_purgeable_object_remove(src_object); assert(old_queue); if (src_object->purgeable_when_ripe) { /* remove a token from that volatile queue */ vm_page_lock_queues(); vm_purgeable_token_delete_first(old_queue); vm_page_unlock_queues(); } /* purge the object */ vm_object_purge(src_object, 0); } vm_object_unlock(src_object); continue; } /* * Pages belonging to this object could be swapped to disk. * Make sure it's not a shared object because we could end * up just bringing it back in again. * * We try to optimize somewhat by checking for objects that are mapped * more than once within our own map. But we don't do full searches, * we just look at the entries following our current entry. */ if (os_ref_get_count_raw(&src_object->ref_count) > 1) { if (src_object != cur_shared_object) { obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager); dirty_shared_count += obj_pages_snapshot; cur_shared_object = src_object; cur_shared_obj_ref_cnt = 1; continue; } else { cur_shared_obj_ref_cnt++; if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) { /* * Fall through to below and treat this object as private. * So deduct its pages from our shared total and add it to the * private total. 
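 * For example, an object with ref_count == 3 mapped by three entries of
 * this map: the first entry charges the object's pages to
 * dirty_shared_count; by the third entry, cur_shared_obj_ref_cnt reaches
 * 3 == ref_count, meaning every reference is ours, so the same snapshot
 * of pages is moved from the shared tally to the private one.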
*/ dirty_shared_count -= obj_pages_snapshot; dirty_private_count += obj_pages_snapshot; } else { continue; } } } if (os_ref_get_count_raw(&src_object->ref_count) == 1) { dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager); } if (evaluation_phase == TRUE) { continue; } } uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget); *wired_count += src_object->wired_page_count; if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { if (vm_compressor_low_on_space()) { *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE; } if (vm_swap_low_on_space()) { *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE; } kr = KERN_NO_SPACE; break; } if (paged_out_count >= dirty_budget) { break; } dirty_budget -= paged_out_count; } *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL)); if (evaluation_phase) { unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64; if (dirty_shared_count > shared_pages_threshold) { *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY; kr = KERN_FAILURE; goto done; } if (dirty_shared_count && ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) { *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO; kr = KERN_FAILURE; goto done; } evaluation_phase = FALSE; dirty_shared_count = dirty_private_count = 0; freezer_context_global.freezer_ctx_uncompressed_pages = 0; clock_get_uptime(&c_freezer_last_yield_ts); if (eval_only) { kr = KERN_SUCCESS; goto done; } vm_purgeable_purge_task_owned(task); goto again; } else { kr = KERN_SUCCESS; } done: vm_map_unlock(map); if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) { vm_object_compressed_freezer_done(); } return kr; } #endif /* * vm_map_entry_should_cow_for_true_share: * * Determines if the map entry should be clipped and setup for copy-on-write * to avoid applying "true_share" to a large VM object when only a subset is * targeted. * * For now, we target only the map entries created for the Objective C * Garbage Collector, which initially have the following properties: * - alias == VM_MEMORY_MALLOC * - wired_count == 0 * - !needs_copy * and a VM object with: * - internal * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC * - !true_share * - vo_size == ANON_CHUNK_SIZE * * Only non-kernel map entries. */ boolean_t vm_map_entry_should_cow_for_true_share( vm_map_entry_t entry) { vm_object_t object; if (entry->is_sub_map) { /* entry does not point at a VM object */ return FALSE; } if (entry->needs_copy) { /* already set for copy_on_write: done! */ return FALSE; } if (VME_ALIAS(entry) != VM_MEMORY_MALLOC && VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) { /* not a malloc heap or Obj-C Garbage Collector heap */ return FALSE; } if (entry->wired_count) { /* wired: can't change the map entry... */ vm_counters.should_cow_but_wired++; return FALSE; } object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { /* no object yet... */ return FALSE; } if (!object->internal) { /* not an internal object */ return FALSE; } if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { /* not the default copy strategy */ return FALSE; } if (object->true_share) { /* already true_share: too late to avoid it */ return FALSE; } if (VME_ALIAS(entry) == VM_MEMORY_MALLOC && object->vo_size != ANON_CHUNK_SIZE) { /* ... 
not an object created for the ObjC Garbage Collector */ return FALSE; } if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL && object->vo_size != 2048 * 4096) { /* ... not a "MALLOC_SMALL" heap */ return FALSE; } /* * All the criteria match: we have a large object being targeted for "true_share". * To limit the adverse side-effects linked with "true_share", tell the caller to * try and avoid setting up the entire object for "true_share" by clipping the * targeted range and setting it up for copy-on-write. */ return TRUE; } uint64_t vm_map_range_overflows_count = 0; TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE); bool vm_map_range_overflows( vm_map_t map, vm_map_offset_t addr, vm_map_size_t size) { vm_map_offset_t start, end, sum; vm_map_offset_t pgmask; if (size == 0) { /* empty range -> no overflow */ return false; } pgmask = vm_map_page_mask(map); start = vm_map_trunc_page_mask(addr, pgmask); end = vm_map_round_page_mask(addr + size, pgmask); if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) { vm_map_range_overflows_count++; if (vm_map_range_overflows_log) { printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n", proc_selfpid(), proc_best_name(current_proc()), (uint64_t)addr, (uint64_t)size, (uint64_t)pgmask); } DTRACE_VM4(vm_map_range_overflows, vm_map_t, map, uint32_t, pgmask, uint64_t, (uint64_t)addr, uint64_t, (uint64_t)size); return true; } return false; } vm_map_offset_t vm_map_round_page_mask( vm_map_offset_t offset, vm_map_offset_t mask) { return VM_MAP_ROUND_PAGE(offset, mask); } vm_map_offset_t vm_map_trunc_page_mask( vm_map_offset_t offset, vm_map_offset_t mask) { return VM_MAP_TRUNC_PAGE(offset, mask); } boolean_t vm_map_page_aligned( vm_map_offset_t offset, vm_map_offset_t mask) { return ((offset) & mask) == 0; } int vm_map_page_shift( vm_map_t map) { return VM_MAP_PAGE_SHIFT(map); } int vm_map_page_size( vm_map_t map) { return VM_MAP_PAGE_SIZE(map); } vm_map_offset_t vm_map_page_mask( vm_map_t map) { return VM_MAP_PAGE_MASK(map); } kern_return_t vm_map_set_page_shift( vm_map_t map, int pageshift) { if (map->hdr.nentries != 0) { /* too late to change page size */ return KERN_FAILURE; } map->hdr.page_shift = (uint16_t)pageshift; return KERN_SUCCESS; } kern_return_t vm_map_query_volatile( vm_map_t map, mach_vm_size_t *volatile_virtual_size_p, mach_vm_size_t *volatile_resident_size_p, mach_vm_size_t *volatile_compressed_size_p, mach_vm_size_t *volatile_pmap_size_p, mach_vm_size_t *volatile_compressed_pmap_size_p) { mach_vm_size_t volatile_virtual_size; mach_vm_size_t volatile_resident_count; mach_vm_size_t volatile_compressed_count; mach_vm_size_t volatile_pmap_count; mach_vm_size_t volatile_compressed_pmap_count; mach_vm_size_t resident_count; vm_map_entry_t entry; vm_object_t object; /* map should be locked by caller */ volatile_virtual_size = 0; volatile_resident_count = 0; volatile_compressed_count = 0; volatile_pmap_count = 0; volatile_compressed_pmap_count = 0; for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = entry->vme_next) { mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes; if (entry->is_sub_map) { continue; } if (!(entry->protection & VM_PROT_WRITE)) { continue; } object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { continue; } if (object->purgable != VM_PURGABLE_VOLATILE && object->purgable != VM_PURGABLE_EMPTY) { continue; } if (VME_OFFSET(entry)) { /* * If the map entry has been split and the object now * appears several times in the VM 
map, we don't want * to count the object's resident_page_count more than * once. We count it only for the first one, starting * at offset 0 and ignore the other VM map entries. */ continue; } resident_count = object->resident_page_count; if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) { resident_count = 0; } else { resident_count -= (VME_OFFSET(entry) / PAGE_SIZE); } volatile_virtual_size += entry->vme_end - entry->vme_start; volatile_resident_count += resident_count; if (object->pager) { volatile_compressed_count += vm_compressor_pager_get_count(object->pager); } pmap_compressed_bytes = 0; pmap_resident_bytes = pmap_query_resident(map->pmap, entry->vme_start, entry->vme_end, &pmap_compressed_bytes); volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE); volatile_compressed_pmap_count += (pmap_compressed_bytes / PAGE_SIZE); } /* map is still locked on return */ *volatile_virtual_size_p = volatile_virtual_size; *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE; *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE; *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE; *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE; return KERN_SUCCESS; } void vm_map_sizes(vm_map_t map, vm_map_size_t * psize, vm_map_size_t * pfree, vm_map_size_t * plargest_free) { vm_map_entry_t entry; vm_map_offset_t prev; vm_map_size_t free, total_free, largest_free; boolean_t end; if (!map) { *psize = *pfree = *plargest_free = 0; return; } total_free = largest_free = 0; vm_map_lock_read(map); if (psize) { *psize = map->max_offset - map->min_offset; } prev = map->min_offset; for (entry = vm_map_first_entry(map);; entry = entry->vme_next) { end = (entry == vm_map_to_entry(map)); if (end) { free = entry->vme_end - prev; } else { free = entry->vme_start - prev; } total_free += free; if (free > largest_free) { largest_free = free; } if (end) { break; } prev = entry->vme_end; } vm_map_unlock_read(map); if (pfree) { *pfree = total_free; } if (plargest_free) { *plargest_free = largest_free; } } #if VM_SCAN_FOR_SHADOW_CHAIN int vm_map_shadow_max( vm_map_t map) { int shadows, shadows_max; vm_map_entry_t entry; vm_object_t object, next_object; if (map == NULL) { return 0; } shadows_max = 0; vm_map_lock_read(map); for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = entry->vme_next) { if (entry->is_sub_map) { continue; } object = VME_OBJECT(entry); if (object == NULL) { continue; } vm_object_lock_shared(object); for (shadows = 0; object->shadow != NULL; shadows++, object = next_object) { next_object = object->shadow; vm_object_lock_shared(next_object); vm_object_unlock(object); } vm_object_unlock(object); if (shadows > shadows_max) { shadows_max = shadows; } } vm_map_unlock_read(map); return shadows_max; } #endif /* VM_SCAN_FOR_SHADOW_CHAIN */ void vm_commit_pagezero_status(vm_map_t lmap) { pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset); } #if __x86_64__ void vm_map_set_high_start( vm_map_t map, vm_map_offset_t high_start) { map->vmmap_high_start = high_start; } #endif /* __x86_64__ */ #if CODE_SIGNING_MONITOR kern_return_t vm_map_entry_cs_associate( vm_map_t map, vm_map_entry_t entry, vm_map_kernel_flags_t vmk_flags) { vm_object_t cs_object, cs_shadow, backing_object; vm_object_offset_t cs_offset, backing_offset; void *cs_blobs; struct vnode *cs_vnode; kern_return_t cs_ret; if (map->pmap == NULL || entry->is_sub_map || /* XXX FBDP: recurse on sub-range? 
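 * (The early return that follows covers all the cases where there is
 * nothing to associate: no pmap, a submap entry, an address space that
 * the code-signing monitor exempts, or an entry with no VM object yet.)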
*/ (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) || VME_OBJECT(entry) == VM_OBJECT_NULL) { return KERN_SUCCESS; } if (!(entry->protection & VM_PROT_EXECUTE)) { /* * This memory region is not executable, so the code-signing * monitor would usually not care about it... */ if (vmk_flags.vmkf_remap_prot_copy && (entry->max_protection & VM_PROT_EXECUTE)) { /* * ... except if the memory region is being remapped * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY) * which is what a debugger or dtrace would be doing * to prepare to modify an executable page to insert * a breakpoint or activate a probe. * In that case, fall through so that we can mark * this region as being "debugged" and no longer * strictly code-signed. */ } else { /* * Really not executable, so no need to tell the * code-signing monitor. */ return KERN_SUCCESS; } } vm_map_lock_assert_exclusive(map); /* * Check for a debug association mapping before we check for used_for_jit. This * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT * since they are mapped with RW or RX permissions, which the page table monitor * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG, * they will be mapped as USER_EXEC, and that will cause another page table monitor * violation when those USER_EXEC pages are mapped as RW. * * Since these pages switch between RW and RX through mprotect, they mimic what * we expect a debugger to do. As the code signing monitor does not enforce mappings * on macOS systems, this works in our favor here and allows us to continue to * support these legacy-programmed applications without sacrificing security on * the page table or the code signing monitor. We don't need to explicitly check * for entry_for_jit here and the mapping permissions. If the initial mapping is * created with RX, then the application must map it as RW in order to first write * to the page (MAP_JIT mappings must be private and anonymous). The switch to * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy. * Similarly, if the mapping was created as RW, and then switched to RX, * vm_map_protect will again mark the entry as a copy, and both these cases * lead to this if-statement being entered. * * For more information: rdar://115313336. */ if (vmk_flags.vmkf_remap_prot_copy) { cs_ret = csm_associate_debug_region( map->pmap, entry->vme_start, entry->vme_end - entry->vme_start); /* * csm_associate_debug_region returns not supported when the code signing * monitor is disabled. This is intentional, since cs_ret is checked towards * the end of the function, and if it is not supported, then we still want the * VM to perform code-signing enforcement on this entry. That said, if we don't * mark this as a xnu_user_debug page when the code-signing monitor is disabled, * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some * cases, which will cause a violation when attempted to be mapped as writable). */ if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) { entry->vme_xnu_user_debug = TRUE; } #if DEVELOPMENT || DEBUG if (vm_log_xnu_user_debug) { printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n", proc_selfpid(), (get_bsdtask_info(current_task()) ? 
proc_name_address(get_bsdtask_info(current_task())) : "?"), __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->vme_xnu_user_debug, cs_ret); } #endif /* DEVELOPMENT || DEBUG */ goto done; } if (entry->used_for_jit) { cs_ret = csm_associate_jit_region( map->pmap, entry->vme_start, entry->vme_end - entry->vme_start); goto done; } cs_object = VME_OBJECT(entry); vm_object_lock_shared(cs_object); cs_offset = VME_OFFSET(entry); /* find the VM object backed by the code-signed vnode */ for (;;) { /* go to the bottom of cs_object's shadow chain */ for (; cs_object->shadow != VM_OBJECT_NULL; cs_object = cs_shadow) { cs_shadow = cs_object->shadow; cs_offset += cs_object->vo_shadow_offset; vm_object_lock_shared(cs_shadow); vm_object_unlock(cs_object); } if (cs_object->internal || cs_object->pager == MEMORY_OBJECT_NULL) { vm_object_unlock(cs_object); return KERN_SUCCESS; } cs_offset += cs_object->paging_offset; /* * cs_object could be backed by a: * vnode_pager * apple_protect_pager * shared_region_pager * fourk_pager (multiple backing objects -> fail?) * ask the pager if it has a backing VM object */ if (!memory_object_backing_object(cs_object->pager, cs_offset, &backing_object, &backing_offset)) { /* no backing object: cs_object is it */ break; } /* look down the backing object's shadow chain */ vm_object_lock_shared(backing_object); vm_object_unlock(cs_object); cs_object = backing_object; cs_offset = backing_offset; } cs_vnode = vnode_pager_lookup_vnode(cs_object->pager); if (cs_vnode == NULL) { /* no vnode, no code signatures to associate */ cs_ret = KERN_SUCCESS; } else { cs_ret = vnode_pager_get_cs_blobs(cs_vnode, &cs_blobs); assert(cs_ret == KERN_SUCCESS); cs_ret = cs_associate_blob_with_mapping(map->pmap, entry->vme_start, (entry->vme_end - entry->vme_start), cs_offset, cs_blobs); } vm_object_unlock(cs_object); cs_object = VM_OBJECT_NULL; done: if (cs_ret == KERN_SUCCESS) { DTRACE_VM2(vm_map_entry_cs_associate_success, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end); if (vm_map_executable_immutable) { /* * Prevent this executable * mapping from being unmapped * or modified. */ entry->vme_permanent = TRUE; } /* * pmap says it will validate the * code-signing validity of pages * faulted in via this mapping, so * this map entry should be marked so * that vm_fault() bypasses code-signing * validation for faults coming through * this mapping. */ entry->csm_associated = TRUE; } else if (cs_ret == KERN_NOT_SUPPORTED) { /* * pmap won't check the code-signing * validity of pages faulted in via * this mapping, so VM should keep * doing it. */ DTRACE_VM3(vm_map_entry_cs_associate_off, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, int, cs_ret); } else { /* * A real error: do not allow * execution in this mapping. */ DTRACE_VM3(vm_map_entry_cs_associate_failure, vm_map_offset_t, entry->vme_start, vm_map_offset_t, entry->vme_end, int, cs_ret); if (vmk_flags.vmkf_overwrite_immutable) { /* * We can get here when we remap an apple_protect pager * on top of an already cs_associated executable mapping * with the same code signatures, so we don't want to * lose VM_PROT_EXECUTE in that case... 
*/ } else { entry->protection &= ~VM_PROT_ALLEXEC; entry->max_protection &= ~VM_PROT_ALLEXEC; } } return cs_ret; } #endif /* CODE_SIGNING_MONITOR */ inline bool vm_map_is_corpse_source(vm_map_t map) { bool status = false; if (map) { vm_map_lock_read(map); status = map->corpse_source; vm_map_unlock_read(map); } return status; } inline void vm_map_set_corpse_source(vm_map_t map) { if (map) { vm_map_lock(map); map->corpse_source = true; vm_map_unlock(map); } } inline void vm_map_unset_corpse_source(vm_map_t map) { if (map) { vm_map_lock(map); map->corpse_source = false; vm_map_unlock(map); } } /* * FORKED CORPSE FOOTPRINT * * A forked corpse gets a copy of the original VM map but its pmap is mostly * empty since it never ran and never got to fault in any pages. * Collecting footprint info (via "sysctl vm.self_region_footprint") for * a forked corpse would therefore return very little information. * * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option * to vm_map_fork() to collect footprint information from the original VM map * and its pmap, and store it in the forked corpse's VM map. That information * is stored in place of the VM map's "hole list" since we'll never need to * lookup for holes in the corpse's map. * * The corpse's footprint info looks like this: * * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out * as follows: * +---------------------------------------+ * header-> | cf_size | * +-------------------+-------------------+ * | cf_last_region | cf_last_zeroes | * +-------------------+-------------------+ * region1-> | cfr_vaddr | * +-------------------+-------------------+ * | cfr_num_pages | d0 | d1 | d2 | d3 | * +---------------------------------------+ * | d4 | d5 | ... | * +---------------------------------------+ * | ... | * +-------------------+-------------------+ * | dy | dz | na | na | cfr_vaddr... | <-region2 * +-------------------+-------------------+ * | cfr_vaddr (ctd) | cfr_num_pages | * +---------------------------------------+ * | d0 | d1 ... | * +---------------------------------------+ * ... * +---------------------------------------+ * last region-> | cfr_vaddr | * +---------------------------------------+ * + cfr_num_pages | d0 | d1 | d2 | d3 | * +---------------------------------------+ * ... * +---------------------------------------+ * | dx | dy | dz | na | na | na | na | na | * +---------------------------------------+ * * where: * cf_size: total size of the buffer (rounded to page size) * cf_last_region: offset in the buffer of the last "region" sub-header * cf_last_zeroes: number of trailing "zero" dispositions at the end * of last region * cfr_vaddr: virtual address of the start of the covered "region" * cfr_num_pages: number of pages in the covered "region" * d*: disposition of the page at that virtual address * Regions in the buffer are word-aligned. * * We estimate the size of the buffer based on the number of memory regions * and the virtual size of the address space. While copying each memory region * during vm_map_fork(), we also collect the footprint info for that region * and store it in the buffer, packing it as much as possible (coalescing * contiguous memory regions to avoid having too many region headers and * avoiding long streaks of "zero" page dispositions by splitting footprint * "regions", so the number of regions in the footprint buffer might not match * the number of memory regions in the address space. 
* * We also have to copy the original task's "nonvolatile" ledgers since that's * part of the footprint and will need to be reported to any tool asking for * the footprint information of the forked corpse. */ uint64_t vm_map_corpse_footprint_count = 0; uint64_t vm_map_corpse_footprint_size_avg = 0; uint64_t vm_map_corpse_footprint_size_max = 0; uint64_t vm_map_corpse_footprint_full = 0; uint64_t vm_map_corpse_footprint_no_buf = 0; struct vm_map_corpse_footprint_header { vm_size_t cf_size; /* allocated buffer size */ uint32_t cf_last_region; /* offset of last region in buffer */ union { uint32_t cfu_last_zeroes; /* during creation: * number of "zero" dispositions at * end of last region */ uint32_t cfu_hint_region; /* during lookup: * offset of last looked up region */ #define cf_last_zeroes cfu.cfu_last_zeroes #define cf_hint_region cfu.cfu_hint_region } cfu; }; typedef uint8_t cf_disp_t; struct vm_map_corpse_footprint_region { vm_map_offset_t cfr_vaddr; /* region start virtual address */ uint32_t cfr_num_pages; /* number of pages in this "region" */ cf_disp_t cfr_disposition[0]; /* disposition of each page */ } __attribute__((packed)); static cf_disp_t vm_page_disposition_to_cf_disp( int disposition) { assert(sizeof(cf_disp_t) == 1); /* relocate bits that don't fit in a "uint8_t" */ if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) { disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS; } /* cast gets rid of extra bits */ return (cf_disp_t) disposition; } static int vm_page_cf_disp_to_disposition( cf_disp_t cf_disp) { int disposition; assert(sizeof(cf_disp_t) == 1); disposition = (int) cf_disp; /* move relocated bits back in place */ if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) { disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS; } return disposition; } /* * vm_map_corpse_footprint_new_region: * closes the current footprint "region" and creates a new one * * Returns NULL if there's not enough space in the buffer for a new region. 
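 * The next region is placed immediately after the previous region's
 * disposition array, rounded up to an int boundary: with the last region
 * at offset cf_last_region and holding cfr_num_pages dispositions, the
 * new region would start at
 *	roundup(cf_last_region + sizeof(struct vm_map_corpse_footprint_region)
 *	    + cfr_num_pages * sizeof(cf_disp_t), sizeof(int))
 * provided that still leaves room for a region header before the end of
 * the buffer.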
*/ static struct vm_map_corpse_footprint_region * vm_map_corpse_footprint_new_region( struct vm_map_corpse_footprint_header *footprint_header) { uintptr_t footprint_edge; uint32_t new_region_offset; struct vm_map_corpse_footprint_region *footprint_region; struct vm_map_corpse_footprint_region *new_footprint_region; footprint_edge = ((uintptr_t)footprint_header + footprint_header->cf_size); footprint_region = ((struct vm_map_corpse_footprint_region *) ((char *)footprint_header + footprint_header->cf_last_region)); assert((uintptr_t)footprint_region + sizeof(*footprint_region) <= footprint_edge); /* get rid of trailing zeroes in the last region */ assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes); footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes; footprint_header->cf_last_zeroes = 0; /* reuse this region if it's now empty */ if (footprint_region->cfr_num_pages == 0) { return footprint_region; } /* compute offset of new region */ new_region_offset = footprint_header->cf_last_region; new_region_offset += sizeof(*footprint_region); new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t)); new_region_offset = roundup(new_region_offset, sizeof(int)); /* check if we're going over the edge */ if (((uintptr_t)footprint_header + new_region_offset + sizeof(*footprint_region)) >= footprint_edge) { /* over the edge: no new region */ return NULL; } /* adjust offset of last region in header */ footprint_header->cf_last_region = new_region_offset; new_footprint_region = (struct vm_map_corpse_footprint_region *) ((char *)footprint_header + footprint_header->cf_last_region); new_footprint_region->cfr_vaddr = 0; new_footprint_region->cfr_num_pages = 0; /* caller needs to initialize new region */ return new_footprint_region; } /* * vm_map_corpse_footprint_collect: * collect footprint information for "old_entry" in "old_map" and * stores it in "new_map"'s vmmap_footprint_info. */ kern_return_t vm_map_corpse_footprint_collect( vm_map_t old_map, vm_map_entry_t old_entry, vm_map_t new_map) { vm_map_offset_t va; kern_return_t kr; struct vm_map_corpse_footprint_header *footprint_header; struct vm_map_corpse_footprint_region *footprint_region; struct vm_map_corpse_footprint_region *new_footprint_region; cf_disp_t *next_disp_p; uintptr_t footprint_edge; uint32_t num_pages_tmp; int effective_page_size; effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map)); va = old_entry->vme_start; vm_map_lock_assert_exclusive(old_map); vm_map_lock_assert_exclusive(new_map); assert(new_map->has_corpse_footprint); assert(!old_map->has_corpse_footprint); if (!new_map->has_corpse_footprint || old_map->has_corpse_footprint) { /* * This can only transfer footprint info from a * map with a live pmap to a map with a corpse footprint. 
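 * (The asserts above flag a misuse on development kernels; the runtime
 * check below makes release kernels fail cleanly with
 * KERN_NOT_SUPPORTED instead.)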
*/ return KERN_NOT_SUPPORTED; } if (new_map->vmmap_corpse_footprint == NULL) { vm_offset_t buf; vm_size_t buf_size; buf = 0; buf_size = (sizeof(*footprint_header) + (old_map->hdr.nentries * (sizeof(*footprint_region) + +3)) /* potential alignment for each region */ + ((old_map->size / effective_page_size) * sizeof(cf_disp_t))); /* disposition for each page */ // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size); buf_size = round_page(buf_size); /* limit buffer to 1 page to validate overflow detection */ // buf_size = PAGE_SIZE; /* limit size to a somewhat sane amount */ #if XNU_TARGET_OS_OSX #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */ #else /* XNU_TARGET_OS_OSX */ #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */ #endif /* XNU_TARGET_OS_OSX */ if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) { buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE; } /* * Allocate the pageable buffer (with a trailing guard page). * It will be zero-filled on demand. */ kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE, KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST, VM_KERN_MEMORY_DIAG); if (kr != KERN_SUCCESS) { vm_map_corpse_footprint_no_buf++; return kr; } /* initialize header and 1st region */ footprint_header = (struct vm_map_corpse_footprint_header *)buf; new_map->vmmap_corpse_footprint = footprint_header; footprint_header->cf_size = buf_size; footprint_header->cf_last_region = sizeof(*footprint_header); footprint_header->cf_last_zeroes = 0; footprint_region = (struct vm_map_corpse_footprint_region *) ((char *)footprint_header + footprint_header->cf_last_region); footprint_region->cfr_vaddr = 0; footprint_region->cfr_num_pages = 0; } else { /* retrieve header and last region */ footprint_header = (struct vm_map_corpse_footprint_header *) new_map->vmmap_corpse_footprint; footprint_region = (struct vm_map_corpse_footprint_region *) ((char *)footprint_header + footprint_header->cf_last_region); } footprint_edge = ((uintptr_t)footprint_header + footprint_header->cf_size); if ((footprint_region->cfr_vaddr + (((vm_map_offset_t)footprint_region->cfr_num_pages) * effective_page_size)) != old_entry->vme_start) { uint64_t num_pages_delta, num_pages_delta_size; uint32_t region_offset_delta_size; /* * Not the next contiguous virtual address: * start a new region or store "zero" dispositions for * the missing pages? 
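 * Whichever encoding is smaller wins: a gap of N pages costs
 * N * sizeof(cf_disp_t) bytes when stored as explicit "zero"
 * dispositions, versus sizeof(struct vm_map_corpse_footprint_region)
 * plus alignment padding when stored as a fresh region header, so short
 * gaps are cheaper to zero-fill and long gaps are cheaper to skip with a
 * new region (ties go to zero-filling, to keep the number of regions,
 * and thus lookup hops, down).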
*/ /* size of gap in actual page dispositions */ num_pages_delta = ((old_entry->vme_start - footprint_region->cfr_vaddr) / effective_page_size) - footprint_region->cfr_num_pages; num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t); /* size of gap as a new footprint region header */ region_offset_delta_size = (sizeof(*footprint_region) + roundup(((footprint_region->cfr_num_pages - footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)), sizeof(int)) - ((footprint_region->cfr_num_pages - footprint_header->cf_last_zeroes) * sizeof(cf_disp_t))); // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta_size); if (region_offset_delta_size < num_pages_delta_size || os_add3_overflow(footprint_region->cfr_num_pages, (uint32_t) num_pages_delta, 1, &num_pages_tmp)) { /* * Storing data for this gap would take more space * than inserting a new footprint region header: * let's start a new region and save space. If it's a * tie, let's avoid using a new region, since that * would require more region hops to find the right * range during lookups. * * If the current region's cfr_num_pages would overflow * if we added "zero" page dispositions for the gap, * no choice but to start a new region. */ // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__); new_footprint_region = vm_map_corpse_footprint_new_region(footprint_header); /* check that we're not going over the edge */ if (new_footprint_region == NULL) { goto over_the_edge; } footprint_region = new_footprint_region; /* initialize new region as empty */ footprint_region->cfr_vaddr = old_entry->vme_start; footprint_region->cfr_num_pages = 0; } else { /* * Store "zero" page dispositions for the missing * pages. */ // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__); for (; num_pages_delta > 0; num_pages_delta--) { next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region + sizeof(*footprint_region)); next_disp_p += footprint_region->cfr_num_pages; /* check that we're not going over the edge */ if ((uintptr_t)next_disp_p >= footprint_edge) { goto over_the_edge; } /* store "zero" disposition for this gap page */ footprint_region->cfr_num_pages++; *next_disp_p = (cf_disp_t) 0; footprint_header->cf_last_zeroes++; } } } for (va = old_entry->vme_start; va < old_entry->vme_end; va += effective_page_size) { int disposition; cf_disp_t cf_disp; vm_map_footprint_query_page_info(old_map, old_entry, va, &disposition); cf_disp = vm_page_disposition_to_cf_disp(disposition); // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disposition); if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) { /* * Ignore "zero" dispositions at start of * region: just move start of region. */ footprint_region->cfr_vaddr += effective_page_size; continue; } /* would region's cfr_num_pages overflow?
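 *
 * cfr_num_pages is a fixed-width counter, so a large enough entry
 * could wrap it; os_add_overflow() below detects the wrap safely,
 * in which case we simply roll over to a fresh region starting
 * at "va".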
*/ if (os_add_overflow(footprint_region->cfr_num_pages, 1, &num_pages_tmp)) { /* overflow: create a new region */ new_footprint_region = vm_map_corpse_footprint_new_region( footprint_header); if (new_footprint_region == NULL) { goto over_the_edge; } footprint_region = new_footprint_region; footprint_region->cfr_vaddr = va; footprint_region->cfr_num_pages = 0; } next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region + sizeof(*footprint_region)); next_disp_p += footprint_region->cfr_num_pages; /* check that we're not going over the edge */ if ((uintptr_t)next_disp_p >= footprint_edge) { goto over_the_edge; } /* store this disposition */ *next_disp_p = cf_disp; footprint_region->cfr_num_pages++; if (cf_disp != 0) { /* non-zero disp: break the current zero streak */ footprint_header->cf_last_zeroes = 0; /* done */ continue; } /* zero disp: add to the current streak of zeroes */ footprint_header->cf_last_zeroes++; if ((footprint_header->cf_last_zeroes + roundup(((footprint_region->cfr_num_pages - footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) & (sizeof(int) - 1), sizeof(int))) < (sizeof(*footprint_header))) { /* * There are not enough trailing "zero" dispositions * (+ the extra padding we would need for the previous * region); creating a new region would not save space * at this point, so let's keep this "zero" disposition * in this region and reconsider later. */ continue; } /* * Create a new region to avoid having too many consecutive * "zero" dispositions. */ new_footprint_region = vm_map_corpse_footprint_new_region(footprint_header); if (new_footprint_region == NULL) { goto over_the_edge; } footprint_region = new_footprint_region; /* initialize the new region as empty ... */ footprint_region->cfr_num_pages = 0; /* ... and skip this "zero" disp */ footprint_region->cfr_vaddr = va + effective_page_size; } return KERN_SUCCESS; over_the_edge: // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va); vm_map_corpse_footprint_full++; return KERN_RESOURCE_SHORTAGE; } /* * vm_map_corpse_footprint_collect_done: * completes the footprint collection by getting rid of any remaining * trailing "zero" dispositions and trimming the unused part of the * kernel buffer */ void vm_map_corpse_footprint_collect_done( vm_map_t new_map) { struct vm_map_corpse_footprint_header *footprint_header; struct vm_map_corpse_footprint_region *footprint_region; vm_size_t buf_size, actual_size; kern_return_t kr; assert(new_map->has_corpse_footprint); if (!new_map->has_corpse_footprint || new_map->vmmap_corpse_footprint == NULL) { return; } footprint_header = (struct vm_map_corpse_footprint_header *) new_map->vmmap_corpse_footprint; buf_size = footprint_header->cf_size; footprint_region = (struct vm_map_corpse_footprint_region *) ((char *)footprint_header + footprint_header->cf_last_region); /* get rid of trailing zeroes in last region */ assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes); footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes; footprint_header->cf_last_zeroes = 0; actual_size = (vm_size_t)(footprint_header->cf_last_region + sizeof(*footprint_region) + (footprint_region->cfr_num_pages * sizeof(cf_disp_t))); // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size); vm_map_corpse_footprint_size_avg = (((vm_map_corpse_footprint_size_avg * vm_map_corpse_footprint_count) + actual_size) / (vm_map_corpse_footprint_count + 1)); vm_map_corpse_footprint_count++; if (actual_size >
vm_map_corpse_footprint_size_max) { vm_map_corpse_footprint_size_max = actual_size; } actual_size = round_page(actual_size); if (buf_size > actual_size) { kr = vm_deallocate(kernel_map, vm_sanitize_wrap_addr((vm_address_t)footprint_header + actual_size + PAGE_SIZE), /* trailing guard page */ vm_sanitize_wrap_size(buf_size - actual_size)); assertf(kr == KERN_SUCCESS, "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n", footprint_header, (uint64_t) buf_size, (uint64_t) actual_size, kr); kr = vm_protect(kernel_map, vm_sanitize_wrap_addr((vm_address_t)footprint_header + actual_size), vm_sanitize_wrap_size(PAGE_SIZE), FALSE, /* set_maximum */ vm_sanitize_wrap_prot(VM_PROT_NONE)); assertf(kr == KERN_SUCCESS, "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n", footprint_header, (uint64_t) buf_size, (uint64_t) actual_size, kr); } footprint_header->cf_size = actual_size; } /* * vm_map_corpse_footprint_query_page_info: * retrieves the disposition of the page at virtual address "va" * in the forked corpse's VM map * * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse. */ kern_return_t vm_map_corpse_footprint_query_page_info( vm_map_t map, vm_map_offset_t va, int *disposition_p) { struct vm_map_corpse_footprint_header *footprint_header; struct vm_map_corpse_footprint_region *footprint_region; uint32_t footprint_region_offset; vm_map_offset_t region_start, region_end; int disp_idx; kern_return_t kr; int effective_page_size; cf_disp_t cf_disp; if (!map->has_corpse_footprint) { *disposition_p = 0; kr = KERN_INVALID_ARGUMENT; goto done; } footprint_header = map->vmmap_corpse_footprint; if (footprint_header == NULL) { *disposition_p = 0; // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p); kr = KERN_INVALID_ARGUMENT; goto done; } /* start looking at the hint ("cf_hint_region") */ footprint_region_offset = footprint_header->cf_hint_region; effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map)); lookup_again: if (footprint_region_offset < sizeof(*footprint_header)) { /* hint too low: start from 1st region */ footprint_region_offset = sizeof(*footprint_header); } if (footprint_region_offset > footprint_header->cf_last_region) { /* hint too high: re-start from 1st region */ footprint_region_offset = sizeof(*footprint_header); } footprint_region = (struct vm_map_corpse_footprint_region *) ((char *)footprint_header + footprint_region_offset); region_start = footprint_region->cfr_vaddr; region_end = (region_start + ((vm_map_offset_t)(footprint_region->cfr_num_pages) * effective_page_size)); if (va < region_start && footprint_region_offset != sizeof(*footprint_header)) { /* our range starts before the hint region */ /* reset the hint (in a racy way...)
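 * The race is benign: cf_hint_region is only a search starting
 * point and is never trusted for correctness (it is range-checked
 * at "lookup_again" above). A stale hint at worst restarts the walk
 * from the first region; e.g. a tool querying ascending addresses
 * still finds the right region, just a little more slowly.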
*/ footprint_header->cf_hint_region = sizeof(*footprint_header); /* lookup "va" again from 1st region */ footprint_region_offset = sizeof(*footprint_header); goto lookup_again; } while (va >= region_end) { if (footprint_region_offset >= footprint_header->cf_last_region) { break; } /* skip the region's header */ footprint_region_offset += sizeof(*footprint_region); /* skip the region's page dispositions */ footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t)); /* align to next word boundary */ footprint_region_offset = roundup(footprint_region_offset, sizeof(int)); footprint_region = (struct vm_map_corpse_footprint_region *) ((char *)footprint_header + footprint_region_offset); region_start = footprint_region->cfr_vaddr; region_end = (region_start + ((vm_map_offset_t)(footprint_region->cfr_num_pages) * effective_page_size)); } if (va < region_start || va >= region_end) { /* page not found */ *disposition_p = 0; // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p); kr = KERN_SUCCESS; goto done; } /* "va" found: set the lookup hint for next lookup (in a racy way...) */ footprint_header->cf_hint_region = footprint_region_offset; /* get page disposition for "va" in this region */ disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size); cf_disp = footprint_region->cfr_disposition[disp_idx]; *disposition_p = vm_page_cf_disp_to_disposition(cf_disp); kr = KERN_SUCCESS; done: // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p); /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */ DTRACE_VM4(footprint_query_page_info, vm_map_t, map, vm_map_offset_t, va, int, *disposition_p, kern_return_t, kr); return kr; } void vm_map_corpse_footprint_destroy( vm_map_t map) { if (map->has_corpse_footprint && map->vmmap_corpse_footprint != 0) { struct vm_map_corpse_footprint_header *footprint_header; vm_size_t buf_size; kern_return_t kr; footprint_header = map->vmmap_corpse_footprint; buf_size = footprint_header->cf_size; kr = vm_deallocate(kernel_map, vm_sanitize_wrap_addr((vm_offset_t) map->vmmap_corpse_footprint), vm_sanitize_wrap_size(buf_size + PAGE_SIZE)); /* trailing guard page */ assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr); map->vmmap_corpse_footprint = 0; map->has_corpse_footprint = FALSE; } } /* * vm_map_copy_footprint_ledgers: * copies any ledger that's relevant to the memory footprint of "old_task" * into the forked corpse's task ("new_task") */ void vm_map_copy_footprint_ledgers( task_t old_task, task_t new_task) { vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint); vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile); vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.internal); vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped); vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting); vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table); vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint); vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed); 
vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile); vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint); vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint); vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint); vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed); vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem); vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total); } /* * vm_map_copy_ledger: * copy a single ledger from "old_task" to "new_task" */ void vm_map_copy_ledger( task_t old_task, task_t new_task, int ledger_entry) { ledger_amount_t old_balance, new_balance, delta; assert(new_task->map->has_corpse_footprint); if (!new_task->map->has_corpse_footprint) { return; } /* turn off sanity checks for the ledger we're about to mess with */ ledger_disable_panic_on_negative(new_task->ledger, ledger_entry); /* adjust "new_task" to match "old_task" */ ledger_get_balance(old_task->ledger, ledger_entry, &old_balance); ledger_get_balance(new_task->ledger, ledger_entry, &new_balance); if (new_balance == old_balance) { /* new == old: done */ } else if (new_balance > old_balance) { /* new > old ==> new -= new - old */ delta = new_balance - old_balance; ledger_debit(new_task->ledger, ledger_entry, delta); } else { /* new < old ==> new += old - new */ delta = old_balance - new_balance; ledger_credit(new_task->ledger, ledger_entry, delta); } } /* * vm_map_get_pmap: * returns the pmap associated with the vm_map */ pmap_t vm_map_get_pmap(vm_map_t map) { return vm_map_pmap(map); } ppnum_t vm_map_get_phys_page( vm_map_t map, vm_offset_t addr) { vm_object_offset_t offset; vm_object_t object; vm_map_offset_t map_offset; vm_map_entry_t entry; ppnum_t phys_page = 0; map_offset = vm_map_trunc_page(addr, PAGE_MASK); vm_map_lock(map); while (vm_map_lookup_entry(map, map_offset, &entry)) { if (entry->is_sub_map) { vm_map_t old_map; vm_map_lock(VME_SUBMAP(entry)); old_map = map; map = VME_SUBMAP(entry); map_offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start)); vm_map_unlock(old_map); continue; } if (VME_OBJECT(entry) == VM_OBJECT_NULL) { vm_map_unlock(map); return (ppnum_t) 0; } if (VME_OBJECT(entry)->phys_contiguous) { /* These are not standard pageable memory mappings */ /* If they are not present in the object they will */ /* have to be picked up from the pager through the */ /* fault mechanism. 
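 * For a phys_contiguous object, vo_shadow_offset holds the base
 * physical address, so the physical page number can be computed
 * directly, as done below. Illustrative arithmetic (hypothetical
 * values, assuming PAGE_SHIFT == 12):
 *     vo_shadow_offset = 0x80000000
 *     offset within the object = 0x3000
 *     phys_page = (0x80000000 + 0x3000) >> 12 = 0x80003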
*/ if (VME_OBJECT(entry)->vo_shadow_offset == 0) { /* need to call vm_fault */ vm_map_unlock(map); vm_fault(map, map_offset, VM_PROT_NONE, FALSE /* change_wiring */, VM_KERN_MEMORY_NONE, THREAD_UNINT, NULL, 0); vm_map_lock(map); continue; } offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start)); phys_page = (ppnum_t) ((VME_OBJECT(entry)->vo_shadow_offset + offset) >> PAGE_SHIFT); break; } offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start)); object = VME_OBJECT(entry); vm_object_lock(object); while (TRUE) { vm_page_t dst_page = vm_page_lookup(object, offset); if (dst_page == VM_PAGE_NULL) { if (object->shadow) { vm_object_t old_object; vm_object_lock(object->shadow); old_object = object; offset = offset + object->vo_shadow_offset; object = object->shadow; vm_object_unlock(old_object); } else { vm_object_unlock(object); break; } } else { phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page)); vm_object_unlock(object); break; } } break; } vm_map_unlock(map); return phys_page; } #if CONFIG_MAP_RANGES static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)]; static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)]; static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT); static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA); /* * vm_map_range_map_init: * initializes the VM range ID map to enable index lookup * of user VM ranges based on VM tag from userspace. */ static void vm_map_range_map_init(void) { /* * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose: * - the former is malloc metadata which should be kept separate * - the latter has its own ranges */ bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA); bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT); bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR); bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE); } static struct mach_vm_range vm_map_range_random_uniform( vm_map_size_t req_size, vm_map_offset_t min_addr, vm_map_offset_t max_addr, vm_map_offset_t offmask) { vm_map_offset_t random_addr; struct mach_vm_range alloc; req_size = (req_size + offmask) & ~offmask; min_addr = (min_addr + offmask) & ~offmask; max_addr = max_addr & ~offmask; read_random(&random_addr, sizeof(random_addr)); random_addr %= (max_addr - req_size - min_addr); random_addr &= ~offmask; alloc.min_address = min_addr + random_addr; alloc.max_address = min_addr + random_addr + req_size; return alloc; } static vm_map_offset_t vm_map_range_offmask(void) { uint32_t pte_depth; /* * PTE optimizations * * * 16k pages systems * ~~~~~~~~~~~~~~~~~ * * A single L1 (sub-)page covers the address 
space. * - L2 pages cover 64G, * - L3 pages cover 32M. * * On embedded, the dynamic VA range is 64G and uses a single L2 page. * As a result, we really only need to align the ranges to 32M to avoid * partial L3 pages. * * On macOS, the usage of L2 pages will increase, so we will * want to align ranges to 64G in order to utilize them fully. * * * 4k pages systems * ~~~~~~~~~~~~~~~~ * * A single L0 (sub-)page covers the address space. * - L1 pages cover 512G, * - L2 pages cover 1G, * - L3 pages cover 2M. * * The long tail of processes on a system will tend to have a VA usage * (ignoring the shared regions) in the 100s of MB order of magnitude. * This is achievable with a single L1 and a few L2s without * randomization. * * However once randomization is introduced, the system will immediately * need several L1s and many more L2s. As a result: * * - on embedded devices, the cost of these extra pages isn't * sustainable, and we just disable the feature entirely, * * - on macOS we align ranges to a 512G boundary so that the extra L1 * pages can be used to their full potential. */ /* * note: this function assumes _non-exotic mappings_, * which is why it uses the native kernel's PAGE_SHIFT. */ #if XNU_PLATFORM_MacOSX pte_depth = PAGE_SHIFT > 12 ? 2 : 3; #else /* !XNU_PLATFORM_MacOSX */ pte_depth = PAGE_SHIFT > 12 ? 1 : 0; #endif /* !XNU_PLATFORM_MacOSX */ if (pte_depth == 0) { return 0; } return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1; } /* * vm_map_range_configure: * configures the user vm_map ranges by increasing the maximum VA range of * the map and carving out a range at the end of VA space (searching backwards * in the newly expanded map). */ kern_return_t vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va) { const vm_map_offset_t offmask = vm_map_range_offmask(); struct mach_vm_range data_range; vm_map_offset_t default_end; kern_return_t kr; if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) { /* * No point doing vm ranges in a 32-bit address space. */ return KERN_NOT_SUPPORTED; } /* Should not be applying ranges to kernel map or kernel map submaps */ assert(vm_map_pmap(map) != kernel_pmap); #if XNU_PLATFORM_MacOSX /* * on macOS, the address space is a massive 47 bits (128T), * with several carve outs that processes can't use: * - the shared region * - the commpage region * - the GPU carve out (if applicable) * * and when nano-malloc is in use it desires memory at the 96T mark. * * However, their location is architecture dependent: * - On Intel, the shared region and commpage are * at the very end of the usable address space (above +127T), * and there is no GPU carve out, and pthread wants to place * threads at the 112T mark (0x70T). * * - On arm64, these are in the same spot as on embedded devices: * o shared region: [ 6G, 10G) [ will likely grow over time ] * o commpage region: [63G, 64G) * o GPU carve out: [64G, 448G) * * This is convenient because the mappings at the end of the address * space (when they exist) are made by the kernel. * * The policy is to allocate a random 1T for the data heap * at the end of the address space in the: * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks) * - [0x61, 0x7f) range on Apple Silicon (to leave space for Nano malloc).
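 *
 * Worked example (hypothetical random draw; on 16k-page Apple
 * Silicon vm_map_range_offmask() covers 64G - 1):
 *     vm_map_range_random_uniform(1T, 0x61ull << 40, 0x7full << 40, offmask)
 * aligns min/max to 64G (97T and 127T already are), draws a random
 * offset in [0, 127T - 1T - 97T), truncates it to a 64G boundary,
 * and returns a 1T range [97T + r, 98T + r) that ends below 127T.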
*/ /* see NANOZONE_SIGNATURE in libmalloc */ #if __x86_64__ default_end = 0x71ull << 40; #else default_end = 0x61ull << 40; #endif data_range = vm_map_range_random_uniform(1ull << 40, default_end, 0x7full << 40, offmask); #else /* !XNU_PLATFORM_MacOSX */ /* * Embedded devices: * * The default VA Size scales with the device physical memory. * * Out of that: * - the "zero" page typically uses 4G + some slide * - the shared region uses SHARED_REGION_SIZE bytes (4G) * * Without the use of jumbo or any adjustment to the address space, * a default VM map typically looks like this: * * 0G -->╒════════════╕ * │ pagezero │ * │ + slide │ * ~4G -->╞════════════╡<-- vm_map_min(map) * │ │ * 6G -->├────────────┤ * │ shared │ * │ region │ * 10G -->├────────────┤ * │ │ * max_va -->├────────────┤<-- vm_map_max(map) * │ │ * ╎ jumbo ╎ * ╎ ╎ * │ │ * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS * │ commpage │ * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS * │ │ * ╎ GPU ╎ * ╎ carveout ╎ * │ │ * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS * │ │ * ╎ ╎ * ╎ ╎ * │ │ * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT) * * When this drawing was made, "max_va" was smaller than * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of * 12G of address space for the zero-page, slide, files, * binaries, heap ... * * We will want to make a "heap/data" carve out inside * the jumbo range of half of that usable space, assuming * that this is less than a fourth of the jumbo range. * * The assert below is intended to catch when max_va grows * too large for this heuristic. */ vm_map_lock_read(map); default_end = vm_map_max(map); vm_map_unlock_read(map); /* * Check that we're not already jumbo'd, * or our address space was somehow modified. * * If so we cannot guarantee that we can set up the ranges * safely without interfering with the existing map. */ if (default_end > vm_compute_max_offset(true)) { return KERN_NO_SPACE; } if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) { /* * an override boot-arg was set, disable user-ranges * * XXX: this is problematic because it means these boot-args * no longer test the behavior changing the value * of ARM64_MAX_OFFSET_DEVICE_* would have. */ return KERN_NOT_SUPPORTED; } /* expand the default VM space to 64GB */ vm_map_set_jumbo(map); assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end); data_range = vm_map_range_random_uniform(GiB(10), default_end + PAGE_SIZE, vm_map_max(map), offmask); #endif /* !XNU_PLATFORM_MacOSX */ /* * Poke holes so that ASAN or people listing regions * do not think this space is free. */ if (default_end != data_range.min_address) { kr = vm_map_enter(map, &default_end, data_range.min_address - default_end, 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL, 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); } if (data_range.max_address != vm_map_max(map)) { vm_map_entry_t entry; vm_size_t size; /* * Extend the end of the hole to the next VM entry or the end of the map, * whichever comes first.
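 * e.g. if nothing is mapped above data_range.max_address, the
 * PROT_NONE hole below runs all the way to vm_map_max(map);
 * otherwise it stops at the vme_start of the first entry above it.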
*/ vm_map_lock_read(map); vm_map_lookup_entry_or_next(map, data_range.max_address, &entry); if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) { size = vm_map_max(map) - data_range.max_address; } else { size = entry->vme_start - data_range.max_address; } vm_map_unlock_read(map); kr = vm_map_enter(map, &data_range.max_address, size, 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL, 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); } #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT if (needs_extra_jumbo_va) { /* This will grow the address space to MACH_VM_MAX_ADDRESS */ vm_map_set_extra_jumbo(map); } #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */ vm_map_lock(map); map->default_range.min_address = vm_map_min(map); map->default_range.max_address = default_end; map->data_range = data_range; #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT /* If process has "extra jumbo" entitlement, enable large file range */ if (needs_extra_jumbo_va) { map->large_file_range = vm_map_range_random_uniform(TiB(1), MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask); } #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */ map->uses_user_ranges = true; vm_map_unlock(map); return KERN_SUCCESS; } /* * vm_map_range_fork: * clones the array of ranges from old_map to new_map in support * of a VM map fork. */ void vm_map_range_fork(vm_map_t new_map, vm_map_t old_map) { if (!old_map->uses_user_ranges) { /* nothing to do */ return; } new_map->default_range = old_map->default_range; new_map->data_range = old_map->data_range; if (old_map->extra_ranges_count) { vm_map_user_range_t otable, ntable; uint16_t count; otable = old_map->extra_ranges; count = old_map->extra_ranges_count; ntable = kalloc_data(count * sizeof(struct vm_map_user_range), Z_WAITOK | Z_ZERO | Z_NOFAIL); memcpy(ntable, otable, count * sizeof(struct vm_map_user_range)); new_map->extra_ranges_count = count; new_map->extra_ranges = ntable; } new_map->uses_user_ranges = true; } /* * vm_map_get_user_range: * copy the VM user range for the given VM map and range ID. */ kern_return_t vm_map_get_user_range( vm_map_t map, vm_map_range_id_t range_id, mach_vm_range_t range) { if (map == NULL || !map->uses_user_ranges || range == NULL) { return KERN_INVALID_ARGUMENT; } switch (range_id) { case UMEM_RANGE_ID_DEFAULT: *range = map->default_range; return KERN_SUCCESS; case UMEM_RANGE_ID_HEAP: *range = map->data_range; return KERN_SUCCESS; case UMEM_RANGE_ID_LARGE_FILE: /* * Because this function tells a user-space process about the user * ranges in its VM map, this case communicates whether the large file * range is in use. Note that this is different from how the large file * range ID is handled in `vm_map_get_range()`: there, we "resolve" the * VA policy and return either the large file range or data range, * depending on whether the large file range is enabled. 
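 * An unconfigured range is encoded as a zero-length one
 * (min_address == max_address), which is exactly what the check
 * below tests: in that case we return KERN_INVALID_ARGUMENT rather
 * than silently substituting another range.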
*/ if (map->large_file_range.min_address != map->large_file_range.max_address) { /* large file range is configured and should be used */ *range = map->large_file_range; } else { return KERN_INVALID_ARGUMENT; } return KERN_SUCCESS; default: return KERN_INVALID_ARGUMENT; } } static vm_map_range_id_t vm_map_user_range_resolve( vm_map_t map, mach_vm_address_t addr, mach_vm_size_t size, mach_vm_range_t range) { struct mach_vm_range tmp; vm_map_lock_assert_held(map); static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT); static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA); if (mach_vm_range_contains(&map->default_range, addr, size)) { if (range) { *range = map->default_range; } return UMEM_RANGE_ID_DEFAULT; } if (mach_vm_range_contains(&map->data_range, addr, size)) { if (range) { *range = map->data_range; } return UMEM_RANGE_ID_HEAP; } if (mach_vm_range_contains(&map->large_file_range, addr, size)) { if (range) { *range = map->large_file_range; } return UMEM_RANGE_ID_LARGE_FILE; } for (size_t i = 0; i < map->extra_ranges_count; i++) { vm_map_user_range_t r = &map->extra_ranges[i]; tmp.min_address = r->vmur_min_address; tmp.max_address = r->vmur_max_address; if (mach_vm_range_contains(&tmp, addr, size)) { if (range) { *range = tmp; } return r->vmur_range_id; } } if (range) { range->min_address = range->max_address = 0; } return UMEM_RANGE_ID_DEFAULT; } #endif /* CONFIG_MAP_RANGES */ void vm_map_kernel_flags_update_range_id( vm_map_kernel_flags_t *vmkf, vm_map_t map, __unused vm_map_size_t size) { if (map == kernel_map) { if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) { vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA; } #if CONFIG_MAP_RANGES } else if (vmkf->vm_tag < VM_MEMORY_COUNT && vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) { if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag) || size >= VM_LARGE_FILE_THRESHOLD) { /* * if the map doesn't have the large file range configured, * the range will get resolved to the heap range in `vm_map_get_range` */ vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE; } else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) { vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP; } #endif /* CONFIG_MAP_RANGES */ } } /* * vm_map_entry_has_device_pager: * Check if the vm map entry specified by the virtual address has a device pager. * If the vm map entry does not exist or if the map is NULL, this returns FALSE. 
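 * Submaps are traversed with hand-over-hand locking: each submap is
 * locked before its parent map is unlocked, so the entry being
 * examined cannot be torn down mid-walk.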
*/ boolean_t vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr) { vm_map_entry_t entry; vm_object_t object; boolean_t result; if (map == NULL) { return FALSE; } vm_map_lock(map); while (TRUE) { if (!vm_map_lookup_entry(map, vaddr, &entry)) { result = FALSE; break; } if (entry->is_sub_map) { // Check the submap vm_map_t submap = VME_SUBMAP(entry); assert(submap != NULL); vm_map_lock(submap); vm_map_unlock(map); map = submap; continue; } object = VME_OBJECT(entry); if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) { result = TRUE; break; } result = FALSE; break; } vm_map_unlock(map); return result; } #if MACH_ASSERT extern int pmap_ledgers_panic; extern int pmap_ledgers_panic_leeway; #define LEDGER_DRIFT(__LEDGER) \ int __LEDGER##_over; \ ledger_amount_t __LEDGER##_over_total; \ ledger_amount_t __LEDGER##_over_max; \ int __LEDGER##_under; \ ledger_amount_t __LEDGER##_under_total; \ ledger_amount_t __LEDGER##_under_max struct { uint64_t num_pmaps_checked; LEDGER_DRIFT(phys_footprint); LEDGER_DRIFT(internal); LEDGER_DRIFT(internal_compressed); LEDGER_DRIFT(external); LEDGER_DRIFT(reusable); LEDGER_DRIFT(iokit_mapped); LEDGER_DRIFT(alternate_accounting); LEDGER_DRIFT(alternate_accounting_compressed); LEDGER_DRIFT(page_table); LEDGER_DRIFT(purgeable_volatile); LEDGER_DRIFT(purgeable_nonvolatile); LEDGER_DRIFT(purgeable_volatile_compressed); LEDGER_DRIFT(purgeable_nonvolatile_compressed); LEDGER_DRIFT(tagged_nofootprint); LEDGER_DRIFT(tagged_footprint); LEDGER_DRIFT(tagged_nofootprint_compressed); LEDGER_DRIFT(tagged_footprint_compressed); LEDGER_DRIFT(network_volatile); LEDGER_DRIFT(network_nonvolatile); LEDGER_DRIFT(network_volatile_compressed); LEDGER_DRIFT(network_nonvolatile_compressed); LEDGER_DRIFT(media_nofootprint); LEDGER_DRIFT(media_footprint); LEDGER_DRIFT(media_nofootprint_compressed); LEDGER_DRIFT(media_footprint_compressed); LEDGER_DRIFT(graphics_nofootprint); LEDGER_DRIFT(graphics_footprint); LEDGER_DRIFT(graphics_nofootprint_compressed); LEDGER_DRIFT(graphics_footprint_compressed); LEDGER_DRIFT(neural_nofootprint); LEDGER_DRIFT(neural_footprint); LEDGER_DRIFT(neural_nofootprint_compressed); LEDGER_DRIFT(neural_footprint_compressed); LEDGER_DRIFT(neural_nofootprint_total); } pmap_ledgers_drift; void vm_map_pmap_check_ledgers( pmap_t pmap, ledger_t ledger, int pid, char *procname) { ledger_amount_t bal; boolean_t do_panic; do_panic = FALSE; pmap_ledgers_drift.num_pmaps_checked++; #define LEDGER_CHECK_BALANCE(__LEDGER) \ MACRO_BEGIN \ int panic_on_negative = TRUE; \ ledger_get_balance(ledger, \ task_ledgers.__LEDGER, \ &bal); \ ledger_get_panic_on_negative(ledger, \ task_ledgers.__LEDGER, \ &panic_on_negative); \ if (bal != 0) { \ if (panic_on_negative || \ (pmap_ledgers_panic && \ pmap_ledgers_panic_leeway > 0 && \ (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \ bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \ do_panic = TRUE; \ } \ printf("LEDGER BALANCE proc %d (%s) " \ "\"%s\" = %lld\n", \ pid, procname, #__LEDGER, bal); \ if (bal > 0) { \ pmap_ledgers_drift.__LEDGER##_over++; \ pmap_ledgers_drift.__LEDGER##_over_total += bal; \ if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \ pmap_ledgers_drift.__LEDGER##_over_max = bal; \ } \ } else if (bal < 0) { \ pmap_ledgers_drift.__LEDGER##_under++; \ pmap_ledgers_drift.__LEDGER##_under_total += bal; \ if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \ pmap_ledgers_drift.__LEDGER##_under_max = bal; \ } \ } \ } \ MACRO_END LEDGER_CHECK_BALANCE(phys_footprint); 
LEDGER_CHECK_BALANCE(internal); LEDGER_CHECK_BALANCE(internal_compressed); LEDGER_CHECK_BALANCE(external); LEDGER_CHECK_BALANCE(reusable); LEDGER_CHECK_BALANCE(iokit_mapped); LEDGER_CHECK_BALANCE(alternate_accounting); LEDGER_CHECK_BALANCE(alternate_accounting_compressed); LEDGER_CHECK_BALANCE(page_table); LEDGER_CHECK_BALANCE(purgeable_volatile); LEDGER_CHECK_BALANCE(purgeable_nonvolatile); LEDGER_CHECK_BALANCE(purgeable_volatile_compressed); LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed); LEDGER_CHECK_BALANCE(tagged_nofootprint); LEDGER_CHECK_BALANCE(tagged_footprint); LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed); LEDGER_CHECK_BALANCE(tagged_footprint_compressed); LEDGER_CHECK_BALANCE(network_volatile); LEDGER_CHECK_BALANCE(network_nonvolatile); LEDGER_CHECK_BALANCE(network_volatile_compressed); LEDGER_CHECK_BALANCE(network_nonvolatile_compressed); LEDGER_CHECK_BALANCE(media_nofootprint); LEDGER_CHECK_BALANCE(media_footprint); LEDGER_CHECK_BALANCE(media_nofootprint_compressed); LEDGER_CHECK_BALANCE(media_footprint_compressed); LEDGER_CHECK_BALANCE(graphics_nofootprint); LEDGER_CHECK_BALANCE(graphics_footprint); LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed); LEDGER_CHECK_BALANCE(graphics_footprint_compressed); LEDGER_CHECK_BALANCE(neural_nofootprint); LEDGER_CHECK_BALANCE(neural_footprint); LEDGER_CHECK_BALANCE(neural_nofootprint_compressed); LEDGER_CHECK_BALANCE(neural_footprint_compressed); LEDGER_CHECK_BALANCE(neural_nofootprint_total); if (do_panic) { if (pmap_ledgers_panic) { panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers", pmap, pid, procname); } else { printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", pmap, pid, procname); } } } void vm_map_pmap_set_process( vm_map_t map, int pid, char *procname) { pmap_set_process(vm_map_pmap(map), pid, procname); } #endif /* MACH_ASSERT */
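/*
 * Usage sketch (hypothetical, for illustration only): on MACH_ASSERT
 * kernels the pmap layer invokes this check when a pmap is torn down,
 * along the lines of:
 *
 *     vm_map_pmap_check_ledgers(pmap, task->ledger, pid, procname);
 *
 * Any ledger with a nonzero balance is logged, accumulated into the
 * pmap_ledgers_drift statistics above and, if pmap_ledgers_panic is
 * set, triggers a panic identifying the offending process.
 */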